-
Notifications
You must be signed in to change notification settings - Fork 543
Add AWS support for spot instances #6149
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -377,11 +377,11 @@ def InitializeNodePoolForCloud( | |
| nodepool_config: container_service.BaseNodePoolConfig, | ||
| ): | ||
| nodepool_config.disk_type = ( | ||
| vm_config.DEFAULT_ROOT_DISK_TYPE # pytype: disable=attribute-error | ||
| ) | ||
| vm_config.DEFAULT_ROOT_DISK_TYPE | ||
| ) # pytype: disable=attribute-error | ||
| nodepool_config.disk_size = ( | ||
| vm_config.boot_disk_size # pytype: disable=attribute-error | ||
| ) | ||
| vm_config.boot_disk_size | ||
| ) # pytype: disable=attribute-error | ||
|
|
||
| def GetResourceMetadata(self): | ||
| """Returns a dict containing metadata about the cluster. | ||
|
|
@@ -547,6 +547,7 @@ def __init__(self, spec): | |
| super().__init__(spec) | ||
| self._ChooseSecondZone() | ||
| is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES | ||
| is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES | ||
| self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu | ||
|
|
||
| def InitializeNodePoolForCloud( | ||
|
|
@@ -652,6 +653,9 @@ def __init__(self, spec): | |
| self._ChooseSecondZone() | ||
| self.stack_name = f'Karpenter-{self.name}' | ||
| self.cluster_version: str = self.cluster_version or _DEAULT_K8S_VERSION | ||
| # Add spot instance support similar to EksAutoCluster | ||
| is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES | ||
| self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It's used by 2 implementations; may as well move this variable to BaseEksCluster's init. |
||
|
|
||
| def InitializeNodePoolForCloud( | ||
| self, | ||
|
|
@@ -700,9 +704,7 @@ def _Create(self): | |
| }], | ||
| }, | ||
| 'iamIdentityMappings': [{ | ||
| 'arn': ( | ||
| f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}' | ||
| ), | ||
| 'arn': f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}', | ||
| 'username': 'system:node:{{EC2PrivateDNSName}}', | ||
| 'groups': ['system:bootstrappers', 'system:nodes'], | ||
| }], | ||
|
|
@@ -1021,6 +1023,27 @@ def _PostCreate(self): | |
| # Ensure ALB ingress support: installs AWS Load Balancer Controller. | ||
| if FLAGS.eks_install_alb_controller: | ||
| self._InstallAwsLoadBalancerController() | ||
| # Create AWSServiceRoleForEC2Spot | ||
| if self.use_spot: | ||
| stdout, stderr, retcode = vm_util.IssueCommand( | ||
| [ | ||
| 'aws', | ||
| 'iam', | ||
| 'create-service-linked-role', | ||
| '--aws-service-name', | ||
| 'spot.amazonaws.com', | ||
hubatish marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| ], | ||
| raise_on_failure=False, | ||
| ) | ||
|
|
||
| if retcode: | ||
| logging.warning( | ||
| 'Failed to create service linked role spot.amazonaws.com: %s,' | ||
| ' error: %s', | ||
| stdout, | ||
| stderr, | ||
| ) | ||
|
|
||
| # Get the AMI version for current kubernetes version. | ||
| # See e.g. https://karpenter.sh/docs/tasks/managing-amis/ for not using | ||
| # @latest. | ||
|
|
@@ -1055,6 +1078,10 @@ def _PostCreate(self): | |
| 'container/karpenter/nodepool.yaml.j2', | ||
| CLUSTER_NAME=self.name, | ||
| ALIAS_VERSION=alias_version, | ||
| USE_SPOT=self.use_spot, | ||
| GPU_TYPE=virtual_machine.GPU_TYPE.value | ||
| if virtual_machine.GPU_TYPE.value | ||
| else None, | ||
| ) | ||
|
|
||
| def _Delete(self): | ||
|
|
@@ -1117,10 +1144,26 @@ def ResizeNodePool( | |
|
|
||
| def GetNodeSelectors(self, machine_type: str | None = None) -> list[str]: | ||
| """Gets the node selectors section of a yaml for the provider.""" | ||
| machine_family = util.GetMachineFamily(machine_type) | ||
| selectors = [] | ||
|
|
||
| # Machine family selector | ||
| machine_family = util.GetMachineFamily(self.default_nodepool.machine_type) | ||
| if machine_family: | ||
| return [f'karpenter.k8s.aws/instance-family: {machine_family}'] | ||
| return [] | ||
| selectors.append(f'karpenter.k8s.aws/instance-family: {machine_family}') | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Does specifying the instance family still work when GPUs are also specified? I think our default arguments would specify both. |
||
|
|
||
| # Spot instance selector | ||
| if self.use_spot: | ||
| selectors.append('karpenter.sh/capacity-type: spot') | ||
| else: | ||
| selectors.append('karpenter.sh/capacity-type: on-demand') | ||
|
|
||
| # GPU selector for Karpenter | ||
| if virtual_machine.GPU_TYPE.value: | ||
| selectors.append( | ||
| f'karpenter.k8s.aws/instance-gpu-name: {virtual_machine.GPU_TYPE.value}' | ||
| ) | ||
|
|
||
haykking marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| return selectors | ||
|
|
||
| def GetNodePoolNames(self) -> list[str]: | ||
| """Gets node pool names for the cluster. | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Is this line a duplicate of the line above it?