Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion perfkitbenchmarker/data/container/karpenter/nodepool.yaml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@ spec:
values: ["linux"]
- key: karpenter.sh/capacity-type
operator: In
values: ["on-demand"]
values: {% if USE_SPOT %}["spot"]{% else %}["on-demand"]{% endif %}
{% if GPU_TYPE -%}
- key: karpenter.k8s.aws/instance-gpu-name
operator: In
values: ["{{ GPU_TYPE }}"]
{% endif -%}
- key: karpenter.k8s.aws/instance-category
operator: In
values: ["c", "m", "r"]
Expand Down
63 changes: 53 additions & 10 deletions perfkitbenchmarker/providers/aws/elastic_kubernetes_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,11 +377,11 @@ def InitializeNodePoolForCloud(
nodepool_config: container_service.BaseNodePoolConfig,
):
nodepool_config.disk_type = (
vm_config.DEFAULT_ROOT_DISK_TYPE # pytype: disable=attribute-error
)
vm_config.DEFAULT_ROOT_DISK_TYPE
) # pytype: disable=attribute-error
nodepool_config.disk_size = (
vm_config.boot_disk_size # pytype: disable=attribute-error
)
vm_config.boot_disk_size
) # pytype: disable=attribute-error

def GetResourceMetadata(self):
"""Returns a dict containing metadata about the cluster.
Expand Down Expand Up @@ -547,6 +547,7 @@ def __init__(self, spec):
super().__init__(spec)
self._ChooseSecondZone()
is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES
is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this line a duplicate of the line above it?

self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu

def InitializeNodePoolForCloud(
Expand Down Expand Up @@ -652,6 +653,9 @@ def __init__(self, spec):
self._ChooseSecondZone()
self.stack_name = f'Karpenter-{self.name}'
self.cluster_version: str = self.cluster_version or _DEAULT_K8S_VERSION
# Add spot instance support similar to EksAutoCluster
is_rare_gpu = virtual_machine.GPU_TYPE.value in _RARE_GPU_TYPES
self.use_spot: bool = aws_flags.USE_AWS_SPOT_INSTANCES.value or is_rare_gpu
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is used by two implementations, so we may as well move this variable into BaseEksCluster's `__init__`.


def InitializeNodePoolForCloud(
self,
Expand Down Expand Up @@ -700,9 +704,7 @@ def _Create(self):
}],
},
'iamIdentityMappings': [{
'arn': (
f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}'
),
'arn': f'arn:aws:iam::{self.account}:role/KarpenterNodeRole-{self.name}',
'username': 'system:node:{{EC2PrivateDNSName}}',
'groups': ['system:bootstrappers', 'system:nodes'],
}],
Expand Down Expand Up @@ -1021,6 +1023,27 @@ def _PostCreate(self):
# Ensure ALB ingress support: installs AWS Load Balancer Controller.
if FLAGS.eks_install_alb_controller:
self._InstallAwsLoadBalancerController()
# Create AWSServiceRoleForEC2Spot
if self.use_spot:
stdout, stderr, retcode = vm_util.IssueCommand(
[
'aws',
'iam',
'create-service-linked-role',
'--aws-service-name',
'spot.amazonaws.com',
],
raise_on_failure=False,
)

if retcode:
logging.warning(
'Failed to create service linked role spot.amazonaws.com: %s,'
' error: %s',
stdout,
stderr,
)

# Get the AMI version for current kubernetes version.
# See e.g. https://karpenter.sh/docs/tasks/managing-amis/ for not using
# @latest.
Expand Down Expand Up @@ -1055,6 +1078,10 @@ def _PostCreate(self):
'container/karpenter/nodepool.yaml.j2',
CLUSTER_NAME=self.name,
ALIAS_VERSION=alias_version,
USE_SPOT=self.use_spot,
GPU_TYPE=virtual_machine.GPU_TYPE.value
if virtual_machine.GPU_TYPE.value
else None,
)

def _Delete(self):
Expand Down Expand Up @@ -1117,10 +1144,26 @@ def ResizeNodePool(

def GetNodeSelectors(self, machine_type: str | None = None) -> list[str]:
"""Gets the node selectors section of a yaml for the provider."""
machine_family = util.GetMachineFamily(machine_type)
selectors = []

# Machine family selector
machine_family = util.GetMachineFamily(self.default_nodepool.machine_type)
if machine_family:
return [f'karpenter.k8s.aws/instance-family: {machine_family}']
return []
selectors.append(f'karpenter.k8s.aws/instance-family: {machine_family}')
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does specifying an instance family still work when a GPU type is also specified? I think our default arguments would specify both.


# Spot instance selector
if self.use_spot:
selectors.append('karpenter.sh/capacity-type: spot')
else:
selectors.append('karpenter.sh/capacity-type: on-demand')

# GPU selector for Karpenter
if virtual_machine.GPU_TYPE.value:
selectors.append(
f'karpenter.k8s.aws/instance-gpu-name: {virtual_machine.GPU_TYPE.value}'
)

return selectors

def GetNodePoolNames(self) -> list[str]:
"""Gets node pool names for the cluster.
Expand Down