diff --git a/perfkitbenchmarker/data/container/kubernetes_ai_inference/aws-gpu-nodepool.yaml.j2 b/perfkitbenchmarker/data/container/kubernetes_ai_inference/aws-gpu-nodepool.yaml.j2
new file mode 100644
index 0000000000..5f0fc870ef
--- /dev/null
+++ b/perfkitbenchmarker/data/container/kubernetes_ai_inference/aws-gpu-nodepool.yaml.j2
@@ -0,0 +1,27 @@
+apiVersion: karpenter.sh/v1
+kind: NodePool
+metadata:
+  name: {{ gpu_nodepool_name | default('gpu') }}
+spec:
+  disruption:
+    consolidateAfter: {{ gpu_consolidate_after | default('1h') }}
+    consolidationPolicy: {{ gpu_consolidation_policy | default('WhenEmpty') }}
+  template:
+    spec:
+      nodeClassRef:
+        group: eks.amazonaws.com
+        kind: NodeClass
+        name: {{ karpenter_nodeclass_name | default('default') }}
+      requirements:
+        - key: karpenter.sh/capacity-type
+          operator: In
+          values: {{ gpu_capacity_types | default(['on-demand']) }}
+        - key: kubernetes.io/arch
+          operator: In
+          values: {{ gpu_arch | default(['amd64']) }}
+        - key: eks.amazonaws.com/instance-family
+          operator: In
+          values: {{ gpu_instance_families | default(['g6', 'g6e']) }}
+      taints:
+        - key: {{ gpu_taint_key | default('nvidia.com/gpu') }}
+          effect: NoSchedule
diff --git a/perfkitbenchmarker/data/container/kubernetes_ai_inference/serving_catalog_cli.yaml.j2 b/perfkitbenchmarker/data/container/kubernetes_ai_inference/serving_catalog_cli.yaml.j2
index d69469b791..f8c185bf3e 100644
--- a/perfkitbenchmarker/data/container/kubernetes_ai_inference/serving_catalog_cli.yaml.j2
+++ b/perfkitbenchmarker/data/container/kubernetes_ai_inference/serving_catalog_cli.yaml.j2
@@ -16,6 +16,10 @@ spec:
         value: {{generate_args}}
       - name: OUTPUT_FILE
         value: /output/output.yaml
+      - name: WG_SERVING_REPO_URL
+        value: "{{ wg_serving_repo_url }}"
+      - name: WG_SERVING_REPO_BRANCH
+        value: "{{ wg_serving_repo_branch }}"
       volumeMounts:
       - name: output
         mountPath: /output
diff --git a/perfkitbenchmarker/resources/kubernetes/wg_serving_inference_server.py b/perfkitbenchmarker/resources/kubernetes/wg_serving_inference_server.py
index 8f42cebca7..8a4f634747 100644
--- a/perfkitbenchmarker/resources/kubernetes/wg_serving_inference_server.py
+++ b/perfkitbenchmarker/resources/kubernetes/wg_serving_inference_server.py
@@ -51,6 +51,18 @@
     'The GCS bucket that has model data for inference server to use.',
 )
 
+WG_SERVING_REPO_URL = flags.DEFINE_string(
+    'wg_serving_repo_url',
+    'https://github.com/kubernetes-sigs/wg-serving',
+    'URL of the WG Serving repository.',
+)
+
+WG_SERVING_REPO_BRANCH = flags.DEFINE_string(
+    'wg_serving_repo_branch',
+    'main',
+    'Branch of the WG Serving repository.',
+)
+
 
 @dataclasses.dataclass
 class PodStartupMetrics:
@@ -569,6 +581,19 @@ def _InjectDefaultHuggingfaceToken(self) -> None:
 
   def _GetInferenceServerManifest(self) -> str:
     """Generates and retrieves the inference server manifest content."""
+    # Ensure GPU capacity exists on AWS before scheduling GPU workloads.
+    if FLAGS.cloud == 'AWS':
+      self.cluster.ApplyManifest(
+          'container/kubernetes_ai_inference/aws-gpu-nodepool.yaml.j2',
+          gpu_nodepool_name='gpu',
+          gpu_consolidate_after='1h',
+          gpu_consolidation_policy='WhenEmpty',
+          karpenter_nodeclass_name='default',  # NodeClass must already exist.
+          gpu_capacity_types=['on-demand'],
+          gpu_arch=['amd64'],
+          gpu_instance_families=['g6', 'g6e'],
+          gpu_taint_key='nvidia.com/gpu',
+      )
     generate_args = {
         'kind': 'core/deployment',
         'model-server': self.spec.model_server,
@@ -581,6 +606,8 @@ def _GetInferenceServerManifest(self) -> str:
     self.cluster.ApplyManifest(
         'container/kubernetes_ai_inference/serving_catalog_cli.yaml.j2',
         image_repo=FLAG_IMAGE_REPO.value,
+        wg_serving_repo_url=WG_SERVING_REPO_URL.value,
+        wg_serving_repo_branch=WG_SERVING_REPO_BRANCH.value,
         generate_args=' '.join(
             [f'--{k} {v}' for k, v in generate_args.items()]
         ),
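
Side note on the template defaults (not part of the patch): every value the AWS branch of _GetInferenceServerManifest passes to ApplyManifest mirrors a default(...) fallback in aws-gpu-nodepool.yaml.j2, so the NodePool renders identically even if a kwarg is omitted. Below is a minimal sketch of how the Jinja2 default filter and list serialization behave, assuming only the jinja2 package (already a PKB dependency for manifest rendering); the override values are hypothetical.

    import jinja2

    snippet = "values: {{ gpu_instance_families | default(['g6', 'g6e']) }}"
    template = jinja2.Environment().from_string(snippet)

    # With no variable supplied, the default list is used and is
    # serialized as a YAML flow sequence.
    print(template.render())
    # values: ['g6', 'g6e']

    # A kwarg passed through ApplyManifest (here a hypothetical switch
    # to P-family instances) takes precedence over the default.
    print(template.render(gpu_instance_families=['p4d', 'p5']))
    # values: ['p4d', 'p5']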
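
Similarly, a sketch of how the generate_args dict is flattened into the single string injected as {{generate_args}} in serving_catalog_cli.yaml.j2. The join expression is copied from the patch; the model-server value ('vllm') is only an illustrative stand-in for self.spec.model_server.

    # Build the CLI argument string the way _GetInferenceServerManifest does.
    generate_args = {
        'kind': 'core/deployment',
        'model-server': 'vllm',  # illustrative; really self.spec.model_server
    }
    print(' '.join([f'--{k} {v}' for k, v in generate_args.items()]))
    # --kind core/deployment --model-server vllm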