Skip to content

Commit 573e540

Browse files
hubatish authored and copybara-github committed
Support custom compute classes in GKE
GKE documentation: https://docs.cloud.google.com/kubernetes-engine/docs/concepts/about-custom-compute-classes
Needed to move the kubectl get-credentials setup to _Create rather than _PostCreate in order to set up the compute class.
PiperOrigin-RevId: 882808151
1 parent 9087064 commit 573e540

File tree

5 files changed

+140
-35
lines changed

5 files changed

+140
-35
lines changed

perfkitbenchmarker/flags.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -100,7 +100,7 @@ def GetCurrentUser():
100100
"types that will be created for benchmarks that don't "
101101
'require a particular type.',
102102
)
103-
flags.DEFINE_list(
103+
K8S_MACHINE_FAMILIES = flags.DEFINE_list(
104104
'k8s_machine_families',
105105
[],
106106
'Machine familes used when creating Kubernetes nodes.',

perfkitbenchmarker/providers/gcp/google_kubernetes_engine.py

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -204,15 +204,14 @@ def _IssueResourceCreationCommand(self, cmd: util.GcloudCommand):
204204
util.CheckGcloudResponseKnownFailures(stderr, retcode)
205205
raise errors.Resource.CreationError(stderr)
206206

207-
def _PostCreate(self):
208-
"""Acquires cluster authentication."""
207+
def _GetKubeconfig(self):
208+
"""Returns the kubeconfig for the cluster."""
209209
cmd = self._GcloudCommand(
210210
'container', 'clusters', 'get-credentials', self.name
211211
)
212212
env = os.environ.copy()
213213
env['KUBECONFIG'] = FLAGS.kubeconfig
214214
cmd.IssueRetryable(env=env)
215-
super()._PostCreate()
216215

217216
def _IsDeleting(self):
218217
cmd = self._GcloudCommand('container', 'clusters', 'describe', self.name)
@@ -253,6 +252,15 @@ def GetNodePoolNames(self) -> list[str]:
253252
stdout, _, _ = cmd.Issue()
254253
return stdout.split()
255254

255+
def _UsesCustomComputeClass(
256+
self, nodepool_config: container.BaseNodePoolConfig
257+
) -> bool:
258+
"""Returns True if the nodepool config uses a custom compute class."""
259+
return bool(
260+
(nodepool_config.machine_type is None and not nodepool_config.cpus)
261+
or nodepool_config.machine_families
262+
)
263+
256264

257265
class GkeCluster(BaseGkeCluster):
258266
"""Class representing a Google Kubernetes Engine cluster."""
@@ -282,9 +290,7 @@ def InitializeNodePoolForCloud(
282290
vm_config: virtual_machine_spec.BaseVmSpec,
283291
nodepool_config: container.BaseNodePoolConfig,
284292
):
285-
vm_config = typing.cast(
286-
gce_virtual_machine.GceVmSpec, vm_config
287-
)
293+
vm_config = typing.cast(gce_virtual_machine.GceVmSpec, vm_config)
288294
nodepool_config.disk_type = vm_config.boot_disk_type
289295
nodepool_config.disk_size = vm_config.boot_disk_size
290296
nodepool_config.max_local_disks = vm_config.max_local_disks
@@ -351,6 +357,8 @@ def _Create(self):
351357
self.default_nodepool,
352358
cmd,
353359
)
360+
if self._UsesCustomComputeClass(self.default_nodepool):
361+
cmd.args.append('--enable-default-compute-class')
354362
enable_autoprovisioning = False
355363
if gcp_flags.MAX_CPU.value:
356364
cmd.flags['max-cpu'] = gcp_flags.MAX_CPU.value
@@ -375,6 +383,8 @@ def _Create(self):
375383
cmd.flags['metadata'] = util.MakeFormattedDefaultTags()
376384

377385
self._RunClusterCreateCommand(cmd)
386+
self._GetKubeconfig()
387+
self._CreateCustomComputeClass(self.default_nodepool)
378388
self._CreateNodePools()
379389

380390
def _CreateNodePools(self):
@@ -388,6 +398,57 @@ def _CreateNodePools(self):
388398
cmd,
389399
)
390400
self._IssueResourceCreationCommand(cmd)
401+
self._CreateCustomComputeClass(nodepool)
402+
403+
def _CreateCustomComputeClass(
404+
self, nodepool_config: container.BaseNodePoolConfig
405+
):
406+
"""Creates a custom compute class for the nodepool."""
407+
if not self._UsesCustomComputeClass(nodepool_config):
408+
return
409+
compute_manifest = {
410+
'apiVersion': 'cloud.google.com/v1',
411+
'kind': 'ComputeClass',
412+
'metadata': {
413+
'name': nodepool_config.name,
414+
},
415+
}
416+
priorities = []
417+
for machine_family in nodepool_config.machine_families:
418+
priorities.append({
419+
'machineFamily': machine_family,
420+
})
421+
is_default_class = (
422+
nodepool_config.name == container_cluster.DEFAULT_NODEPOOL
423+
)
424+
compute_manifest['spec'] = {'priorities': priorities}
425+
if is_default_class:
426+
compute_manifest['spec']['nodePoolAutoCreation'] = {'enabled': True}
427+
kubernetes_commands.ApplyYaml([compute_manifest])
428+
if is_default_class:
429+
return
430+
cmd = self._GcloudCommand(
431+
'container',
432+
'node-pools',
433+
'update',
434+
nodepool_config.name,
435+
'--cluster',
436+
self.name,
437+
'--node-labels',
438+
f'cloud.google.com/compute-class={nodepool_config.name}',
439+
)
440+
cmd.Issue()
441+
cmd = self._GcloudCommand(
442+
'container',
443+
'node-pools',
444+
'update',
445+
nodepool_config.name,
446+
'--cluster',
447+
self.name,
448+
'--node-taints',
449+
f'cloud.google.com/compute-class={nodepool_config.name}:NoSchedule',
450+
)
451+
cmd.Issue()
391452

392453
def _AddNodeParamsToCmd(
393454
self,
@@ -452,12 +513,16 @@ def _AddNodeParamsToCmd(
452513
if nodepool_config.zone:
453514
cmd.flags['node-locations'] = nodepool_config.zone
454515

455-
if nodepool_config.machine_type is None:
516+
if nodepool_config.machine_type:
517+
cmd.flags['machine-type'] = nodepool_config.machine_type
518+
elif nodepool_config.cpus and nodepool_config.memory_mib:
456519
cmd.flags['machine-type'] = 'custom-{}-{}'.format(
457520
nodepool_config.cpus, nodepool_config.memory_mib
458521
)
459522
else:
460-
cmd.flags['machine-type'] = nodepool_config.machine_type
523+
assert (
524+
nodepool_config.machine_families
525+
), 'No machine type nor custom type nor machine family specified.'
461526

462527
if FLAGS.gke_enable_gvnic:
463528
cmd.args.append('--enable-gvnic')
@@ -568,6 +633,7 @@ def _Create(self):
568633
cmd.flags['labels'] = util.MakeFormattedDefaultTags()
569634

570635
self._RunClusterCreateCommand(cmd)
636+
self._GetKubeconfig()
571637

572638
def GetResourceMetadata(self) -> dict[str, Any]:
573639
metadata = super().GetResourceMetadata()

perfkitbenchmarker/resources/container_service/container.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from absl import flags
2020
from perfkitbenchmarker import data
21+
from perfkitbenchmarker import flags as pkb_flags
2122
from perfkitbenchmarker import resource
2223
from perfkitbenchmarker import virtual_machine_spec
2324
from perfkitbenchmarker.configs import container_spec as container_spec_lib
@@ -144,6 +145,18 @@ def __init__(
144145
):
145146
self.machine_type: str | None = vm_spec.machine_type
146147
self.machine_families: list[str] = machine_families or []
148+
if self.machine_families and self.machine_type:
149+
if pkb_flags.K8S_MACHINE_FAMILIES.value:
150+
# Setting machine families via flag will override config specific
151+
# machine type for convenience, but specifying both via config_override
152+
# is a clear error.
153+
self.machine_type = None
154+
else:
155+
raise ValueError(
156+
f'Machine families was set to {self.machine_families}'
157+
f' while machine type was set to {self.machine_type}.'
158+
' Specify only one at a time.'
159+
)
147160
self.zone: str = vm_spec.zone
148161
self.name = NodePoolName(name)
149162
self.sandbox_config: container_spec_lib.SandboxSpec | None = None

perfkitbenchmarker/resources/container_service/container_cluster.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ def _InitializeNodePool(
9797
nodepool_config = container.BaseNodePoolConfig(
9898
vm_config,
9999
name,
100+
nodepool_spec.machine_families,
100101
)
101102
nodepool_config.sandbox_config = nodepool_spec.sandbox_config
102103
nodepool_config.zone = zone

tests/providers/gcp/google_kubernetes_engine_test.py

Lines changed: 51 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,6 @@ def testCreate(self):
173173
with self.patch_critical_objects() as issue_command:
174174
cluster = google_kubernetes_engine.GkeCluster(spec)
175175
cluster._Create()
176-
177176
self.assertIn(
178177
'gcloud container clusters create', issue_command.all_commands
179178
)
@@ -258,21 +257,22 @@ def testCreateResourcesExhausted(self):
258257
):
259258
cluster._Create()
260259

261-
def testPostCreate(self):
260+
def testGetCredentials(self):
262261
spec = self.create_kubernetes_engine_spec()
263262
with self.patch_critical_objects() as issue_command, mock.patch.object(
264263
kubectl, 'RunKubectlCommand'
265264
) as mock_kubectl_command:
266265
cluster = google_kubernetes_engine.GkeCluster(spec)
266+
cluster._Create()
267267
cluster._PostCreate()
268-
269268
self.assertIn(
270269
'gcloud container clusters get-credentials pkb-{}'.format(_RUN_URI),
271270
issue_command.all_commands,
272271
)
273272
self.assertIn(
274273
'KUBECONFIG', issue_command.func_to_mock.call_args[1]['env']
275274
)
275+
276276
self.assertEqual(mock_kubectl_command.call_count, 1)
277277

278278
def testDelete(self):
@@ -529,27 +529,6 @@ def testCreateGpuH100(self):
529529
issue_command.all_commands,
530530
)
531531

532-
@mock.patch(
533-
'perfkitbenchmarker.resources.container_service.kubernetes_commands.CreateFromFile'
534-
)
535-
def testPostCreate(self, _):
536-
spec = self.create_kubernetes_engine_spec('k80')
537-
with self.patch_critical_objects() as issue_command, mock.patch.object(
538-
kubectl, 'RunKubectlCommand'
539-
) as mock_kubectl_command:
540-
cluster = google_kubernetes_engine.GkeCluster(spec)
541-
cluster._PostCreate()
542-
543-
self.assertIn(
544-
'gcloud container clusters get-credentials pkb-{}'.format(_RUN_URI),
545-
issue_command.all_commands,
546-
)
547-
self.assertIn(
548-
'KUBECONFIG', issue_command.func_to_mock.call_args[1]['env']
549-
)
550-
551-
self.assertEqual(mock_kubectl_command.call_count, 1)
552-
553532

554533
class GoogleKubernetesEngineGetNodesTestCase(GoogleKubernetesEngineTestCase):
555534

@@ -644,7 +623,7 @@ def testCreateRegionalCluster(self):
644623
with self.patch_critical_objects() as issue_command:
645624
cluster = google_kubernetes_engine.GkeCluster(spec)
646625
cluster._Create()
647-
create_cluster, create_nodepool1, create_nodepool2 = (
626+
create_cluster, _, create_nodepool1, create_nodepool2 = (
648627
call[0][0] for call in issue_command.func_to_mock.call_args_list
649628
)
650629
self.assertNotIn('--zone', create_cluster)
@@ -693,7 +672,7 @@ def testCreateRegionalClusterZonalNodepool(self):
693672
with self.patch_critical_objects() as issue_command:
694673
cluster = google_kubernetes_engine.GkeCluster(spec)
695674
cluster._Create()
696-
create_cluster, create_nodepool1, create_nodepool2 = (
675+
create_cluster, _, create_nodepool1, create_nodepool2 = (
697676
call[0][0] for call in issue_command.func_to_mock.call_args_list
698677
)
699678
self.assertNotIn('--zone', create_cluster)
@@ -726,6 +705,52 @@ def testCreateRegionalClusterZonalNodepool(self):
726705
)
727706

728707

708+
class GoogleKubernetesEngineMachineFamiliesTestCase(PatchedObjectsTestCase):
709+
710+
@staticmethod
711+
def create_kubernetes_engine_spec():
712+
kubernetes_engine_spec = container_spec.ContainerClusterSpec(
713+
'NAME',
714+
**{
715+
'cloud': 'GCP',
716+
'vm_spec': {
717+
'GCP': {
718+
'machine_type': 'fake-machine-type',
719+
'zone': 'us-central1-a',
720+
},
721+
},
722+
'nodepools': {
723+
'nodepool1': {
724+
'vm_spec': {
725+
'GCP': {
726+
'machine_type': '',
727+
'zone': 'us-central1-a',
728+
},
729+
},
730+
'machine_families': ['n2'],
731+
},
732+
},
733+
},
734+
)
735+
return kubernetes_engine_spec
736+
737+
def testCreateWithMachineFamilies(self):
738+
spec = self.create_kubernetes_engine_spec()
739+
with self.patch_critical_objects() as issue_command, mock.patch.object(
740+
kubernetes_commands, 'ApplyYaml'
741+
):
742+
cluster = google_kubernetes_engine.GkeCluster(spec)
743+
cluster._Create()
744+
self.assertIn(
745+
'gcloud container node-pools update nodepool1',
746+
issue_command.all_commands,
747+
)
748+
self.assertIn(
749+
'--node-labels cloud.google.com/compute-class=nodepool1',
750+
issue_command.all_commands,
751+
)
752+
753+
729754
class GoogleKubernetesEngineAutopilotTestCase(PatchedObjectsTestCase):
730755

731756
@staticmethod

0 commit comments

Comments
 (0)