Commit
feat: Add --multihost-gpu-node-count to gcloud ai endpoints deploy-model command.

PiperOrigin-RevId: 704835605
vertex-sdk-bot authored and copybara-github committed Jan 31, 2025
1 parent ef596f5 commit 0502892
Showing 2 changed files with 94 additions and 0 deletions.
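In SDK terms, this change threads a new multihost_gpu_node_count argument from Model.deploy and DeploymentResourcePool.create down to MachineSpec. A minimal usage sketch (not part of the commit), mirroring the test constants below and assuming a placeholder project, region, and model ID:

from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")  # hypothetical project/region

model = aiplatform.Model("my-model-id")  # hypothetical model resource
endpoint = model.deploy(
    machine_type="a3-highgpu-8g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=8,
    multihost_gpu_node_count=2,  # new: nodes per replica for multihost GPU serving
)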
42 changes: 42 additions & 0 deletions google/cloud/aiplatform/models.py
@@ -252,6 +252,7 @@ def create(
reservation_affinity_values: Optional[List[str]] = None,
spot: bool = False,
required_replica_count: Optional[int] = 0,
multihost_gpu_node_count: Optional[int] = None,
) -> "DeploymentResourcePool":
"""Creates a new DeploymentResourcePool.
@@ -332,6 +333,9 @@ def create(
set, the model deploy/mutate operation will succeed once
available_replica_count reaches required_replica_count, and the
rest of the replicas will be retried.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
Returns:
DeploymentResourcePool
@@ -363,6 +367,7 @@ def create(
sync=sync,
create_request_timeout=create_request_timeout,
required_replica_count=required_replica_count,
multihost_gpu_node_count=multihost_gpu_node_count,
)

@classmethod
@@ -389,6 +394,7 @@ def _create(
sync=True,
create_request_timeout: Optional[float] = None,
required_replica_count: Optional[int] = 0,
multihost_gpu_node_count: Optional[int] = None,
) -> "DeploymentResourcePool":
"""Creates a new DeploymentResourcePool.
@@ -472,6 +478,9 @@ def _create(
set, the model deploy/mutate operation will succeed once
available_replica_count reaches required_replica_count, and the
rest of the replicas will be retried.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
Returns:
DeploymentResourcePool
@@ -505,6 +514,7 @@ def _create(
[autoscaling_metric_spec]
)

# TODO(joelletiangco): accelerator_type present here
if accelerator_type and accelerator_count:
utils.validate_accelerator_type(accelerator_type)
machine_spec.accelerator_type = accelerator_type
@@ -1327,6 +1337,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -1399,6 +1410,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -1500,6 +1514,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -1532,6 +1547,7 @@ def _deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -1601,6 +1617,9 @@ def _deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1686,6 +1705,7 @@ def _deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -1725,6 +1745,7 @@ def _deploy_call(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -1803,6 +1824,9 @@ def _deploy_call(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -2030,6 +2054,9 @@ def _deploy_call(
if tpu_topology is not None:
machine_spec.tpu_topology = tpu_topology

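# Mirror the tpu_topology handling above: populate the new MachineSpec field
# only when the caller supplied it, leaving single-host GPU deployments unchanged.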
if multihost_gpu_node_count is not None:
machine_spec.multihost_gpu_node_count = multihost_gpu_node_count

dedicated_resources.machine_spec = machine_spec
deployed_model.dedicated_resources = dedicated_resources
if fast_tryout_enabled:
@@ -4012,6 +4039,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -4089,6 +4117,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -4190,6 +4221,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -5241,6 +5273,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -5318,6 +5351,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -5462,6 +5498,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -5505,6 +5542,7 @@ def _deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -5579,6 +5617,9 @@ def _deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -5713,6 +5754,7 @@ def _deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
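The same argument is accepted by DeploymentResourcePool.create for models that share serving resources. A minimal sketch (not part of the commit), with a hypothetical pool ID:

from google.cloud.aiplatform import models

pool = models.DeploymentResourcePool.create(
    deployment_resource_pool_id="multihost-gpu-pool",  # hypothetical ID
    machine_type="a3-highgpu-8g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=8,
    multihost_gpu_node_count=2,  # new: nodes per replica
)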
52 changes: 52 additions & 0 deletions tests/unit/aiplatform/test_models.py
@@ -146,6 +146,11 @@
_TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
_TEST_TPU_TOPOLOGY = "2x2"

_TEST_GPU_MACHINE_TYPE = "a3-highgpu-8g"
_TEST_GPU_ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
_TEST_GPU_ACCELERATOR_COUNT = 8
_TEST_MULTIHOST_GPU_NODE_COUNT = 2

_TEST_BATCH_SIZE = 16

_TEST_PIPELINE_RESOURCE_NAME = (
@@ -2239,6 +2244,53 @@ def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
timeout=None,
)

@pytest.mark.usefixtures(
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
)
@pytest.mark.parametrize("sync", [True, False])
def test_deploy_no_endpoint_with_multihost_gpu_node_count(
    self, deploy_model_mock, sync
):
test_model = models.Model(_TEST_ID)
test_model._gca_resource.supported_deployment_resources_types.append(
aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
)
test_endpoint = test_model.deploy(
machine_type=_TEST_GPU_MACHINE_TYPE,
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
sync=sync,
deploy_request_timeout=None,
)

if not sync:
test_endpoint.wait()

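# Build the exact request the GAPIC deploy_model mock should receive; the new
# multihost_gpu_node_count rides on MachineSpec alongside the GPU accelerators.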
expected_machine_spec = gca_machine_resources.MachineSpec(
machine_type=_TEST_GPU_MACHINE_TYPE,
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
)
expected_dedicated_resources = gca_machine_resources.DedicatedResources(
machine_spec=expected_machine_spec,
min_replica_count=1,
max_replica_count=1,
spot=False,
)
expected_deployed_model = gca_endpoint.DeployedModel(
dedicated_resources=expected_dedicated_resources,
model=test_model.resource_name,
display_name=None,
)
deploy_model_mock.assert_called_once_with(
endpoint=test_endpoint.resource_name,
deployed_model=expected_deployed_model,
traffic_split={"0": 100},
metadata=(),
timeout=None,
)


@pytest.mark.usefixtures(
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
)
