Commit
feat: Add --multihost-gpu-node-count to gcloud ai endpoints deploy-model command.

PiperOrigin-RevId: 704835605
vertex-sdk-bot authored and copybara-github committed Jan 31, 2025
1 parent ef596f5 commit 0502892
Showing 2 changed files with 94 additions and 0 deletions.
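In SDK terms, this change threads a new multihost_gpu_node_count argument from Model.deploy and DeploymentResourcePool.create down to MachineSpec. A minimal usage sketch (not part of the commit), mirroring the test constants below and assuming a placeholder project, region, and model ID:

from google.cloud import aiplatform

aiplatform.init(project="my-project", location="us-central1")  # hypothetical project/region

model = aiplatform.Model("my-model-id")  # hypothetical model resource
endpoint = model.deploy(
    machine_type="a3-highgpu-8g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=8,
    multihost_gpu_node_count=2,  # new: nodes per replica for multihost GPU serving
)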
42 changes: 42 additions & 0 deletions google/cloud/aiplatform/models.py
@@ -252,6 +252,7 @@ def create(
reservation_affinity_values: Optional[List[str]] = None,
spot: bool = False,
required_replica_count: Optional[int] = 0,
multihost_gpu_node_count: Optional[int] = None,
) -> "DeploymentResourcePool":
"""Creates a new DeploymentResourcePool.
@@ -332,6 +333,9 @@ def create(
set, the model deploy/mutate operation will succeed once
available_replica_count reaches required_replica_count, and the
rest of the replicas will be retried.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
Returns:
DeploymentResourcePool
@@ -363,6 +367,7 @@ def create(
sync=sync,
create_request_timeout=create_request_timeout,
required_replica_count=required_replica_count,
multihost_gpu_node_count=multihost_gpu_node_count,
)

@classmethod
@@ -389,6 +394,7 @@ def _create(
sync=True,
create_request_timeout: Optional[float] = None,
required_replica_count: Optional[int] = 0,
multihost_gpu_node_count: Optional[int] = None,
) -> "DeploymentResourcePool":
"""Creates a new DeploymentResourcePool.
@@ -472,6 +478,9 @@ def _create(
set, the model deploy/mutate operation will succeed once
available_replica_count reaches required_replica_count, and the
rest of the replicas will be retried.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
Returns:
DeploymentResourcePool
@@ -505,6 +514,7 @@ def _create(
[autoscaling_metric_spec]
)

# TODO(joelletiangco): accelerator_type present here
if accelerator_type and accelerator_count:
utils.validate_accelerator_type(accelerator_type)
machine_spec.accelerator_type = accelerator_type
@@ -1327,6 +1337,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -1399,6 +1410,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -1500,6 +1514,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -1532,6 +1547,7 @@ def _deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -1601,6 +1617,9 @@ def _deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -1686,6 +1705,7 @@ def _deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -1725,6 +1745,7 @@ def _deploy_call(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -1803,6 +1824,9 @@ def _deploy_call(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -2030,6 +2054,9 @@ def _deploy_call(
if tpu_topology is not None:
machine_spec.tpu_topology = tpu_topology

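# Mirror the tpu_topology handling above: populate the new MachineSpec field
# only when the caller supplied it, leaving single-host GPU deployments unchanged.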
if multihost_gpu_node_count is not None:
machine_spec.multihost_gpu_node_count = multihost_gpu_node_count

dedicated_resources.machine_spec = machine_spec
deployed_model.dedicated_resources = dedicated_resources
if fast_tryout_enabled:
@@ -4012,6 +4039,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -4089,6 +4117,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -4190,6 +4221,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -5241,6 +5273,7 @@ def deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
service_account: Optional[str] = None,
explanation_metadata: Optional[aiplatform.explain.ExplanationMetadata] = None,
explanation_parameters: Optional[
@@ -5318,6 +5351,9 @@ def deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
service_account (str):
The service account that the DeployedModel's container runs as. Specify the
email address of the service account. If this service account is not
@@ -5462,6 +5498,7 @@ def deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
@@ -5505,6 +5542,7 @@ def _deploy(
accelerator_type: Optional[str] = None,
accelerator_count: Optional[int] = None,
tpu_topology: Optional[str] = None,
multihost_gpu_node_count: Optional[int] = None,
reservation_affinity_type: Optional[str] = None,
reservation_affinity_key: Optional[str] = None,
reservation_affinity_values: Optional[List[str]] = None,
@@ -5579,6 +5617,9 @@ def _deploy(
tpu_topology (str):
Optional. The TPU topology to use for the DeployedModel.
Required for CloudTPU multihost deployments.
multihost_gpu_node_count (int):
Optional. The number of nodes per replica for multihost GPU DeployedModel.
Required for multihost GPU deployments.
reservation_affinity_type (str):
Optional. The type of reservation affinity.
One of NO_RESERVATION, ANY_RESERVATION, SPECIFIC_RESERVATION,
@@ -5713,6 +5754,7 @@ def _deploy(
accelerator_type=accelerator_type,
accelerator_count=accelerator_count,
tpu_topology=tpu_topology,
multihost_gpu_node_count=multihost_gpu_node_count,
reservation_affinity_type=reservation_affinity_type,
reservation_affinity_key=reservation_affinity_key,
reservation_affinity_values=reservation_affinity_values,
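The same argument is accepted by DeploymentResourcePool.create for models that share serving resources. A minimal sketch (not part of the commit), with a hypothetical pool ID:

from google.cloud.aiplatform import models

pool = models.DeploymentResourcePool.create(
    deployment_resource_pool_id="multihost-gpu-pool",  # hypothetical ID
    machine_type="a3-highgpu-8g",
    accelerator_type="NVIDIA_TESLA_A100",
    accelerator_count=8,
    multihost_gpu_node_count=2,  # new: nodes per replica
)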
52 changes: 52 additions & 0 deletions tests/unit/aiplatform/test_models.py
@@ -146,6 +146,11 @@
_TEST_TPU_MACHINE_TYPE = "ct5lp-hightpu-4t"
_TEST_TPU_TOPOLOGY = "2x2"

_TEST_GPU_MACHINE_TYPE = "a3-highgpu-8g"
_TEST_GPU_ACCELERATOR_TYPE = "NVIDIA_TESLA_A100"
_TEST_GPU_ACCELERATOR_COUNT = 8
_TEST_MULTIHOST_GPU_NODE_COUNT = 2

_TEST_BATCH_SIZE = 16

_TEST_PIPELINE_RESOURCE_NAME = (
@@ -2239,6 +2244,53 @@ def test_deploy_no_endpoint_with_tpu_topology(self, deploy_model_mock, sync):
timeout=None,
)

@pytest.mark.usefixtures(
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
)
@pytest.mark.parametrize("sync", [True, False])
def test_deploy_no_endpoint_with_multihost_gpu_node_count(
    self, deploy_model_mock, sync
):
test_model = models.Model(_TEST_ID)
test_model._gca_resource.supported_deployment_resources_types.append(
aiplatform.gapic.Model.DeploymentResourcesType.DEDICATED_RESOURCES
)
test_endpoint = test_model.deploy(
machine_type=_TEST_GPU_MACHINE_TYPE,
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
sync=sync,
deploy_request_timeout=None,
)

if not sync:
test_endpoint.wait()

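# Build the exact request the GAPIC deploy_model mock should receive; the new
# multihost_gpu_node_count rides on MachineSpec alongside the GPU accelerators.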
expected_machine_spec = gca_machine_resources.MachineSpec(
machine_type=_TEST_GPU_MACHINE_TYPE,
accelerator_type=_TEST_GPU_ACCELERATOR_TYPE,
accelerator_count=_TEST_GPU_ACCELERATOR_COUNT,
multihost_gpu_node_count=_TEST_MULTIHOST_GPU_NODE_COUNT,
)
expected_dedicated_resources = gca_machine_resources.DedicatedResources(
machine_spec=expected_machine_spec,
min_replica_count=1,
max_replica_count=1,
spot=False,
)
expected_deployed_model = gca_endpoint.DeployedModel(
dedicated_resources=expected_dedicated_resources,
model=test_model.resource_name,
display_name=None,
)
deploy_model_mock.assert_called_once_with(
endpoint=test_endpoint.resource_name,
deployed_model=expected_deployed_model,
traffic_split={"0": 100},
metadata=(),
timeout=None,
)


@pytest.mark.usefixtures(
"get_endpoint_mock", "get_model_mock", "create_endpoint_mock"
)
