Skip to content

Commit 756515a

Browse files
CoderHam and hemant-co
authored
create endpoint with InferenceAmiVersion (#602)
* x * pickup changes from cohere-ai/cohere-aws#196 * misc cleanup and import fixes --------- Co-authored-by: CoderHam <[email protected]>
1 parent 044344f commit 756515a

File tree

2 files changed

+82
-31
lines changed

2 files changed

+82
-31
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ dist/
33
__pycache__/
44
poetry.toml
55
.ruff_cache/
6+
.venv/

src/cohere/manually_maintained/cohere_aws/client.py

Lines changed: 81 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
import tarfile
44
import tempfile
55
import time
6-
from typing import Any, Dict, List, Optional, Tuple, Union
6+
from typing import Any, Dict, List, Optional, Union
77

88
from .classification import Classification, Classifications
99
from .embeddings import Embeddings
1010
from .error import CohereError
11-
from .generation import (Generation, Generations,
12-
StreamingGenerations,
13-
TokenLikelihood)
11+
from .generation import Generations, StreamingGenerations
1412
from .chat import Chat, StreamingChat
1513
from .rerank import Reranking
1614
from .summary import Summary
@@ -135,7 +133,7 @@ def create_endpoint(
135133
instance_type (str, optional): The EC2 instance type to deploy the endpoint to. Defaults to "ml.g4dn.xlarge".
136134
n_instances (int, optional): Number of endpoint instances. Defaults to 1.
137135
recreate (bool, optional): Force re-creation of endpoint if it already exists. Defaults to False.
138-
rool (str, optional): The IAM role to use for the endpoint. If not provided, sagemaker.get_execution_role()
136+
role (str, optional): The IAM role to use for the endpoint. If not provided, sagemaker.get_execution_role()
139137
will be used to get the role. This should work when one uses the client inside SageMaker. If this errors
140138
out, the default role "ServiceRoleSagemaker" will be used, which generally works outside of SageMaker.
141139
"""
@@ -150,6 +148,7 @@ def create_endpoint(
150148
kwargs = {}
151149
model_data = None
152150
validation_params = dict()
151+
useBoto = False
153152
if s3_models_dir is not None:
154153
# If s3_models_dir is given, we assume to have custom fine-tuned models -> Algorithm
155154
kwargs["algorithm_arn"] = arn
@@ -163,6 +162,7 @@ def create_endpoint(
163162
model_data_download_timeout=2400,
164163
container_startup_health_check_timeout=2400
165164
)
165+
useBoto = True
166166

167167
# Out of precaution, check if there is an endpoint config and delete it if that's the case
168168
# Otherwise it might block deployment
@@ -171,30 +171,80 @@ def create_endpoint(
171171
except lazy_botocore().ClientError:
172172
pass
173173

174+
try:
175+
self._service_client.delete_model(ModelName=endpoint_name)
176+
except lazy_botocore().ClientError:
177+
pass
178+
174179
if role is None:
175-
try:
176-
role = lazy_sagemaker().get_execution_role()
177-
except ValueError:
178-
print("Using default role: 'ServiceRoleSagemaker'.")
179-
role = "ServiceRoleSagemaker"
180+
if useBoto:
181+
accountID = lazy_sagemaker().account_id()
182+
role = f"arn:aws:iam::{accountID}:role/ServiceRoleSagemaker"
183+
else:
184+
try:
185+
role = lazy_sagemaker().get_execution_role()
186+
except ValueError:
187+
print("Using default role: 'ServiceRoleSagemaker'.")
188+
role = "ServiceRoleSagemaker"
180189

181-
model = lazy_sagemaker().ModelPackage(
182-
role=role,
183-
model_data=model_data,
184-
sagemaker_session=self._sess, # makes sure the right region is used
185-
**kwargs
186-
)
190+
# deploy fine-tuned model using sagemaker SDK
191+
if s3_models_dir is not None:
192+
model = lazy_sagemaker().ModelPackage(
193+
role=role,
194+
model_data=model_data,
195+
sagemaker_session=self._sess, # makes sure the right region is used
196+
**kwargs
197+
)
187198

188-
try:
189-
model.deploy(
190-
n_instances,
191-
instance_type,
192-
endpoint_name=endpoint_name,
193-
**validation_params
199+
try:
200+
model.deploy(
201+
n_instances,
202+
instance_type,
203+
endpoint_name=endpoint_name,
204+
**validation_params
205+
)
206+
except lazy_botocore().ParamValidationError:
207+
# For at least some versions of python 3.6, SageMaker SDK does not support the validation_params
208+
model.deploy(n_instances, instance_type, endpoint_name=endpoint_name)
209+
else:
210+
# deploy pre-trained model using boto to add InferenceAmiVersion
211+
self._service_client.create_model(
212+
ModelName=endpoint_name,
213+
ExecutionRoleArn=role,
214+
EnableNetworkIsolation=True,
215+
PrimaryContainer={
216+
'ModelPackageName': arn,
217+
},
218+
)
219+
self._service_client.create_endpoint_config(
220+
EndpointConfigName=endpoint_name,
221+
ProductionVariants=[
222+
{
223+
'VariantName': 'AllTraffic',
224+
'ModelName': endpoint_name,
225+
'InstanceType': instance_type,
226+
'InitialInstanceCount': n_instances,
227+
'InferenceAmiVersion': 'al2-ami-sagemaker-inference-gpu-2'
228+
},
229+
],
194230
)
195-
except lazy_botocore().ParamValidationError:
196-
# For at least some versions of python 3.6, SageMaker SDK does not support the validation_params
197-
model.deploy(n_instances, instance_type, endpoint_name=endpoint_name)
231+
self._service_client.create_endpoint(
232+
EndpointName=endpoint_name,
233+
EndpointConfigName=endpoint_name,
234+
)
235+
236+
waiter = self._service_client.get_waiter('endpoint_in_service')
237+
try:
238+
print(f"Waiting for endpoint {endpoint_name} to be in service...")
239+
waiter.wait(
240+
EndpointName=endpoint_name,
241+
WaiterConfig={
242+
'Delay': 30,
243+
'MaxAttempts': 80
244+
}
245+
)
246+
except Exception as e:
247+
raise CohereError(f"Failed to create endpoint: {e}")
198248
self.connect_to_endpoint(endpoint_name)
199249

200250
def chat(
@@ -725,12 +775,12 @@ def create_finetune(
725775
s3_resource = lazy_boto3().resource("s3")
726776

727777
# Copy new model to root of output_model_dir
728-
bucket, old_key = parse_s3_url(current_filepath)
729-
_, new_key = parse_s3_url(f"{s3_models_dir}{name}.tar.gz")
778+
bucket, old_key = lazy_sagemaker().s3.parse_s3_url(current_filepath)
779+
_, new_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_models_dir}{name}.tar.gz")
730780
s3_resource.Object(bucket, new_key).copy(CopySource={"Bucket": bucket, "Key": old_key})
731781

732782
# Delete old dir
733-
bucket, old_short_key = parse_s3_url(s3_models_dir + job_name)
783+
bucket, old_short_key = lazy_sagemaker().s3.parse_s3_url(s3_models_dir + job_name)
734784
s3_resource.Bucket(bucket).objects.filter(Prefix=old_short_key).delete()
735785

736786
def export_finetune(
@@ -791,12 +841,12 @@ def export_finetune(
791841
s3_resource = lazy_boto3().resource("s3")
792842

793843
# Copy the exported TensorRT-LLM engine to the root of s3_output_dir
794-
bucket, old_key = parse_s3_url(current_filepath)
795-
_, new_key = parse_s3_url(f"{s3_output_dir}{name}.tar.gz")
844+
bucket, old_key = lazy_sagemaker().s3.parse_s3_url(current_filepath)
845+
_, new_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_output_dir}{name}.tar.gz")
796846
s3_resource.Object(bucket, new_key).copy(CopySource={"Bucket": bucket, "Key": old_key})
797847

798848
# Delete the old S3 directory
799-
bucket, old_short_key = parse_s3_url(f"{s3_output_dir}{job_name}")
849+
bucket, old_short_key = lazy_sagemaker().s3.parse_s3_url(f"{s3_output_dir}{job_name}")
800850
s3_resource.Bucket(bucket).objects.filter(Prefix=old_short_key).delete()
801851

802852
def wait_for_finetune_job(self, job_id: str, timeout: int = 2*60*60) -> str:

0 commit comments

Comments (0)