watching md predict/access logs for error

agrimk · agrimk · commit 0a2725cabba9 · 2025-06-26T22:53:36.000+05:30
diff --git a/ads/aqua/modeldeployment/deployment.py b/ads/aqua/modeldeployment/deployment.py
@@ -727,21 +727,33 @@ def _create_deployment(
         ).deploy(wait_for_completion=False)
 
         deployment_id = deployment.id
+
         logger.info(
             f"Aqua model deployment {deployment_id} created for model {aqua_model_id}. Work request Id is {deployment.dsc_model_deployment.workflow_req_id}"
         )
+        status_list = []
+        progress_thread_1 = threading.Thread(
+            target=deployment.watch,
+            args=(status_list),
+            daemon=True,
+        )
+        progress_thread_1.start()
 
-        progress_thread = threading.Thread(
+        progress_thread_2 = threading.Thread(
             target=self.get_deployment_status,
             args=(
                 deployment_id,
                 deployment.dsc_model_deployment.workflow_req_id,
                 model_type,
                 model_name,
+                status_list,
             ),
             daemon=True,
         )
-        progress_thread.start()
+        progress_thread_2.start()
+
+        progress_thread_1.join()
+        progress_thread_2.join()
 
         # we arbitrarily choose last 8 characters of OCID to identify MD in telemetry
         telemetry_kwargs = {"ocid": get_ocid_substring(deployment_id, key_len=8)}
@@ -1237,6 +1249,7 @@ def get_deployment_status(
         work_request_id: str,
         model_type: str,
         model_name: str,
+        status_list: List[str] = [],
     ) -> None:
         """Waits for the data science  model deployment to be completed and log its status in telemetry.
 
@@ -1249,14 +1262,17 @@ def get_deployment_status(
             The work request Id of the model deployment.
         model_type: str
             The type of aqua model to be deployed. Allowed values are: `custom`, `service` and `multi_model`.
+        status_list: List[str]
+            The list of status frmo streams the access and/or predict logs of model deployment.
 
         Returns
         -------
         AquaDeployment
             An Aqua deployment instance.
         """
+
         ocid = get_ocid_substring(model_deployment_id, key_len=8)
-        telemetry_kwargs = {"ocid": ocid}
+        telemetry_kwargs = {"ocid": ocid, "model_name": model_name}
 
         data_science_work_request: DataScienceWorkRequest = DataScienceWorkRequest(
             work_request_id
@@ -1274,18 +1290,21 @@ def get_deployment_status(
                 for error in data_science_work_request._error_message:
                     error_str = error_str + " " + error.message
 
-            self.telemetry.record_event(
-                category=f"aqua/{model_type}/deployment/status",
-                action="FAILED",
-                detail=error_str,
-                value=model_name,
-                **telemetry_kwargs,
-            )
+                status = ""
+                if len(status_list) > 0:
+                    status = status_list[-1]
 
+                telemetry_kwargs["status"] = status
+
+                self.telemetry.record_event(
+                    category=f"aqua/{model_type}/deployment/status",
+                    action="FAILED",
+                    detail=error_str,
+                    **telemetry_kwargs,
+                )
         else:
             self.telemetry.record_event_async(
                 category=f"aqua/{model_type}/deployment/status",
                 action="SUCCEEDED",
-                value=model_name,
                 **telemetry_kwargs,
             )
diff --git a/ads/model/deployment/model_deployment.py b/ads/model/deployment/model_deployment.py
@@ -1,22 +1,27 @@
 #!/usr/bin/env python
-# -*- coding: utf-8; -*-
 
-# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
+# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
 # Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
 
 
 import collections
 import copy
 import datetime
-import oci
-import warnings
 import time
-from typing import Dict, List, Union, Any
+import warnings
+from typing import Any, Dict, List, Union
 
+import oci
 import oci.loggingsearch
-from ads.common import auth as authutil
 import pandas as pd
-from ads.model.serde.model_input import JsonModelInputSERDE
+from oci.data_science.models import (
+    CreateModelDeploymentDetails,
+    LogDetails,
+    UpdateModelDeploymentDetails,
+)
+
+from ads.common import auth as authutil
+from ads.common import utils as ads_utils
 from ads.common.oci_logging import (
     LOG_INTERVAL,
     LOG_RECORDS_LIMIT,
@@ -30,10 +35,10 @@
 from ads.model.deployment.common.utils import send_request
 from ads.model.deployment.model_deployment_infrastructure import (
     DEFAULT_BANDWIDTH_MBPS,
+    DEFAULT_MEMORY_IN_GBS,
+    DEFAULT_OCPUS,
     DEFAULT_REPLICA,
     DEFAULT_SHAPE_NAME,
-    DEFAULT_OCPUS,
-    DEFAULT_MEMORY_IN_GBS,
     MODEL_DEPLOYMENT_INFRASTRUCTURE_TYPE,
     ModelDeploymentInfrastructure,
 )
@@ -45,18 +50,14 @@
     ModelDeploymentRuntimeType,
     OCIModelDeploymentRuntimeType,
 )
+from ads.model.serde.model_input import JsonModelInputSERDE
 from ads.model.service.oci_datascience_model_deployment import (
     OCIDataScienceModelDeployment,
 )
-from ads.common import utils as ads_utils
+
 from .common import utils
 from .common.utils import State
 from .model_deployment_properties import ModelDeploymentProperties
-from oci.data_science.models import (
-    LogDetails,
-    CreateModelDeploymentDetails,
-    UpdateModelDeploymentDetails,
-)
 
 DEFAULT_WAIT_TIME = 1200
 DEFAULT_POLL_INTERVAL = 10
@@ -734,6 +735,7 @@ def watch(
         time_start: datetime = None,
         interval: int = LOG_INTERVAL,
         log_filter: str = None,
+        status_list: List[str] = None,
     ) -> "ModelDeployment":
         """Streams the access and/or predict logs of model deployment.
 
@@ -751,6 +753,8 @@ def watch(
         log_filter : str, optional
             Expression for filtering the logs. This will be the WHERE clause of the query.
             Defaults to None.
+        status_list : List[str], optional
+            List of status of model deployment. This is used to store list of status from logs.
 
         Returns
         -------
@@ -760,6 +764,8 @@ def watch(
         status = ""
         while not self._stop_condition():
             status = self._check_and_print_status(status)
+            if status not in status_list:
+                status_list.append(status)
             time.sleep(interval)
 
         time_start = time_start or self.time_created
@@ -964,7 +970,9 @@ def predict(
         except oci.exceptions.ServiceError as ex:
             # When bandwidth exceeds the allocated value, TooManyRequests error (429) will be raised by oci backend.
             if ex.status == 429:
-                bandwidth_mbps = self.infrastructure.bandwidth_mbps or DEFAULT_BANDWIDTH_MBPS
+                bandwidth_mbps = (
+                    self.infrastructure.bandwidth_mbps or DEFAULT_BANDWIDTH_MBPS
+                )
                 utils.get_logger().warning(
                     f"Load balancer bandwidth exceeds the allocated {bandwidth_mbps} Mbps."
                     "To estimate the actual bandwidth, use formula: (payload size in KB) * (estimated requests per second) * 8 / 1024."
@@ -1644,22 +1652,22 @@ def _build_model_deployment_configuration_details(self) -> Dict:
             }
 
         if infrastructure.subnet_id:
-            instance_configuration[
-                infrastructure.CONST_SUBNET_ID
-            ] = infrastructure.subnet_id
+            instance_configuration[infrastructure.CONST_SUBNET_ID] = (
+                infrastructure.subnet_id
+            )
 
         if infrastructure.private_endpoint_id:
             if not hasattr(
                 oci.data_science.models.InstanceConfiguration, "private_endpoint_id"
             ):
                 # TODO: add oci version with private endpoint support.
-                raise EnvironmentError(
+                raise OSError(
                     "Private endpoint is not supported in the current OCI SDK installed."
                 )
 
-            instance_configuration[
-                infrastructure.CONST_PRIVATE_ENDPOINT_ID
-            ] = infrastructure.private_endpoint_id
+            instance_configuration[infrastructure.CONST_PRIVATE_ENDPOINT_ID] = (
+                infrastructure.private_endpoint_id
+            )
 
         scaling_policy = {
             infrastructure.CONST_POLICY_TYPE: "FIXED_SIZE",
@@ -1704,7 +1712,7 @@ def _build_model_deployment_configuration_details(self) -> Dict:
                 oci.data_science.models,
                 "ModelDeploymentEnvironmentConfigurationDetails",
             ):
-                raise EnvironmentError(
+                raise OSError(
                     "Environment variable hasn't been supported in the current OCI SDK installed."
                 )
 
@@ -1720,9 +1728,9 @@ def _build_model_deployment_configuration_details(self) -> Dict:
             and runtime.inference_server.upper()
             == MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
         ):
-            environment_variables[
-                "CONTAINER_TYPE"
-            ] = MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
+            environment_variables["CONTAINER_TYPE"] = (
+                MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
+            )
             runtime.set_spec(runtime.CONST_ENV, environment_variables)
         environment_configuration_details = {
             runtime.CONST_ENVIRONMENT_CONFIG_TYPE: runtime.environment_config_type,
@@ -1734,17 +1742,17 @@ def _build_model_deployment_configuration_details(self) -> Dict:
                 oci.data_science.models,
                 "OcirModelDeploymentEnvironmentConfigurationDetails",
             ):
-                raise EnvironmentError(
+                raise OSError(
                     "Container runtime hasn't been supported in the current OCI SDK installed."
                 )
             environment_configuration_details["image"] = runtime.image
             environment_configuration_details["imageDigest"] = runtime.image_digest
             environment_configuration_details["cmd"] = runtime.cmd
             environment_configuration_details["entrypoint"] = runtime.entrypoint
             environment_configuration_details["serverPort"] = runtime.server_port
-            environment_configuration_details[
-                "healthCheckPort"
-            ] = runtime.health_check_port
+            environment_configuration_details["healthCheckPort"] = (
+                runtime.health_check_port
+            )
 
         model_deployment_configuration_details = {
             infrastructure.CONST_DEPLOYMENT_TYPE: "SINGLE_MODEL",
@@ -1754,7 +1762,7 @@ def _build_model_deployment_configuration_details(self) -> Dict:
 
         if runtime.deployment_mode == ModelDeploymentMode.STREAM:
             if not hasattr(oci.data_science.models, "StreamConfigurationDetails"):
-                raise EnvironmentError(
+                raise OSError(
                     "Model deployment mode hasn't been supported in the current OCI SDK installed."
                 )
             model_deployment_configuration_details[
@@ -1786,9 +1794,13 @@ def _build_category_log_details(self) -> Dict:
 
         logs = {}
         if (
-            self.infrastructure.access_log and 
-            self.infrastructure.access_log.get(self.infrastructure.CONST_LOG_GROUP_ID, None)
-            and self.infrastructure.access_log.get(self.infrastructure.CONST_LOG_ID, None)
+            self.infrastructure.access_log
+            and self.infrastructure.access_log.get(
+                self.infrastructure.CONST_LOG_GROUP_ID, None
+            )
+            and self.infrastructure.access_log.get(
+                self.infrastructure.CONST_LOG_ID, None
+            )
         ):
             logs[self.infrastructure.CONST_ACCESS] = {
                 self.infrastructure.CONST_LOG_GROUP_ID: self.infrastructure.access_log.get(
@@ -1799,9 +1811,13 @@ def _build_category_log_details(self) -> Dict:
                 ),
             }
         if (
-            self.infrastructure.predict_log and 
-            self.infrastructure.predict_log.get(self.infrastructure.CONST_LOG_GROUP_ID, None)
-            and self.infrastructure.predict_log.get(self.infrastructure.CONST_LOG_ID, None)
+            self.infrastructure.predict_log
+            and self.infrastructure.predict_log.get(
+                self.infrastructure.CONST_LOG_GROUP_ID, None
+            )
+            and self.infrastructure.predict_log.get(
+                self.infrastructure.CONST_LOG_ID, None
+            )
         ):
             logs[self.infrastructure.CONST_PREDICT] = {
                 self.infrastructure.CONST_LOG_GROUP_ID: self.infrastructure.predict_log.get(