Skip to content

Commit 0a2725c

Browse files
committed
watching md predict/access logs for error
1 parent 6eeb6cb commit 0a2725c

File tree

2 files changed

+84
-49
lines changed

2 files changed

+84
-49
lines changed

ads/aqua/modeldeployment/deployment.py

Lines changed: 30 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -727,21 +727,33 @@ def _create_deployment(
727727
).deploy(wait_for_completion=False)
728728

729729
deployment_id = deployment.id
730+
730731
logger.info(
731732
f"Aqua model deployment {deployment_id} created for model {aqua_model_id}. Work request Id is {deployment.dsc_model_deployment.workflow_req_id}"
732733
)
734+
status_list = []
735+
progress_thread_1 = threading.Thread(
736+
target=deployment.watch,
737+
args=(status_list),
738+
daemon=True,
739+
)
740+
progress_thread_1.start()
733741

734-
progress_thread = threading.Thread(
742+
progress_thread_2 = threading.Thread(
735743
target=self.get_deployment_status,
736744
args=(
737745
deployment_id,
738746
deployment.dsc_model_deployment.workflow_req_id,
739747
model_type,
740748
model_name,
749+
status_list,
741750
),
742751
daemon=True,
743752
)
744-
progress_thread.start()
753+
progress_thread_2.start()
754+
755+
progress_thread_1.join()
756+
progress_thread_2.join()
745757

746758
# we arbitrarily choose last 8 characters of OCID to identify MD in telemetry
747759
telemetry_kwargs = {"ocid": get_ocid_substring(deployment_id, key_len=8)}
@@ -1237,6 +1249,7 @@ def get_deployment_status(
12371249
work_request_id: str,
12381250
model_type: str,
12391251
model_name: str,
1252+
status_list: List[str] = [],
12401253
) -> None:
12411254
"""Waits for the data science model deployment to be completed and log its status in telemetry.
12421255
@@ -1249,14 +1262,17 @@ def get_deployment_status(
12491262
The work request Id of the model deployment.
12501263
model_type: str
12511264
The type of aqua model to be deployed. Allowed values are: `custom`, `service` and `multi_model`.
1265+
status_list: List[str]
1266+
The list of status frmo streams the access and/or predict logs of model deployment.
12521267
12531268
Returns
12541269
-------
12551270
AquaDeployment
12561271
An Aqua deployment instance.
12571272
"""
1273+
12581274
ocid = get_ocid_substring(model_deployment_id, key_len=8)
1259-
telemetry_kwargs = {"ocid": ocid}
1275+
telemetry_kwargs = {"ocid": ocid, "model_name": model_name}
12601276

12611277
data_science_work_request: DataScienceWorkRequest = DataScienceWorkRequest(
12621278
work_request_id
@@ -1274,18 +1290,21 @@ def get_deployment_status(
12741290
for error in data_science_work_request._error_message:
12751291
error_str = error_str + " " + error.message
12761292

1277-
self.telemetry.record_event(
1278-
category=f"aqua/{model_type}/deployment/status",
1279-
action="FAILED",
1280-
detail=error_str,
1281-
value=model_name,
1282-
**telemetry_kwargs,
1283-
)
1293+
status = ""
1294+
if len(status_list) > 0:
1295+
status = status_list[-1]
12841296

1297+
telemetry_kwargs["status"] = status
1298+
1299+
self.telemetry.record_event(
1300+
category=f"aqua/{model_type}/deployment/status",
1301+
action="FAILED",
1302+
detail=error_str,
1303+
**telemetry_kwargs,
1304+
)
12851305
else:
12861306
self.telemetry.record_event_async(
12871307
category=f"aqua/{model_type}/deployment/status",
12881308
action="SUCCEEDED",
1289-
value=model_name,
12901309
**telemetry_kwargs,
12911310
)

ads/model/deployment/model_deployment.py

Lines changed: 54 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,27 @@
11
#!/usr/bin/env python
2-
# -*- coding: utf-8; -*-
32

4-
# Copyright (c) 2021, 2023 Oracle and/or its affiliates.
3+
# Copyright (c) 2021, 2025 Oracle and/or its affiliates.
54
# Licensed under the Universal Permissive License v 1.0 as shown at https://oss.oracle.com/licenses/upl/
65

76

87
import collections
98
import copy
109
import datetime
11-
import oci
12-
import warnings
1310
import time
14-
from typing import Dict, List, Union, Any
11+
import warnings
12+
from typing import Any, Dict, List, Union
1513

14+
import oci
1615
import oci.loggingsearch
17-
from ads.common import auth as authutil
1816
import pandas as pd
19-
from ads.model.serde.model_input import JsonModelInputSERDE
17+
from oci.data_science.models import (
18+
CreateModelDeploymentDetails,
19+
LogDetails,
20+
UpdateModelDeploymentDetails,
21+
)
22+
23+
from ads.common import auth as authutil
24+
from ads.common import utils as ads_utils
2025
from ads.common.oci_logging import (
2126
LOG_INTERVAL,
2227
LOG_RECORDS_LIMIT,
@@ -30,10 +35,10 @@
3035
from ads.model.deployment.common.utils import send_request
3136
from ads.model.deployment.model_deployment_infrastructure import (
3237
DEFAULT_BANDWIDTH_MBPS,
38+
DEFAULT_MEMORY_IN_GBS,
39+
DEFAULT_OCPUS,
3340
DEFAULT_REPLICA,
3441
DEFAULT_SHAPE_NAME,
35-
DEFAULT_OCPUS,
36-
DEFAULT_MEMORY_IN_GBS,
3742
MODEL_DEPLOYMENT_INFRASTRUCTURE_TYPE,
3843
ModelDeploymentInfrastructure,
3944
)
@@ -45,18 +50,14 @@
4550
ModelDeploymentRuntimeType,
4651
OCIModelDeploymentRuntimeType,
4752
)
53+
from ads.model.serde.model_input import JsonModelInputSERDE
4854
from ads.model.service.oci_datascience_model_deployment import (
4955
OCIDataScienceModelDeployment,
5056
)
51-
from ads.common import utils as ads_utils
57+
5258
from .common import utils
5359
from .common.utils import State
5460
from .model_deployment_properties import ModelDeploymentProperties
55-
from oci.data_science.models import (
56-
LogDetails,
57-
CreateModelDeploymentDetails,
58-
UpdateModelDeploymentDetails,
59-
)
6061

6162
DEFAULT_WAIT_TIME = 1200
6263
DEFAULT_POLL_INTERVAL = 10
@@ -734,6 +735,7 @@ def watch(
734735
time_start: datetime = None,
735736
interval: int = LOG_INTERVAL,
736737
log_filter: str = None,
738+
status_list: List[str] = None,
737739
) -> "ModelDeployment":
738740
"""Streams the access and/or predict logs of model deployment.
739741
@@ -751,6 +753,8 @@ def watch(
751753
log_filter : str, optional
752754
Expression for filtering the logs. This will be the WHERE clause of the query.
753755
Defaults to None.
756+
status_list : List[str], optional
757+
List of status of model deployment. This is used to store list of status from logs.
754758
755759
Returns
756760
-------
@@ -760,6 +764,8 @@ def watch(
760764
status = ""
761765
while not self._stop_condition():
762766
status = self._check_and_print_status(status)
767+
if status not in status_list:
768+
status_list.append(status)
763769
time.sleep(interval)
764770

765771
time_start = time_start or self.time_created
@@ -964,7 +970,9 @@ def predict(
964970
except oci.exceptions.ServiceError as ex:
965971
# When bandwidth exceeds the allocated value, TooManyRequests error (429) will be raised by oci backend.
966972
if ex.status == 429:
967-
bandwidth_mbps = self.infrastructure.bandwidth_mbps or DEFAULT_BANDWIDTH_MBPS
973+
bandwidth_mbps = (
974+
self.infrastructure.bandwidth_mbps or DEFAULT_BANDWIDTH_MBPS
975+
)
968976
utils.get_logger().warning(
969977
f"Load balancer bandwidth exceeds the allocated {bandwidth_mbps} Mbps."
970978
"To estimate the actual bandwidth, use formula: (payload size in KB) * (estimated requests per second) * 8 / 1024."
@@ -1644,22 +1652,22 @@ def _build_model_deployment_configuration_details(self) -> Dict:
16441652
}
16451653

16461654
if infrastructure.subnet_id:
1647-
instance_configuration[
1648-
infrastructure.CONST_SUBNET_ID
1649-
] = infrastructure.subnet_id
1655+
instance_configuration[infrastructure.CONST_SUBNET_ID] = (
1656+
infrastructure.subnet_id
1657+
)
16501658

16511659
if infrastructure.private_endpoint_id:
16521660
if not hasattr(
16531661
oci.data_science.models.InstanceConfiguration, "private_endpoint_id"
16541662
):
16551663
# TODO: add oci version with private endpoint support.
1656-
raise EnvironmentError(
1664+
raise OSError(
16571665
"Private endpoint is not supported in the current OCI SDK installed."
16581666
)
16591667

1660-
instance_configuration[
1661-
infrastructure.CONST_PRIVATE_ENDPOINT_ID
1662-
] = infrastructure.private_endpoint_id
1668+
instance_configuration[infrastructure.CONST_PRIVATE_ENDPOINT_ID] = (
1669+
infrastructure.private_endpoint_id
1670+
)
16631671

16641672
scaling_policy = {
16651673
infrastructure.CONST_POLICY_TYPE: "FIXED_SIZE",
@@ -1704,7 +1712,7 @@ def _build_model_deployment_configuration_details(self) -> Dict:
17041712
oci.data_science.models,
17051713
"ModelDeploymentEnvironmentConfigurationDetails",
17061714
):
1707-
raise EnvironmentError(
1715+
raise OSError(
17081716
"Environment variable hasn't been supported in the current OCI SDK installed."
17091717
)
17101718

@@ -1720,9 +1728,9 @@ def _build_model_deployment_configuration_details(self) -> Dict:
17201728
and runtime.inference_server.upper()
17211729
== MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
17221730
):
1723-
environment_variables[
1724-
"CONTAINER_TYPE"
1725-
] = MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
1731+
environment_variables["CONTAINER_TYPE"] = (
1732+
MODEL_DEPLOYMENT_INFERENCE_SERVER_TRITON
1733+
)
17261734
runtime.set_spec(runtime.CONST_ENV, environment_variables)
17271735
environment_configuration_details = {
17281736
runtime.CONST_ENVIRONMENT_CONFIG_TYPE: runtime.environment_config_type,
@@ -1734,17 +1742,17 @@ def _build_model_deployment_configuration_details(self) -> Dict:
17341742
oci.data_science.models,
17351743
"OcirModelDeploymentEnvironmentConfigurationDetails",
17361744
):
1737-
raise EnvironmentError(
1745+
raise OSError(
17381746
"Container runtime hasn't been supported in the current OCI SDK installed."
17391747
)
17401748
environment_configuration_details["image"] = runtime.image
17411749
environment_configuration_details["imageDigest"] = runtime.image_digest
17421750
environment_configuration_details["cmd"] = runtime.cmd
17431751
environment_configuration_details["entrypoint"] = runtime.entrypoint
17441752
environment_configuration_details["serverPort"] = runtime.server_port
1745-
environment_configuration_details[
1746-
"healthCheckPort"
1747-
] = runtime.health_check_port
1753+
environment_configuration_details["healthCheckPort"] = (
1754+
runtime.health_check_port
1755+
)
17481756

17491757
model_deployment_configuration_details = {
17501758
infrastructure.CONST_DEPLOYMENT_TYPE: "SINGLE_MODEL",
@@ -1754,7 +1762,7 @@ def _build_model_deployment_configuration_details(self) -> Dict:
17541762

17551763
if runtime.deployment_mode == ModelDeploymentMode.STREAM:
17561764
if not hasattr(oci.data_science.models, "StreamConfigurationDetails"):
1757-
raise EnvironmentError(
1765+
raise OSError(
17581766
"Model deployment mode hasn't been supported in the current OCI SDK installed."
17591767
)
17601768
model_deployment_configuration_details[
@@ -1786,9 +1794,13 @@ def _build_category_log_details(self) -> Dict:
17861794

17871795
logs = {}
17881796
if (
1789-
self.infrastructure.access_log and
1790-
self.infrastructure.access_log.get(self.infrastructure.CONST_LOG_GROUP_ID, None)
1791-
and self.infrastructure.access_log.get(self.infrastructure.CONST_LOG_ID, None)
1797+
self.infrastructure.access_log
1798+
and self.infrastructure.access_log.get(
1799+
self.infrastructure.CONST_LOG_GROUP_ID, None
1800+
)
1801+
and self.infrastructure.access_log.get(
1802+
self.infrastructure.CONST_LOG_ID, None
1803+
)
17921804
):
17931805
logs[self.infrastructure.CONST_ACCESS] = {
17941806
self.infrastructure.CONST_LOG_GROUP_ID: self.infrastructure.access_log.get(
@@ -1799,9 +1811,13 @@ def _build_category_log_details(self) -> Dict:
17991811
),
18001812
}
18011813
if (
1802-
self.infrastructure.predict_log and
1803-
self.infrastructure.predict_log.get(self.infrastructure.CONST_LOG_GROUP_ID, None)
1804-
and self.infrastructure.predict_log.get(self.infrastructure.CONST_LOG_ID, None)
1814+
self.infrastructure.predict_log
1815+
and self.infrastructure.predict_log.get(
1816+
self.infrastructure.CONST_LOG_GROUP_ID, None
1817+
)
1818+
and self.infrastructure.predict_log.get(
1819+
self.infrastructure.CONST_LOG_ID, None
1820+
)
18051821
):
18061822
logs[self.infrastructure.CONST_PREDICT] = {
18071823
self.infrastructure.CONST_LOG_GROUP_ID: self.infrastructure.predict_log.get(

0 commit comments

Comments
 (0)