From 944c1e7312e44e69415b4f8c05df9468a2ba1087 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 1/3] add IaC for Llama2 Azure OnlineEndpoints --- infra/README.md | 29 ++- infra/{ => app}/azure/arm/appservice.json | 0 .../azure/arm/appservice.parameters.json | 0 infra/{ => app}/azure/arm/deploy.sh | 0 infra/{ => app}/azure/arm/destroy.sh | 2 - infra/inference/azure/arm/deploy.sh | 194 ++++++++++++++++++ infra/inference/azure/arm/destroy.sh | 32 +++ .../azure/arm/environment-version.json | 51 +++++ .../azure/arm/model-1/code-version.json | 56 +++++ .../environment/conda-managedidentity.yaml | 14 ++ .../azure/arm/model-1/environment/conda.yaml | 13 ++ .../model/sklearn_regression_model.pkl | Bin 0 -> 756 bytes .../azure/arm/model-1/onlinescoring/score.py | 36 ++++ .../onlinescoring/score_managedidentity.py | 107 ++++++++++ .../azure/arm/model-1/sample-request.json | 4 + infra/inference/azure/arm/model-version.json | 53 +++++ .../azure/arm/online-endpoint-deployment.json | 107 ++++++++++ ...online-endpoint-deployment.parameters.json | 5 + .../inference/azure/arm/online-endpoint.json | 73 +++++++ infra/inference/azure/arm/sample-request.json | 4 + infra/inference/azure/arm/test-endpoint.sh | 81 ++++++++ infra/inference/azure/cli/deploy.sh | 10 + infra/inference/azure/cli/destroy.sh | 14 ++ .../online-deployment-llama2-7b-chat-v8.yaml | 8 + infra/inference/azure/cli/update.sh | 8 + source/docq/support/llms/azure_ml.py | 158 ++++++++++++++ 26 files changed, 1050 insertions(+), 9 deletions(-) rename infra/{ => app}/azure/arm/appservice.json (100%) rename infra/{ => app}/azure/arm/appservice.parameters.json (100%) rename infra/{ => app}/azure/arm/deploy.sh (100%) rename infra/{ => app}/azure/arm/destroy.sh (99%) create mode 100755 infra/inference/azure/arm/deploy.sh create mode 100755 infra/inference/azure/arm/destroy.sh create mode 100644 infra/inference/azure/arm/environment-version.json create mode 100644 infra/inference/azure/arm/model-1/code-version.json create mode 100644 infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml create mode 100644 infra/inference/azure/arm/model-1/environment/conda.yaml create mode 100644 infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl create mode 100644 infra/inference/azure/arm/model-1/onlinescoring/score.py create mode 100644 infra/inference/azure/arm/model-1/onlinescoring/score_managedidentity.py create mode 100644 infra/inference/azure/arm/model-1/sample-request.json create mode 100644 infra/inference/azure/arm/model-version.json create mode 100644 infra/inference/azure/arm/online-endpoint-deployment.json create mode 100644 infra/inference/azure/arm/online-endpoint-deployment.parameters.json create mode 100644 infra/inference/azure/arm/online-endpoint.json create mode 100644 infra/inference/azure/arm/sample-request.json create mode 100755 infra/inference/azure/arm/test-endpoint.sh create mode 100755 infra/inference/azure/cli/deploy.sh create mode 100755 infra/inference/azure/cli/destroy.sh create mode 100644 infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml create mode 100755 infra/inference/azure/cli/update.sh create mode 100644 source/docq/support/llms/azure_ml.py diff --git a/infra/README.md b/infra/README.md index 59eab12c..74eec0b2 100644 --- a/infra/README.md +++ b/infra/README.md @@ -1,24 +1,39 @@ # Infra-as-Code setup for Docq.AI hosting +- `app` folder - IaC for hosting the Docq.AI app on various cloud providers. 
+- `inference` folder - IaC for hosting the ML models that the Docq app uses for inference.
+
+## Install Azure CLI
+
+- Install the core Azure CLI: `brew install azure-cli`
+- Install the Azure ML CLI v2 (remove v1 if it exists, then install v2)
+
+  ```terminal
+  az extension remove --name ml
+
+  az extension add --name ml --yes
+  ```
+
## Azure ARM Templates

These docs are mainly for contributing to and developing the various deployment methods available in the `/infra` folder. We recommend users start with the installation instructions laid out in the user guide on the main docs site. But feel free to get your hands dirty.

-The ARM template in `/infra/azure/arm` powers the whizard based one-click deploy method described in the main docs.
+The ARM template in `/infra/azure/arm` powers the wizard-based one-click deploy method described in the main docs.

### Deploy and destroy scripts

-There two scripts combine several azure CLI commands to for convinience.
+These two scripts combine several Azure CLI commands for convenience.

-Running `./deploy.sh` is the easiest way to test when interating on the template.
+Running `./deploy.sh` is the easiest way to test when iterating on the template.

- `./deploy.sh ` - args are optional. Creates a resource group and deploys the ARM template based on several defaults. Inspect the script to discover the defaults and available parameters. Params can be overridden by passing argument values in order.
-- `./destroy.sh ` - args are optional. Destroys the resource group and all resources within. Handles purging all Congnitive Services in the resource group that are deleted.
+- `./destroy.sh ` - args are optional. Destroys the resource group and all resources within it. Handles purging all Cognitive Services in the resource group that are deleted.

### Useful CLI commands

If using the scripts above you shouldn't need these, but occasionally they might help when troubleshooting.

+- Authenticate - `az login`
- Create resource group CLI - `az group create --name docq-rg-westeurope --location westeurope`
- Deploy template CLI - `az deployment group create --resource-group docq-rg-westeurope --name docq1 --template-file appservice.json`
- Delete resources in resource group - `az group delete --name docq-rg-westeurope`
@@ -30,19 +45,19 @@ If using the scripts above you shouldn't need these but occasionally you they mi

See the `models` tab in Azure AI Studio for models available to the specific Azure account, along with version numbers.

-See API ref for detials on avail options
+See the API reference for details on the available options.

Explanation about models

### Testing the template

-- Run `./deploy.sh` to test the template deploys all resources sucessfully.
+- Run `./deploy.sh` to test that the template deploys all resources successfully.
- Navigate to the app URL for this instance. Verify the app is working as expected.
- Test the template deployment method aka one-click deploy. This is important as sometimes what works when deploying from the CLI doesn't work in template deployment.
  - Push the template change to your branch (origin) such that it's publicly available.
  - Copy the 'raw' URL for the template file: navigate to the file on GitHub.com and click the 'raw' button in the top right area.
  - URL encode the GitHub raw URL.
- - Trigger Azure template deployment by navigaiting to `https://portal.azure.com/#create/Microsoft.Template/uri/` + - Trigger Azure template deployment by navigating to `https://portal.azure.com/#create/Microsoft.Template/uri/` Example: The URL on main ## AWS diff --git a/infra/azure/arm/appservice.json b/infra/app/azure/arm/appservice.json similarity index 100% rename from infra/azure/arm/appservice.json rename to infra/app/azure/arm/appservice.json diff --git a/infra/azure/arm/appservice.parameters.json b/infra/app/azure/arm/appservice.parameters.json similarity index 100% rename from infra/azure/arm/appservice.parameters.json rename to infra/app/azure/arm/appservice.parameters.json diff --git a/infra/azure/arm/deploy.sh b/infra/app/azure/arm/deploy.sh similarity index 100% rename from infra/azure/arm/deploy.sh rename to infra/app/azure/arm/deploy.sh diff --git a/infra/azure/arm/destroy.sh b/infra/app/azure/arm/destroy.sh similarity index 99% rename from infra/azure/arm/destroy.sh rename to infra/app/azure/arm/destroy.sh index 70418c5a..01db564d 100755 --- a/infra/azure/arm/destroy.sh +++ b/infra/app/azure/arm/destroy.sh @@ -4,8 +4,6 @@ LOCATION="${2:-"westeurope"}" RESOURCE_GROUP="${3:-${NAME}-rg-${LOCATION}}" - - read -p "This will delete all resources in resource group '${RESOURCE_GROUP}'. Are you sure? [y/n]" confirm if [ $confirm = "y" ] || [ $confirm = "Y" ] diff --git a/infra/inference/azure/arm/deploy.sh b/infra/inference/azure/arm/deploy.sh new file mode 100755 index 00000000..249e7ea2 --- /dev/null +++ b/infra/inference/azure/arm/deploy.sh @@ -0,0 +1,194 @@ +set -x + +NAME="${1:-"docq"}" + +LOCATION="${2:-"westeurope"}" + +RESOURCE_GROUP="${NAME}-ml-rg-${LOCATION}" + + +res1=$(az group create --name $RESOURCE_GROUP --location $LOCATION) + + +WORKSPACE="${NAME}-main-ws-${LOCATION}" + +az ml workspace create --name $WORKSPACE --resource-group $RESOURCE_GROUP --location $LOCATION +wait + +# +TOKEN=$(az account get-access-token --query accessToken -o tsv) +# + +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +#LOCATION=$(az ml workspace show --query location -o tsv) +#RESOURCE_GROUP=$(az group show --query name -o tsv) +#WORKSPACE=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +# + +# +#export ENDPOINT_NAME=endpoint-`echo $RANDOM` +export ENDPOINT_NAME="${NAME}-endpoint" +# + +# +API_VERSION="2022-05-01" +# + +echo -e "Using:\nSUBSCRIPTION_ID=$SUBSCRIPTION_ID\nLOCATION=$LOCATION\nRESOURCE_GROUP=$RESOURCE_GROUP\nWORKSPACE=$WORKSPACE" + +# define how to wait +wait_for_completion () { + operation_id=$1 + status="unknown" + + if [[ $operation_id == "" || -z $operation_id || $operation_id == "null" ]]; then + echo "operation id cannot be empty" + exit 1 + fi + + while [[ $status != "Succeeded" && $status != "Failed" ]] + do + echo "Getting operation status from: $operation_id" + operation_result=$(curl --location --request GET $operation_id --header "Authorization: Bearer $TOKEN") + # TODO error handling here + status=$(echo $operation_result | jq -r '.status') + echo "Current operation status: $status" + sleep 5 + done + + if [[ $status == "Failed" ]] + then + error=$(echo $operation_result | jq -r '.error') + echo "Error: $error" + fi +} + +# # +# # Get values for storage account +# response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/datastores?api-version=$API_VERSION&isDefault=true" \ +# --header "Authorization: 
Bearer $TOKEN")
+# AZUREML_DEFAULT_DATASTORE=$(echo $response | jq -r '.value[0].name')
+# AZUREML_DEFAULT_CONTAINER=$(echo $response | jq -r '.value[0].properties.containerName')
+# export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.accountName')
+# # 
+
+# # 
+# az storage blob upload-batch -d $AZUREML_DEFAULT_CONTAINER/score -s model-1/onlinescoring --account-name $AZURE_STORAGE_ACCOUNT
+# # 
+
+# # 
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file code-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   codeAssetName="score-sklearn" \
+#   codeUri="https://$AZURE_STORAGE_ACCOUNT.blob.core.windows.net/$AZUREML_DEFAULT_CONTAINER/score"
+# # 
+
+# # 
+# az storage blob upload-batch -d $AZUREML_DEFAULT_CONTAINER/model -s model-1/model --account-name $AZURE_STORAGE_ACCOUNT
+# # 
+
+# # 
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file model-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   modelAssetVersion=6 \
+#   modelAssetName="Llama-2-13b-chat" \
+#   modelUri="azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/6"
+# # 
+
+# # 
+# CONDA_FILE=$(cat model-1/environment/conda.yaml)
+# # 
+
+# # 
+# ENV_VERSION=$RANDOM
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file environment-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   environmentAssetName=sklearn-env \
+#   environmentAssetVersion=$ENV_VERSION \
+#   dockerImage=mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 \
+#   condaFile="$CONDA_FILE"
+# # 
+
+# 
+az deployment group create -g $RESOURCE_GROUP \
+  --template-file online-endpoint.json \
+  --parameters \
+  workspaceName=$WORKSPACE \
+  onlineEndpointName=$ENDPOINT_NAME \
+  identityType=SystemAssigned \
+  authMode=AMLToken \
+  location=$LOCATION
+# 
+
+# 
+response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME?api-version=$API_VERSION" \
+--header "Content-Type: application/json" \
+--header "Authorization: Bearer $TOKEN")
+
+operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri')
+wait_for_completion $operation_id
+# 
+
+LLAMA2_7B_CHAT="azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/8"
+SATVIKAG_CHATBOT="azureml://registries/HuggingFace/models/satvikag-chatbot/versions/3"
+SELECTED_MODEL=$LLAMA2_7B_CHAT
+MODEL_NAME="llama2-7b-chat-8"
+
+# 
+resourceScope="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices"
+az deployment group create -g $RESOURCE_GROUP \
+  --template-file online-endpoint-deployment.json \
+  --parameters \
+  workspaceName=$WORKSPACE \
+  location=$LOCATION \
+  onlineEndpointName=$ENDPOINT_NAME \
+  onlineDeploymentName=$MODEL_NAME \
+  model=$SELECTED_MODEL \
+  endpointComputeType="Managed" \
+  skuName="Standard_NC12s_v3" \
+  skuCapacity=1
+  # 
+
+# 
+response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/$MODEL_NAME?api-version=$API_VERSION" \
+--header "Content-Type: application/json" \
+--header "Authorization: Bearer $TOKEN")
+
+operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri')
+wait_for_completion $operation_id
+
+scoringUri=$(echo $response | jq -r '.properties.scoringUri')
+# 
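+
+# NOTE: sample-request.json must match the input schema of the selected model.
+# For the Llama-2 chat models in the azureml-meta registry the payload is
+# typically shaped like the following (illustrative only - verify against the
+# model card in Azure AI Studio):
+# {
+#   "input_data": {
+#     "input_string": [{"role": "user", "content": "What is Docq?"}],
+#     "parameters": {"temperature": 0.1, "max_new_tokens": 200}
+#   }
+# }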
+
+# 
+response=$(curl -H "Content-Length: 0" --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/token?api-version=$API_VERSION" \
+--header "Authorization: Bearer $TOKEN")
+accessToken=$(echo $response | jq -r '.accessToken')
+# 
+
+# 
+curl --location --request POST $scoringUri \
+  --header "Authorization: Bearer $accessToken" \
+  --header "Content-Type: application/json" \
+  --data-raw @sample-request.json
+# 
+
+# 
+curl --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/$MODEL_NAME/getLogs?api-version=$API_VERSION" \
+  --header "Authorization: Bearer $TOKEN" \
+  --header "Content-Type: application/json" \
+  --data-raw "{ \"tail\": 100 }"
+# 
+
+# # 
+# curl --location --request DELETE "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME?api-version=$API_VERSION" \
+#   --header "Content-Type: application/json" \
+#   --header "Authorization: Bearer $TOKEN" || true
+# # 
diff --git a/infra/inference/azure/arm/destroy.sh b/infra/inference/azure/arm/destroy.sh
new file mode 100755
index 00000000..4dd619ad
--- /dev/null
+++ b/infra/inference/azure/arm/destroy.sh
@@ -0,0 +1,32 @@
+NAME="${1:-"docq"}"
+
+LOCATION="${2:-"westeurope"}"
+
+RESOURCE_GROUP="${3:-${NAME}-ml-rg-${LOCATION}}"
+
+read -p "This will delete all resources in resource group '${RESOURCE_GROUP}'. Are you sure? [y/n] " confirm
+
+if [ $confirm = "y" ] || [ $confirm = "Y" ]
+then
+
+  workspaces=($(az ml workspace list --resource-group $RESOURCE_GROUP --query [].name --output tsv))
+  echo "Deleting ${#workspaces[@]} Azure ML Workspaces"
+
+  for workspace in "${workspaces[@]}"
+  do
+    az ml workspace delete --name $workspace --resource-group $RESOURCE_GROUP --permanently-delete --all-resources --yes
+    echo "'${workspace}' deleted."
+  done
+  wait
+
+  res=$(az group delete --name ${RESOURCE_GROUP} -y)
+  echo $res | jq '.'
+  wait
+
+  echo "Success! All resources in resource group '${RESOURCE_GROUP}' were deleted."
+  exit 0
+elif [ $confirm = "n" ] || [ $confirm = "N" ]
+then
+  echo "Aborted! Nothing was destroyed."
+  exit 1
+fi
\ No newline at end of file
diff --git a/infra/inference/azure/arm/environment-version.json b/infra/inference/azure/arm/environment-version.json
new file mode 100644
index 00000000..a9f2e3ca
--- /dev/null
+++ b/infra/inference/azure/arm/environment-version.json
@@ -0,0 +1,51 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+  "contentVersion": "1.0.0.0",
+  "parameters": {
+    "workspaceName": {
+      "type": "string"
+    },
+    "environmentAssetName": {
+      "type": "string"
+    },
+    "environmentAssetVersion": {
+      "defaultValue": "1",
+      "type": "string"
+    },
+    "environmentDescription": {
+      "defaultValue": "This is a test description for an environment created from an ARM template",
+      "type": "string"
+    },
+    "condaFile": {
+      "defaultValue": "",
+      "type": "string",
+      "metadata": {
+        "description": "Standard configuration file used by Conda that lets you install any kind of package, including Python, R, and C/C++ packages."
+ } + }, + "isAnonymous": { + "defaultValue": false, + "type": "bool" + }, + "dockerImage": { + "defaultValue": "", + "type": "string", + "metadata": { + "description": "Docker image path, for example: 'docker.io/tensorflow/serving:latest'." + } + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/environments/versions", + "apiVersion": "2022-05-01", + "name": "[concat(parameters('workspaceName'), '/', parameters('environmentAssetName'), '/', parameters('environmentAssetVersion'))]", + "properties": { + "isAnonymous": "[parameters('isAnonymous')]", + "description": "[parameters('environmentDescription')]", + "image": "[parameters('dockerImage')]", + "condaFile": "[parameters('condaFile')]" + } + } + ] +} \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/code-version.json b/infra/inference/azure/arm/model-1/code-version.json new file mode 100644 index 00000000..b54c9f4a --- /dev/null +++ b/infra/inference/azure/arm/model-1/code-version.json @@ -0,0 +1,56 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "workspaceName": { + "type": "string", + "metadata": { + "description": "Specifies the name of the Azure Machine Learning Workspace which will contain this compute." + } + }, + "codeAssetName": { + "type": "string", + "metadata": { + "description": "Specifies the name of the Azure Machine Learning code asset which will be created or updated." + } + }, + "codeAssetVersion": { + "defaultValue": "1", + "type": "string", + "metadata": { + "description": "Specifies the version of the Azure Machine Learning code asset which will be created or updated." + } + }, + "codeUri": { + "type": "string", + "metadata": { + "description": "Specifies the location of the Azure Machine Learning code asset in a storage account." + } + }, + "codeAssetDescription": { + "defaultValue": "This is a test description for a code asset created by an ARM template", + "type": "string" + }, + "isAnonymous": { + "defaultValue": false, + "type": "bool", + "metadata": { + "description": "If the name version are system generated (anonymous registration)." 
+ } + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/codes/versions", + "apiVersion": "2022-05-01", + "name": "[concat(parameters('workspaceName'), '/', parameters('codeAssetName'), '/', parameters('codeAssetVersion'))]", + "properties": { + "description": "[parameters('codeAssetDescription')]", + "codeUri": "[parameters('codeUri')]", + "isAnonymous": "[parameters('isAnonymous')]", + "properties": {}, + "tags": {} + } + } + ] +} \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml b/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml new file mode 100644 index 00000000..c6372ca3 --- /dev/null +++ b/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml @@ -0,0 +1,14 @@ +name: model-env +channels: + - conda-forge +dependencies: + - python=3.7 + - numpy=1.21.2 + - pip=21.2.4 + - scikit-learn=0.24.2 + - scipy=1.7.1 + - pip: + - azureml-defaults==1.38.0 + - joblib==1.0.1 + - azure-storage-blob==12.11 + - azure-identity==1.7 diff --git a/infra/inference/azure/arm/model-1/environment/conda.yaml b/infra/inference/azure/arm/model-1/environment/conda.yaml new file mode 100644 index 00000000..426b6146 --- /dev/null +++ b/infra/inference/azure/arm/model-1/environment/conda.yaml @@ -0,0 +1,13 @@ +name: model-env +channels: + - conda-forge +dependencies: + - python=3.9 + - numpy=1.23.5 + - pip=23.0.1 + - scikit-learn=1.2.2 + - scipy=1.10.1 + - pip: + - azureml-defaults==1.49.0 + - inference-schema[numpy-support]==1.5.1 + - joblib==1.2.0 \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl b/infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..edb4ffa7d6bc0435b2597b9d2782616848ab86c6 GIT binary patch literal 756 zcmX|9O-K}B7~a*ib!yGOTCGg`UsHy#1Sw(0qz-K^xFSsyeay^$yWhC8GxN@mrLtv* zbXV}&Av#JO3<`>Z#E=MzehwjZswhm-h@cJ?9kOper8&&-KJz}$`@BCR#j2Chv1}W> zt{L=;vSt$NJW^x_b0-*-z-b#hj%FGh1ez{EFb73ZDI5p{Qksi2!^O6WEfE$rPjH)A9UKbJb2QUEeqQmaMb0x9ITJB8jh!^y6iX* zRvD2qlCq!9K{*q5h-_il5aTXG1p`bwiEXHqg0ylMR52{+4$`FRRRtDtP$L!5lvF!x zEab;vQvxEEnMWgys2{PFI{!OkcNlXAiaw~JJgDpE}%#P6wZ zvVXj#cUgG4)N@$;@IzR5^Wsms?Yr=Ex?$+pjc>wm(et;D3QIyw&-_HJdr@fa`|`1_ z_OtN#$gjU+Q=fzh(Yn&)z7np^kJo>N`vRNW|3gEtn=sq`?N1JPwW +TOKEN=$(az account get-access-token --query accessToken -o tsv) +# + +API_VERSION="2022-05-01" + +WORKSPACE="${NAME}-main-ws-${LOCATION}" + +ENDPOINT_NAME="endpoint-10252" + +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +# + +echo -e "Using:\nSUBSCRIPTION_ID=$SUBSCRIPTION_ID\nLOCATION=$LOCATION\nRESOURCE_GROUP=$RESOURCE_GROUP\nWORKSPACE=$WORKSPACE" + +wait_for_completion () { + operation_id=$1 + status="unknown" + + if [[ $operation_id == "" || -z $operation_id || $operation_id == "null" ]]; then + echo "operation id cannot be empty" + exit 1 + fi + + while [[ $status != "Succeeded" && $status != "Failed" ]] + do + echo "Getting operation status from: $operation_id" + operation_result=$(curl --location --request GET $operation_id --header "Authorization: Bearer $TOKEN") + # TODO error handling here + status=$(echo $operation_result | jq -r '.status') + echo "Current operation status: $status" + sleep 5 + done + + if [[ $status == "Failed" ]] + then + error=$(echo $operation_result | jq -r '.error') + echo "Error: $error" + fi +} + +# # +# response=$(curl --location --request GET 
"https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/blue?api-version=$API_VERSION" \ +# --header "Content-Type: application/json" \ +# --header "Authorization: Bearer $TOKEN") + +# operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri') +# wait_for_completion $operation_id + +# scoringUri=$(echo $response | jq -r '.properties.scoringUri') +# # + + +# # +# response=$(curl -H "Content-Length: 0" --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/token?api-version=$API_VERSION" \ +# --header "Authorization: Bearer $TOKEN") +# accessToken=$(echo $response | jq -r '.accessToken') +# # + +# # +# curl --location --request POST $scoringUri \ +# --header "Authorization: Bearer $accessToken" \ +# --header "Content-Type: application/json" \ +# --data-raw @sample-request.json +# # + +# # +# curl --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/blue/getLogs?api-version=$API_VERSION" \ +# --header "Authorization: Bearer $TOKEN" \ +# --header "Content-Type: application/json" \ +# --data-raw "{ \"tail\": 100 }" + +az ml online-endpoint invoke --name $ENDPOINT_NAME --workspace-name $WORKSPACE --resource-group $RESOURCE_GROUP --request-file model-1/sample-request.json \ No newline at end of file diff --git a/infra/inference/azure/cli/deploy.sh b/infra/inference/azure/cli/deploy.sh new file mode 100755 index 00000000..23f34cb6 --- /dev/null +++ b/infra/inference/azure/cli/deploy.sh @@ -0,0 +1,10 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +LOCAL="${3:-false}" +az ml online-deployment create \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ + --file $FILE \ + --local $LOCAL \ No newline at end of file diff --git a/infra/inference/azure/cli/destroy.sh b/infra/inference/azure/cli/destroy.sh new file mode 100755 index 00000000..01a6d82c --- /dev/null +++ b/infra/inference/azure/cli/destroy.sh @@ -0,0 +1,14 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +az ml online-deployment update \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME + --set traffic=0 + +az ml online-deployment delete \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ No newline at end of file diff --git a/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml b/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml new file mode 100644 index 00000000..c17b93c5 --- /dev/null +++ b/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml @@ -0,0 +1,8 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +model: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/8 +instance_type: Standard_NC12s_v3 +instance_count: 1 +request_settings: + 
request_timeout_ms: 90000 + max_concurrent_requests_per_instance: 1 + max_queue_wait_ms: 1000 diff --git a/infra/inference/azure/cli/update.sh b/infra/inference/azure/cli/update.sh new file mode 100755 index 00000000..6ef63277 --- /dev/null +++ b/infra/inference/azure/cli/update.sh @@ -0,0 +1,8 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +az ml online-deployment update \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ + --file $FILE \ No newline at end of file diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py new file mode 100644 index 00000000..97718918 --- /dev/null +++ b/source/docq/support/llms/azure_ml.py @@ -0,0 +1,158 @@ +"""Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" + +from typing import Any, Dict, Optional, Sequence + +from llama_index.callbacks import CallbackManager +from llama_index.constants import DEFAULT_NUM_OUTPUTS +from llama_index.llms.base import ( + ChatMessage, + ChatResponse, + ChatResponseGen, + CompletionResponse, + CompletionResponseGen, + LLMMetadata, + llm_chat_callback, + llm_completion_callback, +) +from llama_index.llms.custom import CustomLLM +from llama_index.llms.generic_utils import chat_to_completion_decorator +from llama_index.llms.openai_utils import ( + from_openai_message_dict, + to_openai_message_dicts, +) + + +class AzureML(CustomLLM): + """Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" + + def __init__( + self, + endpoint_url: str, + api_key: str, + model: str = "llama-13b-chat", + temperature: float = 0.1, + max_length: int = 200, + max_tokens: int = 200, + top_p: float = 0.9, + do_sample: bool = True, + additional_kwargs: Optional[Dict[str, Any]] = None, + callback_manager: Optional[CallbackManager] = None, + ) -> None: + try: + from llamaapi import LlamaAPI as Client + except ImportError as e: + raise ImportError("llama_api not installed." 
"Please install it with `pip install llamaapi`.") from e + + self._client = Client(api_key) + self._model = model + self._temperature = temperature + self._max_tokens = max_tokens + self._additional_kwargs = additional_kwargs or {} + self.callback_manager = callback_manager or CallbackManager([]) + + @property + def _model_kwargs(self) -> Dict[str, Any]: + base_kwargs = { + "model": self._model, + "temperature": self._temperature, + "max_length": self._max_tokens, + } + model_kwargs = { + **base_kwargs, + **self._additional_kwargs, + } + return model_kwargs + + @property + def metadata(self) -> LLMMetadata: + return LLMMetadata( + context_window=4096, + num_output=DEFAULT_NUM_OUTPUTS, + is_chat_model=True, + is_function_calling_model=True, + model_name="llama-api", + ) + + @llm_chat_callback() + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + message_dicts = to_openai_message_dicts(messages) + json_dict = { + "messages": message_dicts, + **self._model_kwargs, + **kwargs, + } + response = self._client.run(json_dict).json() + message_dict = response["choices"][0]["message"] + message = from_openai_message_dict(message_dict) + + return ChatResponse(message=message, raw=response) + + @llm_completion_callback() + def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: + complete_fn = chat_to_completion_decorator(self.chat) + return complete_fn(prompt, **kwargs) + + @llm_completion_callback() + def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + raise NotImplementedError("stream_complete is not supported for LlamaAPI") + + @llm_chat_callback() + def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: + raise NotImplementedError("stream_chat is not supported for LlamaAPI") + + def _allow_self_signed_https(self, allowed: bool) -> None: + """Bypass the server certificate verification on client side if using self-signed certificate in your scoring service aka Azure Online endpoint. + + You would only use a self-signed certificate when running a local endpoint for dev and testing purposes. + + Args: + allowed (bool): Whether to allow self-signed certificates. + """ + import os + import ssl + + if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None): + ssl._create_default_https_context = ssl._create_unverified_context + + def _azureml_request(self, input: str) -> None: + import json + import urllib.request + + # _allow_self_signed_https(false) # this line is needed if you use self-signed certificate in your scoring service. + + # Request data goes here + # The example below assumes JSON formatting which may be updated + # depending on the format your endpoint expects. + # More information can be found here: + # https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script + data = {} + + body = str.encode(json.dumps(data)) + + url = "https://docq-endpoint.eastus.inference.ml.azure.com/score" + # Replace this with the primary/secondary key or AMLToken for the endpoint + api_key = "" + if not api_key: + raise Exception("A key should be provided to invoke the endpoint") + + # The azureml-model-deployment header will force the request to go to a specific deployment. 
+ # Remove this header to have the request observe the endpoint traffic rules + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + api_key), + "azureml-model-deployment": "llama2-7b-chat-8", + } + + req = urllib.request.Request(url, body, headers) + + try: + response = urllib.request.urlopen(req) + + result = response.read() + print(result) + except urllib.error.HTTPError as error: + print("The request failed with status code: " + str(error.code)) + + # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure + print(error.info()) + print(error.read().decode("utf8", "ignore")) From fe8f5afbce0172372bd3a96f7b6103d7ce0ad5c3 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 2/3] update: CustomLLM class --- source/docq/support/llms/azure_ml.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py index 97718918..22a0f16c 100644 --- a/source/docq/support/llms/azure_ml.py +++ b/source/docq/support/llms/azure_ml.py @@ -38,12 +38,12 @@ def __init__( additional_kwargs: Optional[Dict[str, Any]] = None, callback_manager: Optional[CallbackManager] = None, ) -> None: - try: - from llamaapi import LlamaAPI as Client - except ImportError as e: - raise ImportError("llama_api not installed." "Please install it with `pip install llamaapi`.") from e + # try: + # from llamaapi import LlamaAPI as Client + # except ImportError as e: + # raise ImportError("llama_api not installed." "Please install it with `pip install llamaapi`.") from e - self._client = Client(api_key) + # self._client = Client(api_key) self._model = model self._temperature = temperature self._max_tokens = max_tokens @@ -56,6 +56,9 @@ def _model_kwargs(self) -> Dict[str, Any]: "model": self._model, "temperature": self._temperature, "max_length": self._max_tokens, + "top_p": self._top_p, + "do_sample": self._do_sample, + "max_new_tokens": self._max_tokens, } model_kwargs = { **base_kwargs, @@ -100,6 +103,15 @@ def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: raise NotImplementedError("stream_chat is not supported for LlamaAPI") + +class AzureMLOnlineEndpoint: + """Client for Azure ML Online Endpoint.""" + + def __init__(self, endpoint_url: str, api_key: str, allow_self_signed_https: bool = False) -> None: + self._endpoint_url = endpoint_url + self._api_key = api_key + self._allow_self_signed_https(allow_self_signed_https) + def _allow_self_signed_https(self, allowed: bool) -> None: """Bypass the server certificate verification on client side if using self-signed certificate in your scoring service aka Azure Online endpoint. 
@@ -114,7 +126,8 @@ def _allow_self_signed_https(self, allowed: bool) -> None: if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None): ssl._create_default_https_context = ssl._create_unverified_context - def _azureml_request(self, input: str) -> None: + def run(self, input: str) -> str: + """Run the model on the input.""" import json import urllib.request From eb2b48a2d1d68ba485e6c63f50ebc2cb7f780196 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 3/3] refactor: use methods in LlamaIndex generic utils --- source/docq/support/llms/azure_ml.py | 124 +++++++++++++++++---------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py index 22a0f16c..8f181023 100644 --- a/source/docq/support/llms/azure_ml.py +++ b/source/docq/support/llms/azure_ml.py @@ -1,6 +1,6 @@ """Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" -from typing import Any, Dict, Optional, Sequence +from typing import Any, Callable, Dict, Optional, Sequence from llama_index.callbacks import CallbackManager from llama_index.constants import DEFAULT_NUM_OUTPUTS @@ -15,10 +15,9 @@ llm_completion_callback, ) from llama_index.llms.custom import CustomLLM -from llama_index.llms.generic_utils import chat_to_completion_decorator -from llama_index.llms.openai_utils import ( - from_openai_message_dict, - to_openai_message_dicts, +from llama_index.llms.generic_utils import completion_response_to_chat_response +from llama_index.llms.generic_utils import ( + messages_to_prompt as generic_messages_to_prompt, ) @@ -31,34 +30,38 @@ def __init__( api_key: str, model: str = "llama-13b-chat", temperature: float = 0.1, + max_new_tokens: int = 200, max_length: int = 200, - max_tokens: int = 200, top_p: float = 0.9, do_sample: bool = True, + model_deployment_name: Optional[str] = None, additional_kwargs: Optional[Dict[str, Any]] = None, + messages_to_prompt: Optional[Callable] = None, callback_manager: Optional[CallbackManager] = None, ) -> None: - # try: - # from llamaapi import LlamaAPI as Client - # except ImportError as e: - # raise ImportError("llama_api not installed." 
"Please install it with `pip install llamaapi`.") from e - - # self._client = Client(api_key) + """Initialize the LLM.""" + self._client = AzureMLOnlineEndpoint( + endpoint_url=endpoint_url, api_key=api_key, model_deployment_name=model_deployment_name + ) # Client(api_key) self._model = model self._temperature = temperature - self._max_tokens = max_tokens + self._max_new_tokens = max_new_tokens + self._max_length = max_length + self._top_p = top_p + self._do_sample = do_sample self._additional_kwargs = additional_kwargs or {} self.callback_manager = callback_manager or CallbackManager([]) + self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt @property def _model_kwargs(self) -> Dict[str, Any]: base_kwargs = { "model": self._model, "temperature": self._temperature, - "max_length": self._max_tokens, + "max_length": self._max_new_tokens, "top_p": self._top_p, "do_sample": self._do_sample, - "max_new_tokens": self._max_tokens, + "max_new_tokens": self._max_new_tokens, } model_kwargs = { **base_kwargs, @@ -68,6 +71,7 @@ def _model_kwargs(self) -> Dict[str, Any]: @property def metadata(self) -> LLMMetadata: + """Get the metadata for the LLM.""" return LLMMetadata( context_window=4096, num_output=DEFAULT_NUM_OUTPUTS, @@ -78,38 +82,61 @@ def metadata(self) -> LLMMetadata: @llm_chat_callback() def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: - message_dicts = to_openai_message_dicts(messages) - json_dict = { - "messages": message_dicts, - **self._model_kwargs, - **kwargs, - } - response = self._client.run(json_dict).json() - message_dict = response["choices"][0]["message"] - message = from_openai_message_dict(message_dict) - - return ChatResponse(message=message, raw=response) + """Chat with the LLM.""" + prompt = self._messages_to_prompt(messages) + completion_response = self.complete(prompt, **kwargs) + return completion_response_to_chat_response(completion_response) @llm_completion_callback() def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: - complete_fn = chat_to_completion_decorator(self.chat) - return complete_fn(prompt, **kwargs) + """Complete the prompt with the LLM.""" + # self._generate_kwargs.update({"stream": False}) + + input_json_dict = { + "input_data": { + "input_string": prompt, + "parameters": { + **self._model_kwargs, + **kwargs, + }, + } + } + response = self._client.run(input_json_dict) + + return CompletionResponse(text=response["output"], raw=response) @llm_completion_callback() def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + """Stream completion of the prompt with the LLM.""" raise NotImplementedError("stream_complete is not supported for LlamaAPI") @llm_chat_callback() def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: + """Stream chat with the LLM.""" raise NotImplementedError("stream_chat is not supported for LlamaAPI") class AzureMLOnlineEndpoint: - """Client for Azure ML Online Endpoint.""" + """Web API Client for interacting with an model(s) hosted by an Azure ML Online Endpoint.""" - def __init__(self, endpoint_url: str, api_key: str, allow_self_signed_https: bool = False) -> None: + def __init__( + self, + endpoint_url: str, + api_key: str, + model_deployment_name: Optional[str] = None, + allow_self_signed_https: Optional[bool] = False, + ) -> None: + """Initialize the client. + + Args: + endpoint_url (str): The AzureML Online endpoint URL. + api_key (str): The API key. Primary/secondary key or AMLToken can be used. 
+            allow_self_signed_https (bool, optional): Whether to allow self-signed certificates. Defaults to `False`.
+            model_deployment_name (str, optional): The model deployment name. Used to override server-side deployment routing rules. Defaults to `None`.
+        """
         self._endpoint_url = endpoint_url
         self._api_key = api_key
+        self._model_deployment_name = model_deployment_name
         self._allow_self_signed_https(allow_self_signed_https)

     def _allow_self_signed_https(self, allowed: bool) -> None:
@@ -126,35 +153,38 @@ def _allow_self_signed_https(self, allowed: bool) -> None:
         if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None):
             ssl._create_default_https_context = ssl._create_unverified_context

-    def run(self, input: str) -> str:
-        """Run the model on the input."""
+    def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the model on the input.
+
+        Args:
+            input_data (Dict[str, Any]): The prompt input data. The format depends on what the model expects. See the Azure ML Studio Model registry for examples.
+
+        Returns:
+            Dict[str, Any]: The parsed JSON response from the model, as consumed by `AzureML.complete`.
+        """
         import json
         import urllib.request

         # _allow_self_signed_https(false) # this line is needed if you use self-signed certificate in your scoring service.

-        # Request data goes here
-        # The example below assumes JSON formatting which may be updated
-        # depending on the format your endpoint expects.
-        # More information can be found here:
-        # https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
-        data = {}
+        body = str.encode(json.dumps(input_data))

-        body = str.encode(json.dumps(data))
+        url = self._endpoint_url  # e.g. "https://docq-endpoint.eastus.inference.ml.azure.com/score"

-        url = "https://docq-endpoint.eastus.inference.ml.azure.com/score"
-        # Replace this with the primary/secondary key or AMLToken for the endpoint
-        api_key = ""
+        api_key = self._api_key
         if not api_key:
-            raise Exception("A key should be provided to invoke the endpoint")
+            raise Exception(
+                "Missing API key. One should be provided to invoke the endpoint. A primary/secondary key or an AMLToken can be used."
+            )

-        # The azureml-model-deployment header will force the request to go to a specific deployment.
-        # The value "llama2-7b-chat-8" below matches the onlineDeploymentName that
-        # infra/inference/azure/arm/deploy.sh creates (MODEL_NAME).
-        # Remove this header to have the request observe the endpoint traffic rules
         headers = {
             "Content-Type": "application/json",
             "Authorization": ("Bearer " + api_key),
-            "azureml-model-deployment": "llama2-7b-chat-8",
         }
+        # The azureml-model-deployment header will force the request to go to a specific deployment.
+        # When None, requests observe the endpoint traffic rules.
+        if self._model_deployment_name is not None:
+            headers["azureml-model-deployment"] = self._model_deployment_name

         req = urllib.request.Request(url, body, headers)

         try:
             response = urllib.request.urlopen(req)

             result = response.read()
-            print(result)
+            return json.loads(result)
         except urllib.error.HTTPError as error:
             print("The request failed with status code: " + str(error.code))

-            # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
+            # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure.
             print(error.info())
             print(error.read().decode("utf8", "ignore"))
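
A minimal usage sketch of the `AzureML` class as it stands at the end of this patch series. The endpoint URL, API key, and deployment name are illustrative placeholders, and the sketch assumes the scoring endpoint returns a JSON object with an `output` field, which is what `complete` reads:

```python
# Illustrative sketch - endpoint URL, key, and deployment name are placeholders.
from llama_index.llms.base import ChatMessage

from docq.support.llms.azure_ml import AzureML

llm = AzureML(
    endpoint_url="https://docq-endpoint.eastus.inference.ml.azure.com/score",  # placeholder
    api_key="<primary-key-or-aml-token>",  # placeholder
    model="llama-2-7b-chat",
    # Optional: pin requests to a single deployment instead of following the
    # endpoint traffic rules.
    model_deployment_name="llama2-7b-chat-8",
)

# Completion call: `complete` wraps the prompt into the
# {"input_data": {"input_string": ..., "parameters": {...}}} payload.
completion = llm.complete("What is infrastructure-as-code?")
print(completion.text)

# Chat call: messages are flattened into a single prompt via `messages_to_prompt`
# and routed through `complete`.
chat_response = llm.chat([ChatMessage(role="user", content="Hello!")])
print(chat_response.message.content)
```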