From 944c1e7312e44e69415b4f8c05df9468a2ba1087 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 1/3] add IaC for Llama2 Azure OnlineEndpoints --- infra/README.md | 29 ++- infra/{ => app}/azure/arm/appservice.json | 0 .../azure/arm/appservice.parameters.json | 0 infra/{ => app}/azure/arm/deploy.sh | 0 infra/{ => app}/azure/arm/destroy.sh | 2 - infra/inference/azure/arm/deploy.sh | 194 ++++++++++++++++++ infra/inference/azure/arm/destroy.sh | 32 +++ .../azure/arm/environment-version.json | 51 +++++ .../azure/arm/model-1/code-version.json | 56 +++++ .../environment/conda-managedidentity.yaml | 14 ++ .../azure/arm/model-1/environment/conda.yaml | 13 ++ .../model/sklearn_regression_model.pkl | Bin 0 -> 756 bytes .../azure/arm/model-1/onlinescoring/score.py | 36 ++++ .../onlinescoring/score_managedidentity.py | 107 ++++++++++ .../azure/arm/model-1/sample-request.json | 4 + infra/inference/azure/arm/model-version.json | 53 +++++ .../azure/arm/online-endpoint-deployment.json | 107 ++++++++++ ...online-endpoint-deployment.parameters.json | 5 + .../inference/azure/arm/online-endpoint.json | 73 +++++++ infra/inference/azure/arm/sample-request.json | 4 + infra/inference/azure/arm/test-endpoint.sh | 81 ++++++++ infra/inference/azure/cli/deploy.sh | 10 + infra/inference/azure/cli/destroy.sh | 14 ++ .../online-deployment-llama2-7b-chat-v8.yaml | 8 + infra/inference/azure/cli/update.sh | 8 + source/docq/support/llms/azure_ml.py | 158 ++++++++++++++ 26 files changed, 1050 insertions(+), 9 deletions(-) rename infra/{ => app}/azure/arm/appservice.json (100%) rename infra/{ => app}/azure/arm/appservice.parameters.json (100%) rename infra/{ => app}/azure/arm/deploy.sh (100%) rename infra/{ => app}/azure/arm/destroy.sh (99%) create mode 100755 infra/inference/azure/arm/deploy.sh create mode 100755 infra/inference/azure/arm/destroy.sh create mode 100644 infra/inference/azure/arm/environment-version.json create mode 100644 infra/inference/azure/arm/model-1/code-version.json create mode 100644 infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml create mode 100644 infra/inference/azure/arm/model-1/environment/conda.yaml create mode 100644 infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl create mode 100644 infra/inference/azure/arm/model-1/onlinescoring/score.py create mode 100644 infra/inference/azure/arm/model-1/onlinescoring/score_managedidentity.py create mode 100644 infra/inference/azure/arm/model-1/sample-request.json create mode 100644 infra/inference/azure/arm/model-version.json create mode 100644 infra/inference/azure/arm/online-endpoint-deployment.json create mode 100644 infra/inference/azure/arm/online-endpoint-deployment.parameters.json create mode 100644 infra/inference/azure/arm/online-endpoint.json create mode 100644 infra/inference/azure/arm/sample-request.json create mode 100755 infra/inference/azure/arm/test-endpoint.sh create mode 100755 infra/inference/azure/cli/deploy.sh create mode 100755 infra/inference/azure/cli/destroy.sh create mode 100644 infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml create mode 100755 infra/inference/azure/cli/update.sh create mode 100644 source/docq/support/llms/azure_ml.py diff --git a/infra/README.md b/infra/README.md index 59eab12c..74eec0b2 100644 --- a/infra/README.md +++ b/infra/README.md @@ -1,24 +1,39 @@ # Infra-as-Code setup for Docq.AI hosting +- `app` folder - IaC for hosting the Docq.AI app on various cloud providers. 
+- `inference` folder - IaC for hosting the ML models that the Docq app uses for inference.
+
+## Install Azure CLI
+
+- Install the core Azure CLI: `brew install azure-cli`
+- Install the Azure ML CLI v2 (remove v1 if it exists, then install v2)
+
+  ```terminal
+  az extension remove --name ml
+
+  az extension add --name ml --yes
+  ```
+
## Azure ARM Templates

These docs are mainly for contributing to and developing the various deployment methods available in the `/infra` folder. We recommend users start with the installation instructions laid out in the user guide on the main docs site. But feel free to get your hands dirty.

-The ARM template in `/infra/azure/arm` powers the whizard based one-click deploy method described in the main docs.
+The ARM template in `/infra/azure/arm` powers the wizard-based one-click deploy method described in the main docs.

### Deploy and destroy scripts

-There two scripts combine several azure CLI commands to for convinience.
+These two scripts combine several Azure CLI commands for convenience.

-Running `./deploy.sh` is the easiest way to test when interating on the template.
+Running `./deploy.sh` is the easiest way to test when iterating on the template.

- `./deploy.sh ` - args are optional. Creates a resource group and deploys the ARM template based on several defaults. Inspect the script to discover the defaults and available parameters. Params can be overridden by passing argument values in order.
-- `./destroy.sh ` - args are optional. Destroys the resource group and all resources within. Handles purging all Congnitive Services in the resource group that are deleted.
+- `./destroy.sh ` - args are optional. Destroys the resource group and all resources within it. Handles purging all Cognitive Services in the resource group that are deleted.

### Useful CLI commands

If using the scripts above you shouldn't need these, but occasionally they might help when troubleshooting.

+- Authenticate - `az login`
- Create resource group CLI - `az group create --name docq-rg-westeurope --location westeurope`
- Deploy template CLI - `az deployment group create --resource-group docq-rg-westeurope --name docq1 --template-file appservice.json`
- Delete resources in resource group - `az group delete --name docq-rg-westeurope`
@@ -30,19 +45,19 @@ If using the scripts above you shouldn't need these but occasionally you they mi

See the `models` tab in Azure AI Studio for models available to the specific Azure account, along with version numbers.

-See API ref for detials on avail options
+See the API reference for details on the available options.

Explanation about models

### Testing the template

-- Run `./deploy.sh` to test the template deploys all resources sucessfully.
+- Run `./deploy.sh` to test that the template deploys all resources successfully.
- Navigate to the app URL for this instance. Verify the app is working as expected.
- Test the template deployment method aka one-click deploy. This is important as sometimes what works when deploying from the CLI doesn't work in template deployment.
  - Push the template change to your branch (origin) such that it's publicly available.
  - Copy the 'raw' URL for the template file: navigate to the file on GitHub.com and click the 'raw' button in the top right area.
  - URL encode the GitHub raw URL.
- - Trigger Azure template deployment by navigaiting to `https://portal.azure.com/#create/Microsoft.Template/uri/` + - Trigger Azure template deployment by navigating to `https://portal.azure.com/#create/Microsoft.Template/uri/` Example: The URL on main ## AWS diff --git a/infra/azure/arm/appservice.json b/infra/app/azure/arm/appservice.json similarity index 100% rename from infra/azure/arm/appservice.json rename to infra/app/azure/arm/appservice.json diff --git a/infra/azure/arm/appservice.parameters.json b/infra/app/azure/arm/appservice.parameters.json similarity index 100% rename from infra/azure/arm/appservice.parameters.json rename to infra/app/azure/arm/appservice.parameters.json diff --git a/infra/azure/arm/deploy.sh b/infra/app/azure/arm/deploy.sh similarity index 100% rename from infra/azure/arm/deploy.sh rename to infra/app/azure/arm/deploy.sh diff --git a/infra/azure/arm/destroy.sh b/infra/app/azure/arm/destroy.sh similarity index 99% rename from infra/azure/arm/destroy.sh rename to infra/app/azure/arm/destroy.sh index 70418c5a..01db564d 100755 --- a/infra/azure/arm/destroy.sh +++ b/infra/app/azure/arm/destroy.sh @@ -4,8 +4,6 @@ LOCATION="${2:-"westeurope"}" RESOURCE_GROUP="${3:-${NAME}-rg-${LOCATION}}" - - read -p "This will delete all resources in resource group '${RESOURCE_GROUP}'. Are you sure? [y/n]" confirm if [ $confirm = "y" ] || [ $confirm = "Y" ] diff --git a/infra/inference/azure/arm/deploy.sh b/infra/inference/azure/arm/deploy.sh new file mode 100755 index 00000000..249e7ea2 --- /dev/null +++ b/infra/inference/azure/arm/deploy.sh @@ -0,0 +1,194 @@ +set -x + +NAME="${1:-"docq"}" + +LOCATION="${2:-"westeurope"}" + +RESOURCE_GROUP="${NAME}-ml-rg-${LOCATION}" + + +res1=$(az group create --name $RESOURCE_GROUP --location $LOCATION) + + +WORKSPACE="${NAME}-main-ws-${LOCATION}" + +az ml workspace create --name $WORKSPACE --resource-group $RESOURCE_GROUP --location $LOCATION +wait + +# +TOKEN=$(az account get-access-token --query accessToken -o tsv) +# + +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +#LOCATION=$(az ml workspace show --query location -o tsv) +#RESOURCE_GROUP=$(az group show --query name -o tsv) +#WORKSPACE=$(az configure -l --query "[?name=='workspace'].value" -o tsv) +# + +# +#export ENDPOINT_NAME=endpoint-`echo $RANDOM` +export ENDPOINT_NAME="${NAME}-endpoint" +# + +# +API_VERSION="2022-05-01" +# + +echo -e "Using:\nSUBSCRIPTION_ID=$SUBSCRIPTION_ID\nLOCATION=$LOCATION\nRESOURCE_GROUP=$RESOURCE_GROUP\nWORKSPACE=$WORKSPACE" + +# define how to wait +wait_for_completion () { + operation_id=$1 + status="unknown" + + if [[ $operation_id == "" || -z $operation_id || $operation_id == "null" ]]; then + echo "operation id cannot be empty" + exit 1 + fi + + while [[ $status != "Succeeded" && $status != "Failed" ]] + do + echo "Getting operation status from: $operation_id" + operation_result=$(curl --location --request GET $operation_id --header "Authorization: Bearer $TOKEN") + # TODO error handling here + status=$(echo $operation_result | jq -r '.status') + echo "Current operation status: $status" + sleep 5 + done + + if [[ $status == "Failed" ]] + then + error=$(echo $operation_result | jq -r '.error') + echo "Error: $error" + fi +} + +# # +# # Get values for storage account +# response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/datastores?api-version=$API_VERSION&isDefault=true" \ +# --header "Authorization: 
Bearer $TOKEN")
+# AZUREML_DEFAULT_DATASTORE=$(echo $response | jq -r '.value[0].name')
+# AZUREML_DEFAULT_CONTAINER=$(echo $response | jq -r '.value[0].properties.containerName')
+# export AZURE_STORAGE_ACCOUNT=$(echo $response | jq -r '.value[0].properties.accountName')
+# # 
+
+# # 
+# az storage blob upload-batch -d $AZUREML_DEFAULT_CONTAINER/score -s model-1/onlinescoring --account-name $AZURE_STORAGE_ACCOUNT
+# # 
+
+# # 
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file code-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   codeAssetName="score-sklearn" \
+#   codeUri="https://$AZURE_STORAGE_ACCOUNT.blob.core.windows.net/$AZUREML_DEFAULT_CONTAINER/score"
+# # 
+
+# # 
+# az storage blob upload-batch -d $AZUREML_DEFAULT_CONTAINER/model -s model-1/model --account-name $AZURE_STORAGE_ACCOUNT
+# # 
+
+# # 
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file model-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   modelAssetVersion=6 \
+#   modelAssetName="Llama-2-13b-chat" \
+#   modelUri="azureml://registries/azureml-meta/models/Llama-2-13b-chat/versions/6"
+# # 
+
+# # 
+# CONDA_FILE=$(cat model-1/environment/conda.yaml)
+# # 
+
+# # 
+# ENV_VERSION=$RANDOM
+# az deployment group create -g $RESOURCE_GROUP \
+#   --template-file environment-version.json \
+#   --parameters \
+#   workspaceName=$WORKSPACE \
+#   environmentAssetName=sklearn-env \
+#   environmentAssetVersion=$ENV_VERSION \
+#   dockerImage=mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1 \
+#   condaFile="$CONDA_FILE"
+# # 
+
+# 
+az deployment group create -g $RESOURCE_GROUP \
+  --template-file online-endpoint.json \
+  --parameters \
+  workspaceName=$WORKSPACE \
+  onlineEndpointName=$ENDPOINT_NAME \
+  identityType=SystemAssigned \
+  authMode=AMLToken \
+  location=$LOCATION
+# 
+
+# 
+response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME?api-version=$API_VERSION" \
+--header "Content-Type: application/json" \
+--header "Authorization: Bearer $TOKEN")
+
+operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri')
+wait_for_completion $operation_id
+# 
+
+LLAMA2_7B_CHAT="azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/8"
+SATVIKAG_CHATBOT="azureml://registries/HuggingFace/models/satvikag-chatbot/versions/3"
+SELECTED_MODEL=$LLAMA2_7B_CHAT
+MODEL_NAME="llama2-7b-chat-8"
+
+# 
+resourceScope="/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices"
+az deployment group create -g $RESOURCE_GROUP \
+  --template-file online-endpoint-deployment.json \
+  --parameters \
+  workspaceName=$WORKSPACE \
+  location=$LOCATION \
+  onlineEndpointName=$ENDPOINT_NAME \
+  onlineDeploymentName=$MODEL_NAME \
+  model=$SELECTED_MODEL \
+  endpointComputeType="Managed" \
+  skuName="Standard_NC12s_v3" \
+  skuCapacity=1
+  # 
+
+# 
+response=$(curl --location --request GET "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/$MODEL_NAME?api-version=$API_VERSION" \
+--header "Content-Type: application/json" \
+--header "Authorization: Bearer $TOKEN")
+
+operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri')
+wait_for_completion $operation_id
+
+scoringUri=$(echo $response | jq -r '.properties.scoringUri')
+# 
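+
+# NOTE: sample-request.json must match the input schema of the selected model.
+# For the Llama-2 chat models in the azureml-meta registry the payload is
+# typically shaped like the following (illustrative only - verify against the
+# model card in Azure AI Studio):
+# {
+#   "input_data": {
+#     "input_string": [{"role": "user", "content": "What is Docq?"}],
+#     "parameters": {"temperature": 0.1, "max_new_tokens": 200}
+#   }
+# }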
+
+# 
+response=$(curl -H "Content-Length: 0" --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/token?api-version=$API_VERSION" \
+--header "Authorization: Bearer $TOKEN")
+accessToken=$(echo $response | jq -r '.accessToken')
+# 
+
+# 
+curl --location --request POST $scoringUri \
+  --header "Authorization: Bearer $accessToken" \
+  --header "Content-Type: application/json" \
+  --data-raw @sample-request.json
+# 
+
+# 
+curl --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/$MODEL_NAME/getLogs?api-version=$API_VERSION" \
+  --header "Authorization: Bearer $TOKEN" \
+  --header "Content-Type: application/json" \
+  --data-raw "{ \"tail\": 100 }"
+# 
+
+# # 
+# curl --location --request DELETE "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME?api-version=$API_VERSION" \
+#   --header "Content-Type: application/json" \
+#   --header "Authorization: Bearer $TOKEN" || true
+# # 
diff --git a/infra/inference/azure/arm/destroy.sh b/infra/inference/azure/arm/destroy.sh
new file mode 100755
index 00000000..4dd619ad
--- /dev/null
+++ b/infra/inference/azure/arm/destroy.sh
@@ -0,0 +1,32 @@
+NAME="${1:-"docq"}"
+
+LOCATION="${2:-"westeurope"}"
+
+RESOURCE_GROUP="${3:-${NAME}-ml-rg-${LOCATION}}"
+
+read -p "This will delete all resources in resource group '${RESOURCE_GROUP}'. Are you sure? [y/n] " confirm
+
+if [ $confirm = "y" ] || [ $confirm = "Y" ]
+then
+
+  workspaces=($(az ml workspace list --resource-group $RESOURCE_GROUP --query [].name --output tsv))
+  echo "Deleting ${#workspaces[@]} Azure ML Workspaces"
+
+  for workspace in "${workspaces[@]}"
+  do
+    az ml workspace delete --name $workspace --resource-group $RESOURCE_GROUP --permanently-delete --all-resources --yes
+    echo "'${workspace}' deleted."
+  done
+  wait
+
+  res=$(az group delete --name ${RESOURCE_GROUP} -y)
+  echo $res | jq '.'
+  wait
+
+  echo "Success! All resources in resource group '${RESOURCE_GROUP}' were deleted."
+  exit 0
+elif [ $confirm = "n" ] || [ $confirm = "N" ]
+then
+  echo "Aborted! Nothing was destroyed."
+  exit 1
+fi
\ No newline at end of file
diff --git a/infra/inference/azure/arm/environment-version.json b/infra/inference/azure/arm/environment-version.json
new file mode 100644
index 00000000..a9f2e3ca
--- /dev/null
+++ b/infra/inference/azure/arm/environment-version.json
@@ -0,0 +1,51 @@
+{
+  "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#",
+  "contentVersion": "1.0.0.0",
+  "parameters": {
+    "workspaceName": {
+      "type": "string"
+    },
+    "environmentAssetName": {
+      "type": "string"
+    },
+    "environmentAssetVersion": {
+      "defaultValue": "1",
+      "type": "string"
+    },
+    "environmentDescription": {
+      "defaultValue": "This is a test description for an environment created from an ARM template",
+      "type": "string"
+    },
+    "condaFile": {
+      "defaultValue": "",
+      "type": "string",
+      "metadata": {
+        "description": "Standard configuration file used by Conda that lets you install any kind of package, including Python, R, and C/C++ packages."
+ } + }, + "isAnonymous": { + "defaultValue": false, + "type": "bool" + }, + "dockerImage": { + "defaultValue": "", + "type": "string", + "metadata": { + "description": "Docker image path, for example: 'docker.io/tensorflow/serving:latest'." + } + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/environments/versions", + "apiVersion": "2022-05-01", + "name": "[concat(parameters('workspaceName'), '/', parameters('environmentAssetName'), '/', parameters('environmentAssetVersion'))]", + "properties": { + "isAnonymous": "[parameters('isAnonymous')]", + "description": "[parameters('environmentDescription')]", + "image": "[parameters('dockerImage')]", + "condaFile": "[parameters('condaFile')]" + } + } + ] +} \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/code-version.json b/infra/inference/azure/arm/model-1/code-version.json new file mode 100644 index 00000000..b54c9f4a --- /dev/null +++ b/infra/inference/azure/arm/model-1/code-version.json @@ -0,0 +1,56 @@ +{ + "$schema": "https://schema.management.azure.com/schemas/2019-04-01/deploymentTemplate.json#", + "contentVersion": "1.0.0.0", + "parameters": { + "workspaceName": { + "type": "string", + "metadata": { + "description": "Specifies the name of the Azure Machine Learning Workspace which will contain this compute." + } + }, + "codeAssetName": { + "type": "string", + "metadata": { + "description": "Specifies the name of the Azure Machine Learning code asset which will be created or updated." + } + }, + "codeAssetVersion": { + "defaultValue": "1", + "type": "string", + "metadata": { + "description": "Specifies the version of the Azure Machine Learning code asset which will be created or updated." + } + }, + "codeUri": { + "type": "string", + "metadata": { + "description": "Specifies the location of the Azure Machine Learning code asset in a storage account." + } + }, + "codeAssetDescription": { + "defaultValue": "This is a test description for a code asset created by an ARM template", + "type": "string" + }, + "isAnonymous": { + "defaultValue": false, + "type": "bool", + "metadata": { + "description": "If the name version are system generated (anonymous registration)." 
+ } + } + }, + "resources": [ + { + "type": "Microsoft.MachineLearningServices/workspaces/codes/versions", + "apiVersion": "2022-05-01", + "name": "[concat(parameters('workspaceName'), '/', parameters('codeAssetName'), '/', parameters('codeAssetVersion'))]", + "properties": { + "description": "[parameters('codeAssetDescription')]", + "codeUri": "[parameters('codeUri')]", + "isAnonymous": "[parameters('isAnonymous')]", + "properties": {}, + "tags": {} + } + } + ] +} \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml b/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml new file mode 100644 index 00000000..c6372ca3 --- /dev/null +++ b/infra/inference/azure/arm/model-1/environment/conda-managedidentity.yaml @@ -0,0 +1,14 @@ +name: model-env +channels: + - conda-forge +dependencies: + - python=3.7 + - numpy=1.21.2 + - pip=21.2.4 + - scikit-learn=0.24.2 + - scipy=1.7.1 + - pip: + - azureml-defaults==1.38.0 + - joblib==1.0.1 + - azure-storage-blob==12.11 + - azure-identity==1.7 diff --git a/infra/inference/azure/arm/model-1/environment/conda.yaml b/infra/inference/azure/arm/model-1/environment/conda.yaml new file mode 100644 index 00000000..426b6146 --- /dev/null +++ b/infra/inference/azure/arm/model-1/environment/conda.yaml @@ -0,0 +1,13 @@ +name: model-env +channels: + - conda-forge +dependencies: + - python=3.9 + - numpy=1.23.5 + - pip=23.0.1 + - scikit-learn=1.2.2 + - scipy=1.10.1 + - pip: + - azureml-defaults==1.49.0 + - inference-schema[numpy-support]==1.5.1 + - joblib==1.2.0 \ No newline at end of file diff --git a/infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl b/infra/inference/azure/arm/model-1/model/sklearn_regression_model.pkl new file mode 100644 index 0000000000000000000000000000000000000000..edb4ffa7d6bc0435b2597b9d2782616848ab86c6 GIT binary patch literal 756 zcmX|9O-K}B7~a*ib!yGOTCGg`UsHy#1Sw(0qz-K^xFSsyeay^$yWhC8GxN@mrLtv* zbXV}&Av#JO3<`>Z#E=MzehwjZswhm-h@cJ?9kOper8&&-KJz}$`@BCR#j2Chv1}W> zt{L=;vSt$NJW^x_b0-*-z-b#hj%FGh1ez{EFb73ZDI5p{Qksi2!^O6WEfE$rPjH)A9UKbJb2QUEeqQmaMb0x9ITJB8jh!^y6iX* zRvD2qlCq!9K{*q5h-_il5aTXG1p`bwiEXHqg0ylMR52{+4$`FRRRtDtP$L!5lvF!x zEab;vQvxEEnMWgys2{PFI{!OkcNlXAiaw~JJgDpE}%#P6wZ zvVXj#cUgG4)N@$;@IzR5^Wsms?Yr=Ex?$+pjc>wm(et;D3QIyw&-_HJdr@fa`|`1_ z_OtN#$gjU+Q=fzh(Yn&)z7np^kJo>N`vRNW|3gEtn=sq`?N1JPwW +TOKEN=$(az account get-access-token --query accessToken -o tsv) +# + +API_VERSION="2022-05-01" + +WORKSPACE="${NAME}-main-ws-${LOCATION}" + +ENDPOINT_NAME="endpoint-10252" + +# +SUBSCRIPTION_ID=$(az account show --query id -o tsv) +# + +echo -e "Using:\nSUBSCRIPTION_ID=$SUBSCRIPTION_ID\nLOCATION=$LOCATION\nRESOURCE_GROUP=$RESOURCE_GROUP\nWORKSPACE=$WORKSPACE" + +wait_for_completion () { + operation_id=$1 + status="unknown" + + if [[ $operation_id == "" || -z $operation_id || $operation_id == "null" ]]; then + echo "operation id cannot be empty" + exit 1 + fi + + while [[ $status != "Succeeded" && $status != "Failed" ]] + do + echo "Getting operation status from: $operation_id" + operation_result=$(curl --location --request GET $operation_id --header "Authorization: Bearer $TOKEN") + # TODO error handling here + status=$(echo $operation_result | jq -r '.status') + echo "Current operation status: $status" + sleep 5 + done + + if [[ $status == "Failed" ]] + then + error=$(echo $operation_result | jq -r '.error') + echo "Error: $error" + fi +} + +# # +# response=$(curl --location --request GET 
"https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/blue?api-version=$API_VERSION" \ +# --header "Content-Type: application/json" \ +# --header "Authorization: Bearer $TOKEN") + +# operation_id=$(echo $response | jq -r '.properties.properties.AzureAsyncOperationUri') +# wait_for_completion $operation_id + +# scoringUri=$(echo $response | jq -r '.properties.scoringUri') +# # + + +# # +# response=$(curl -H "Content-Length: 0" --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/token?api-version=$API_VERSION" \ +# --header "Authorization: Bearer $TOKEN") +# accessToken=$(echo $response | jq -r '.accessToken') +# # + +# # +# curl --location --request POST $scoringUri \ +# --header "Authorization: Bearer $accessToken" \ +# --header "Content-Type: application/json" \ +# --data-raw @sample-request.json +# # + +# # +# curl --location --request POST "https://management.azure.com/subscriptions/$SUBSCRIPTION_ID/resourceGroups/$RESOURCE_GROUP/providers/Microsoft.MachineLearningServices/workspaces/$WORKSPACE/onlineEndpoints/$ENDPOINT_NAME/deployments/blue/getLogs?api-version=$API_VERSION" \ +# --header "Authorization: Bearer $TOKEN" \ +# --header "Content-Type: application/json" \ +# --data-raw "{ \"tail\": 100 }" + +az ml online-endpoint invoke --name $ENDPOINT_NAME --workspace-name $WORKSPACE --resource-group $RESOURCE_GROUP --request-file model-1/sample-request.json \ No newline at end of file diff --git a/infra/inference/azure/cli/deploy.sh b/infra/inference/azure/cli/deploy.sh new file mode 100755 index 00000000..23f34cb6 --- /dev/null +++ b/infra/inference/azure/cli/deploy.sh @@ -0,0 +1,10 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +LOCAL="${3:-false}" +az ml online-deployment create \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ + --file $FILE \ + --local $LOCAL \ No newline at end of file diff --git a/infra/inference/azure/cli/destroy.sh b/infra/inference/azure/cli/destroy.sh new file mode 100755 index 00000000..01a6d82c --- /dev/null +++ b/infra/inference/azure/cli/destroy.sh @@ -0,0 +1,14 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +az ml online-deployment update \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME + --set traffic=0 + +az ml online-deployment delete \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ No newline at end of file diff --git a/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml b/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml new file mode 100644 index 00000000..c17b93c5 --- /dev/null +++ b/infra/inference/azure/cli/online-deployment-llama2-7b-chat-v8.yaml @@ -0,0 +1,8 @@ +$schema: https://azuremlschemas.azureedge.net/latest/managedOnlineDeployment.schema.json +model: azureml://registries/azureml-meta/models/Llama-2-7b-chat/versions/8 +instance_type: Standard_NC12s_v3 +instance_count: 1 +request_settings: + 
request_timeout_ms: 90000 + max_concurrent_requests_per_instance: 1 + max_queue_wait_ms: 1000 diff --git a/infra/inference/azure/cli/update.sh b/infra/inference/azure/cli/update.sh new file mode 100755 index 00000000..6ef63277 --- /dev/null +++ b/infra/inference/azure/cli/update.sh @@ -0,0 +1,8 @@ +FILE="${1:-"online-deployment-llama2-7b-chat-v8.yaml"}" +ENDPOINT_NAME="${2:-"docq-endpoint"}" +az ml online-deployment update \ + --name "llama2-7b-chat-8" \ + --workspace-name "docq-main-ws-eastus" \ + --resource-group "docq-ml-rg-eastus" \ + --endpoint-name $ENDPOINT_NAME \ + --file $FILE \ No newline at end of file diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py new file mode 100644 index 00000000..97718918 --- /dev/null +++ b/source/docq/support/llms/azure_ml.py @@ -0,0 +1,158 @@ +"""Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" + +from typing import Any, Dict, Optional, Sequence + +from llama_index.callbacks import CallbackManager +from llama_index.constants import DEFAULT_NUM_OUTPUTS +from llama_index.llms.base import ( + ChatMessage, + ChatResponse, + ChatResponseGen, + CompletionResponse, + CompletionResponseGen, + LLMMetadata, + llm_chat_callback, + llm_completion_callback, +) +from llama_index.llms.custom import CustomLLM +from llama_index.llms.generic_utils import chat_to_completion_decorator +from llama_index.llms.openai_utils import ( + from_openai_message_dict, + to_openai_message_dicts, +) + + +class AzureML(CustomLLM): + """Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" + + def __init__( + self, + endpoint_url: str, + api_key: str, + model: str = "llama-13b-chat", + temperature: float = 0.1, + max_length: int = 200, + max_tokens: int = 200, + top_p: float = 0.9, + do_sample: bool = True, + additional_kwargs: Optional[Dict[str, Any]] = None, + callback_manager: Optional[CallbackManager] = None, + ) -> None: + try: + from llamaapi import LlamaAPI as Client + except ImportError as e: + raise ImportError("llama_api not installed." 
"Please install it with `pip install llamaapi`.") from e + + self._client = Client(api_key) + self._model = model + self._temperature = temperature + self._max_tokens = max_tokens + self._additional_kwargs = additional_kwargs or {} + self.callback_manager = callback_manager or CallbackManager([]) + + @property + def _model_kwargs(self) -> Dict[str, Any]: + base_kwargs = { + "model": self._model, + "temperature": self._temperature, + "max_length": self._max_tokens, + } + model_kwargs = { + **base_kwargs, + **self._additional_kwargs, + } + return model_kwargs + + @property + def metadata(self) -> LLMMetadata: + return LLMMetadata( + context_window=4096, + num_output=DEFAULT_NUM_OUTPUTS, + is_chat_model=True, + is_function_calling_model=True, + model_name="llama-api", + ) + + @llm_chat_callback() + def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: + message_dicts = to_openai_message_dicts(messages) + json_dict = { + "messages": message_dicts, + **self._model_kwargs, + **kwargs, + } + response = self._client.run(json_dict).json() + message_dict = response["choices"][0]["message"] + message = from_openai_message_dict(message_dict) + + return ChatResponse(message=message, raw=response) + + @llm_completion_callback() + def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: + complete_fn = chat_to_completion_decorator(self.chat) + return complete_fn(prompt, **kwargs) + + @llm_completion_callback() + def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + raise NotImplementedError("stream_complete is not supported for LlamaAPI") + + @llm_chat_callback() + def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: + raise NotImplementedError("stream_chat is not supported for LlamaAPI") + + def _allow_self_signed_https(self, allowed: bool) -> None: + """Bypass the server certificate verification on client side if using self-signed certificate in your scoring service aka Azure Online endpoint. + + You would only use a self-signed certificate when running a local endpoint for dev and testing purposes. + + Args: + allowed (bool): Whether to allow self-signed certificates. + """ + import os + import ssl + + if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None): + ssl._create_default_https_context = ssl._create_unverified_context + + def _azureml_request(self, input: str) -> None: + import json + import urllib.request + + # _allow_self_signed_https(false) # this line is needed if you use self-signed certificate in your scoring service. + + # Request data goes here + # The example below assumes JSON formatting which may be updated + # depending on the format your endpoint expects. + # More information can be found here: + # https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script + data = {} + + body = str.encode(json.dumps(data)) + + url = "https://docq-endpoint.eastus.inference.ml.azure.com/score" + # Replace this with the primary/secondary key or AMLToken for the endpoint + api_key = "" + if not api_key: + raise Exception("A key should be provided to invoke the endpoint") + + # The azureml-model-deployment header will force the request to go to a specific deployment. 
+ # Remove this header to have the request observe the endpoint traffic rules + headers = { + "Content-Type": "application/json", + "Authorization": ("Bearer " + api_key), + "azureml-model-deployment": "llama2-7b-chat-8", + } + + req = urllib.request.Request(url, body, headers) + + try: + response = urllib.request.urlopen(req) + + result = response.read() + print(result) + except urllib.error.HTTPError as error: + print("The request failed with status code: " + str(error.code)) + + # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure + print(error.info()) + print(error.read().decode("utf8", "ignore")) From fe8f5afbce0172372bd3a96f7b6103d7ce0ad5c3 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 2/3] update: CustomLLM class --- source/docq/support/llms/azure_ml.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py index 97718918..22a0f16c 100644 --- a/source/docq/support/llms/azure_ml.py +++ b/source/docq/support/llms/azure_ml.py @@ -38,12 +38,12 @@ def __init__( additional_kwargs: Optional[Dict[str, Any]] = None, callback_manager: Optional[CallbackManager] = None, ) -> None: - try: - from llamaapi import LlamaAPI as Client - except ImportError as e: - raise ImportError("llama_api not installed." "Please install it with `pip install llamaapi`.") from e + # try: + # from llamaapi import LlamaAPI as Client + # except ImportError as e: + # raise ImportError("llama_api not installed." "Please install it with `pip install llamaapi`.") from e - self._client = Client(api_key) + # self._client = Client(api_key) self._model = model self._temperature = temperature self._max_tokens = max_tokens @@ -56,6 +56,9 @@ def _model_kwargs(self) -> Dict[str, Any]: "model": self._model, "temperature": self._temperature, "max_length": self._max_tokens, + "top_p": self._top_p, + "do_sample": self._do_sample, + "max_new_tokens": self._max_tokens, } model_kwargs = { **base_kwargs, @@ -100,6 +103,15 @@ def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: raise NotImplementedError("stream_chat is not supported for LlamaAPI") + +class AzureMLOnlineEndpoint: + """Client for Azure ML Online Endpoint.""" + + def __init__(self, endpoint_url: str, api_key: str, allow_self_signed_https: bool = False) -> None: + self._endpoint_url = endpoint_url + self._api_key = api_key + self._allow_self_signed_https(allow_self_signed_https) + def _allow_self_signed_https(self, allowed: bool) -> None: """Bypass the server certificate verification on client side if using self-signed certificate in your scoring service aka Azure Online endpoint. 
@@ -114,7 +126,8 @@ def _allow_self_signed_https(self, allowed: bool) -> None: if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None): ssl._create_default_https_context = ssl._create_unverified_context - def _azureml_request(self, input: str) -> None: + def run(self, input: str) -> str: + """Run the model on the input.""" import json import urllib.request From eb2b48a2d1d68ba485e6c63f50ebc2cb7f780196 Mon Sep 17 00:00:00 2001 From: Janaka Abeywardhana Date: Tue, 5 Sep 2023 11:31:14 +0100 Subject: [PATCH 3/3] refactor: use methods in LlamaIndex generic utils --- source/docq/support/llms/azure_ml.py | 124 +++++++++++++++++---------- 1 file changed, 77 insertions(+), 47 deletions(-) diff --git a/source/docq/support/llms/azure_ml.py b/source/docq/support/llms/azure_ml.py index 22a0f16c..8f181023 100644 --- a/source/docq/support/llms/azure_ml.py +++ b/source/docq/support/llms/azure_ml.py @@ -1,6 +1,6 @@ """Llama Index `LLM` class implementation for LLMs hosted using Azure ML Online Endpoints.""" -from typing import Any, Dict, Optional, Sequence +from typing import Any, Callable, Dict, Optional, Sequence from llama_index.callbacks import CallbackManager from llama_index.constants import DEFAULT_NUM_OUTPUTS @@ -15,10 +15,9 @@ llm_completion_callback, ) from llama_index.llms.custom import CustomLLM -from llama_index.llms.generic_utils import chat_to_completion_decorator -from llama_index.llms.openai_utils import ( - from_openai_message_dict, - to_openai_message_dicts, +from llama_index.llms.generic_utils import completion_response_to_chat_response +from llama_index.llms.generic_utils import ( + messages_to_prompt as generic_messages_to_prompt, ) @@ -31,34 +30,38 @@ def __init__( api_key: str, model: str = "llama-13b-chat", temperature: float = 0.1, + max_new_tokens: int = 200, max_length: int = 200, - max_tokens: int = 200, top_p: float = 0.9, do_sample: bool = True, + model_deployment_name: Optional[str] = None, additional_kwargs: Optional[Dict[str, Any]] = None, + messages_to_prompt: Optional[Callable] = None, callback_manager: Optional[CallbackManager] = None, ) -> None: - # try: - # from llamaapi import LlamaAPI as Client - # except ImportError as e: - # raise ImportError("llama_api not installed." 
"Please install it with `pip install llamaapi`.") from e - - # self._client = Client(api_key) + """Initialize the LLM.""" + self._client = AzureMLOnlineEndpoint( + endpoint_url=endpoint_url, api_key=api_key, model_deployment_name=model_deployment_name + ) # Client(api_key) self._model = model self._temperature = temperature - self._max_tokens = max_tokens + self._max_new_tokens = max_new_tokens + self._max_length = max_length + self._top_p = top_p + self._do_sample = do_sample self._additional_kwargs = additional_kwargs or {} self.callback_manager = callback_manager or CallbackManager([]) + self._messages_to_prompt = messages_to_prompt or generic_messages_to_prompt @property def _model_kwargs(self) -> Dict[str, Any]: base_kwargs = { "model": self._model, "temperature": self._temperature, - "max_length": self._max_tokens, + "max_length": self._max_new_tokens, "top_p": self._top_p, "do_sample": self._do_sample, - "max_new_tokens": self._max_tokens, + "max_new_tokens": self._max_new_tokens, } model_kwargs = { **base_kwargs, @@ -68,6 +71,7 @@ def _model_kwargs(self) -> Dict[str, Any]: @property def metadata(self) -> LLMMetadata: + """Get the metadata for the LLM.""" return LLMMetadata( context_window=4096, num_output=DEFAULT_NUM_OUTPUTS, @@ -78,38 +82,61 @@ def metadata(self) -> LLMMetadata: @llm_chat_callback() def chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponse: - message_dicts = to_openai_message_dicts(messages) - json_dict = { - "messages": message_dicts, - **self._model_kwargs, - **kwargs, - } - response = self._client.run(json_dict).json() - message_dict = response["choices"][0]["message"] - message = from_openai_message_dict(message_dict) - - return ChatResponse(message=message, raw=response) + """Chat with the LLM.""" + prompt = self._messages_to_prompt(messages) + completion_response = self.complete(prompt, **kwargs) + return completion_response_to_chat_response(completion_response) @llm_completion_callback() def complete(self, prompt: str, **kwargs: Any) -> CompletionResponse: - complete_fn = chat_to_completion_decorator(self.chat) - return complete_fn(prompt, **kwargs) + """Complete the prompt with the LLM.""" + # self._generate_kwargs.update({"stream": False}) + + input_json_dict = { + "input_data": { + "input_string": prompt, + "parameters": { + **self._model_kwargs, + **kwargs, + }, + } + } + response = self._client.run(input_json_dict) + + return CompletionResponse(text=response["output"], raw=response) @llm_completion_callback() def stream_complete(self, prompt: str, **kwargs: Any) -> CompletionResponseGen: + """Stream completion of the prompt with the LLM.""" raise NotImplementedError("stream_complete is not supported for LlamaAPI") @llm_chat_callback() def stream_chat(self, messages: Sequence[ChatMessage], **kwargs: Any) -> ChatResponseGen: + """Stream chat with the LLM.""" raise NotImplementedError("stream_chat is not supported for LlamaAPI") class AzureMLOnlineEndpoint: - """Client for Azure ML Online Endpoint.""" + """Web API Client for interacting with an model(s) hosted by an Azure ML Online Endpoint.""" - def __init__(self, endpoint_url: str, api_key: str, allow_self_signed_https: bool = False) -> None: + def __init__( + self, + endpoint_url: str, + api_key: str, + model_deployment_name: Optional[str] = None, + allow_self_signed_https: Optional[bool] = False, + ) -> None: + """Initialize the client. + + Args: + endpoint_url (str): The AzureML Online endpoint URL. + api_key (str): The API key. Primary/secondary key or AMLToken can be used. 
+            allow_self_signed_https (bool, optional): Whether to allow self-signed certificates. Defaults to `False`.
+            model_deployment_name (str, optional): The model deployment name. Used to override server-side deployment routing rules. Defaults to `None`.
+        """
         self._endpoint_url = endpoint_url
         self._api_key = api_key
+        self._model_deployment_name = model_deployment_name
         self._allow_self_signed_https(allow_self_signed_https)

     def _allow_self_signed_https(self, allowed: bool) -> None:
@@ -126,35 +153,38 @@ def _allow_self_signed_https(self, allowed: bool) -> None:
         if allowed and not os.environ.get("PYTHONHTTPSVERIFY", "") and getattr(ssl, "_create_unverified_context", None):
             ssl._create_default_https_context = ssl._create_unverified_context

-    def run(self, input: str) -> str:
-        """Run the model on the input."""
+    def run(self, input_data: Dict[str, Any]) -> Dict[str, Any]:
+        """Run the model on the input.
+
+        Args:
+            input_data (Dict[str, Any]): The prompt input data. The format depends on what the model expects. See the Azure ML Studio Model registry for examples.
+
+        Returns:
+            Dict[str, Any]: The parsed JSON response from the model, as consumed by `AzureML.complete`.
+        """
         import json
         import urllib.request

         # _allow_self_signed_https(false) # this line is needed if you use self-signed certificate in your scoring service.

-        # Request data goes here
-        # The example below assumes JSON formatting which may be updated
-        # depending on the format your endpoint expects.
-        # More information can be found here:
-        # https://docs.microsoft.com/azure/machine-learning/how-to-deploy-advanced-entry-script
-        data = {}
+        body = str.encode(json.dumps(input_data))

-        body = str.encode(json.dumps(data))
+        url = self._endpoint_url  # e.g. "https://docq-endpoint.eastus.inference.ml.azure.com/score"

-        url = "https://docq-endpoint.eastus.inference.ml.azure.com/score"
-        # Replace this with the primary/secondary key or AMLToken for the endpoint
-        api_key = ""
+        api_key = self._api_key
         if not api_key:
-            raise Exception("A key should be provided to invoke the endpoint")
+            raise Exception(
+                "Missing API key. One should be provided to invoke the endpoint. A primary/secondary key or an AMLToken can be used."
+            )

-        # The azureml-model-deployment header will force the request to go to a specific deployment.
-        # The value "llama2-7b-chat-8" below matches the onlineDeploymentName that
-        # infra/inference/azure/arm/deploy.sh creates (MODEL_NAME).
-        # Remove this header to have the request observe the endpoint traffic rules
         headers = {
             "Content-Type": "application/json",
             "Authorization": ("Bearer " + api_key),
-            "azureml-model-deployment": "llama2-7b-chat-8",
         }
+        # The azureml-model-deployment header will force the request to go to a specific deployment.
+        # When None, requests observe the endpoint traffic rules.
+        if self._model_deployment_name is not None:
+            headers["azureml-model-deployment"] = self._model_deployment_name

         req = urllib.request.Request(url, body, headers)

         try:
             response = urllib.request.urlopen(req)

             result = response.read()
-            print(result)
+            return json.loads(result)
         except urllib.error.HTTPError as error:
             print("The request failed with status code: " + str(error.code))

-            # Print the headers - they include the requert ID and the timestamp, which are useful for debugging the failure
+            # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure.
             print(error.info())
             print(error.read().decode("utf8", "ignore"))
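
A minimal usage sketch of the `AzureML` class as it stands at the end of this patch series. The endpoint URL, API key, and deployment name are illustrative placeholders, and the sketch assumes the scoring endpoint returns a JSON object with an `output` field, which is what `complete` reads:

```python
# Illustrative sketch - endpoint URL, key, and deployment name are placeholders.
from llama_index.llms.base import ChatMessage

from docq.support.llms.azure_ml import AzureML

llm = AzureML(
    endpoint_url="https://docq-endpoint.eastus.inference.ml.azure.com/score",  # placeholder
    api_key="<primary-key-or-aml-token>",  # placeholder
    model="llama-2-7b-chat",
    # Optional: pin requests to a single deployment instead of following the
    # endpoint traffic rules.
    model_deployment_name="llama2-7b-chat-8",
)

# Completion call: `complete` wraps the prompt into the
# {"input_data": {"input_string": ..., "parameters": {...}}} payload.
completion = llm.complete("What is infrastructure-as-code?")
print(completion.text)

# Chat call: messages are flattened into a single prompt via `messages_to_prompt`
# and routed through `complete`.
chat_response = llm.chat([ChatMessage(role="user", content="Hello!")])
print(chat_response.message.content)
```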