Add routing for model adapter (#183)

* Add routing for model adapter
* nit: logging
* nit: gateway error response code refactoring
* code review comments
* add/delete httproute for model adapter
* nit

---------

Co-authored-by: varungupta <[email protected]>
vllm-project · Sep 19, 2024 · 1bc20e1 · 1bc20e1
1 parent 38e2e71
commit 1bc20e1
Show file tree

Hide file tree

Showing 12 changed files with 367 additions and 239 deletions.
diff --git a/config/gateway/gateway.yaml b/config/gateway/gateway.yaml
@@ -58,11 +58,16 @@ spec:
         name: original_route
         match:
           prefix: "/"
-          # headers:
-          #   # update ip address as needed and in production this config is not needed as backend will derive the pod ip
-          #   - name: "target-pod"
-          #     string_match:
-          #       exact: "10.244.1.3:8000"
+          headers:
+            # Envoy ANDs all matchers listed under `headers`, so the accepted
+            # strategies are expressed as a single regex on one header rather
+            # than as separate exact matchers (which could never all match).
+            # safe_regex has to match the whole header value.
+            # NOTE(review): older Envoy may also need `google_re2: {}` - confirm.
+            - name: "routing-strategy"
+              string_match:
+                safe_regex:
+                  regex: "random|least-request|throughput"
         route:  
           cluster: original_destination_cluster
           timeout: 1000s  # Increase route timeout

diff --git a/docs/development/app/README.md b/docs/development/app/README.md
@@ -84,7 +84,7 @@ curl -v http://localhost:8888/v1/chat/completions \
      "model": "llama2-70b",
      "messages": [{"role": "user", "content": "Say this is a test!"}],
      "temperature": 0.7
-   }'
+   }' &
 
 # least-request based
 for i in {1..10}; do

diff --git a/docs/tutorial/lora/README.md b/docs/tutorial/lora/README.md
@@ -12,7 +12,7 @@ docker build -t aibrix/vllm-mock:nightly -f Dockerfile .
 
 2. Deploy mocked model image
 ```shell
-kubectl apply -f deployment.yaml
+kubectl apply -f docs/development/app/deployment.yaml
 ```
 
 3. Load models
@@ -43,7 +43,7 @@ Verified! The model is loaded and unloaded successfully and pod annotations are
 5. Deploy the controller and apply the `model_adapter.yaml`
 
 ```
-kubectl apply -f model_adapter.yaml
+kubectl apply -f docs/tutorial/lora/model_adapter.yaml
 ```
 
 
@@ -70,4 +70,33 @@ curl https://localhost:8000/v1/completions \
     "max_tokens": 7,
     "temperature": 0
   }'
+```
+
+# request via gateway without routing strategy
+```shell
+curl -v http://localhost:8888/v1/chat/completions \
+  -H "user: your-user-name" \
+  -H "model: lora-1" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer any_key" \
+  -d '{
+     "model": "lora-1",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
+```
+
+# request via gateway with routing strategy
+```shell
+curl -v http://localhost:8888/v1/chat/completions \
+  -H "user: your-user-name" \
+  -H "model: lora-1" \
+  -H "routing-strategy: least-request" \
+  -H "Content-Type: application/json" \
+  -H "Authorization: Bearer any_key" \
+  -d '{
+     "model": "lora-1",
+     "messages": [{"role": "user", "content": "Say this is a test!"}],
+     "temperature": 0.7
+   }'
 ```
diff --git a/docs/tutorial/lora/model_adapter.yaml b/docs/tutorial/lora/model_adapter.yaml
@@ -3,10 +3,32 @@ kind: ModelAdapter
 metadata:
   name: lora-1
   namespace: aibrix-system
+  labels:
+    model.aibrix.ai: "lora-1"  # same value as metadata.name
+    model.aibrix.ai/port: "8000"  # quoted so the all-digit value stays a string
 spec:
   baseModel: llama2-70b
   podSelector:
     matchLabels:
       model.aibrix.ai: llama2-70b
   artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
   schedulerName: default
+# ---
+# # For testing only: uncomment the manifest below to create the HTTPRoute object manually.
+# apiVersion: gateway.networking.k8s.io/v1
+# kind: HTTPRoute
+# metadata:
+#   name: lora-1-router
+#   namespace: aibrix-system
+# spec:
+#   parentRefs:
+#     - name: aibrix-eg
+#   rules:
+#     - matches:
+#         - headers:
+#             - type: Exact
+#               name: model
+#               value: lora-1
+#       backendRefs:
+#         - name: lora-1
+#           port: 8000