[docs] Polish distributed inference and kv cache examples (#691)
Jeffwan authored Feb 17, 2025
1 parent bc17678 commit a1b389f
Showing 10 changed files with 136 additions and 22 deletions.
21 changes: 15 additions & 6 deletions docs/source/features/distributed-kv-cache.rst
@@ -29,9 +29,9 @@ After deployment, we can see all the components by using ``kubectl get pods -n a

.. code-block:: RST
NAME READY STATUS RESTARTS AGE
aibrix-model-deepseek-coder-7b-kvcache-596965997-p86cx 1/1 Running 0 2m
aibrix-model-deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m
NAME READY STATUS RESTARTS AGE
deepseek-coder-7b-kvcache-596965997-p86cx 1/1 Running 0 2m
deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m
After all components are running, we can use the following yaml to deploy the inference service:

@@ -49,9 +49,8 @@ Now let's use the ``kubectl get pods`` command to ensure the inference service is running:

.. code-block:: RST
NAME READY STATUS RESTARTS AGE
download-model 1/1 Running 0 12m
aibrix-model-deepseek-coder-7b-instruct-6b885ffd8b-2kfjv 2/2 Running 0 4m
NAME READY STATUS RESTARTS AGE
deepseek-coder-7b-instruct-6b885ffd8b-2kfjv 2/2 Running 0 4m
After launching AIBrix's deployment, we can use the following yaml to deploy a distributed KV cache cluster:
@@ -65,6 +64,16 @@ After launching AIBrix's deployment, we can use the following yaml to deploy a distributed KV cache cluster:
2. ``kvcache.orchestration.aibrix.ai/node-affinity-gpu-type`` is unnecessary unless you deploy the model across nodes with different GPU types.
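If you are unsure which GPU types your nodes carry, a quick way to check is to list the relevant node labels. This is a sketch added for illustration; it assumes a GPU-type label such as ``nvidia.com/gpu.product`` is present (e.g. via GPU feature discovery), so substitute whatever label key your cluster actually uses.

.. code-block:: bash

   # List nodes with their GPU product label (the label key is an assumption;
   # substitute the one used in your cluster).
   kubectl get nodes -L nvidia.com/gpu.product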


Run ``kubectl get pods`` to verify all pods are running.

.. code-block:: RST
kubectl get pods -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
deepseek-coder-7b-instruct-85664648c7-xgp9h 1/1 Running 0 2m41s 192.168.59.224 ip-192-168-41-184.us-west-2.compute.internal <none> <none>
deepseek-coder-7b-kvcache-7d5896cd89-dcfzt 1/1 Running 0 2m31s 192.168.37.154 ip-192-168-41-184.us-west-2.compute.internal <none> <none>
deepseek-coder-7b-kvcache-etcd-0 1/1 Running 0 2m31s 192.168.19.197 ip-192-168-3-183.us-west-2.compute.internal <none> <none>


Once the inference service is running, let's set up port forwarding so that we can test the service locally:

* Run ``kubectl get svc -n envoy-gateway-system`` to get the name of the Envoy Gateway service.
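For example, the port-forward step might look like the following sketch; the gateway service name shown is the illustrative one used in the quickstart and may differ in your cluster.

.. code-block:: bash

   # Find the Envoy Gateway service name, then forward a local port to it.
   kubectl get svc -n envoy-gateway-system
   kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
   ENDPOINT="localhost:8888"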
2 changes: 1 addition & 1 deletion docs/source/features/multi-node-inference.rst
@@ -53,7 +53,7 @@ Workloads Examples

This is the ``RayClusterFleet`` example; you can apply this yaml in your cluster.

.. literalinclude:: ../../../samples/distributed/fleet.yaml
.. literalinclude:: ../../../samples/distributed/fleet-two-node.yaml
:language: yaml


5 changes: 5 additions & 0 deletions docs/source/getting_started/quickstart.rst
@@ -59,6 +59,11 @@ Depending on where you deployed AIBrix, you can use either of the following
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
ENDPOINT="localhost:8888"
.. attention::

Some cloud providers, such as AWS EKS, expose the endpoint in the ``hostname`` field; in that case, you should use ``.status.loadBalancer.ingress[0].hostname`` instead.
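As a rough sketch of that case (using the same illustrative gateway service name as above), you could resolve the endpoint from the hostname field like this:

.. code-block:: bash

   # On clusters that publish a hostname instead of an IP (e.g. AWS EKS),
   # read the load balancer hostname from the Service status.
   ENDPOINT=$(kubectl -n envoy-gateway-system get svc envoy-aibrix-system-aibrix-eg-903790dc \
     -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')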


.. code-block:: bash
# completion api
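# The concrete request body is collapsed in this view; as a hedged illustration,
# a completion call against the OpenAI-compatible API served by vLLM might look
# like this (model name taken from samples/quickstart/model.yaml below):
curl -s http://${ENDPOINT}/v1/completions \
  -H "Content-Type: application/json" \
  -d '{
        "model": "deepseek-r1-distill-llama-8b",
        "prompt": "San Francisco is a",
        "max_tokens": 64,
        "temperature": 0
      }'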
74 changes: 74 additions & 0 deletions samples/distributed/fleet-two-node.yaml
@@ -0,0 +1,74 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: RayClusterFleet
metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
name: facebook-opt-13b
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: facebook-opt-13b
strategy:
rollingUpdate:
maxSurge: 25%
maxUnavailable: 25%
type: RollingUpdate
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
annotations:
ray.io/overwrite-container-cmd: "true"
spec:
rayVersion: '2.10.0' # should match the Ray version in the image of the containers
headGroupSpec:
rayStartParams:
dashboard-host: '0.0.0.0'
template:
spec:
containers:
- name: ray-head
image: vllm/vllm-openai:v0.7.1
ports:
- containerPort: 6379
name: gcs-server
- containerPort: 8265
name: dashboard
- containerPort: 10001
name: client
- containerPort: 8000
name: service
command: ["/bin/bash", "-lc", "--"]
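# Clarifying note: $KUBERAY_GEN_RAY_START_CMD is injected by KubeRay v1.1.0+ (see the
# comment in samples/distributed/fleet.yaml below). With one GPU on this head pod and
# one on the worker pod, --tensor-parallel-size 2 spans both pods through the Ray
# distributed executor backend.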
args: ["ulimit -n 65536; echo head; $KUBERAY_GEN_RAY_START_CMD; vllm serve facebook/opt-125m --tensor-parallel-size 2 --distributed-executor-backend ray"]
resources:
limits:
cpu: "4"
nvidia.com/gpu: 1
requests:
cpu: "4"
nvidia.com/gpu: 1
workerGroupSpecs:
# the number of pod replicas in this worker group
- replicas: 1
minReplicas: 1
maxReplicas: 5
groupName: small-group
rayStartParams: {}
template:
spec:
containers:
- name: ray-worker # must consist of lower case alphanumeric characters or '-', and must start and end with an alphanumeric character (e.g. 'my-name' or '123-abc')
image: vllm/vllm-openai:v0.7.1
lifecycle:
preStop:
exec:
command: [ "/bin/sh","-c","ray stop" ]
resources:
limits:
cpu: "4"
nvidia.com/gpu: 1
requests:
cpu: "4"
nvidia.com/gpu: 1
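To try this sample, a minimal sequence is sketched below; it assumes the file is applied from the repository root and that the fleet's pods carry the ``model.aibrix.ai/name`` label from the pod template above.

.. code-block:: bash

   # Create the two-node fleet, then watch for the head and worker pods.
   kubectl apply -f samples/distributed/fleet-two-node.yaml
   kubectl get pods -l model.aibrix.ai/name=facebook-opt-13b -w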
8 changes: 4 additions & 4 deletions samples/distributed/fleet.yaml
@@ -4,12 +4,12 @@ metadata:
labels:
app.kubernetes.io/name: aibrix
app.kubernetes.io/managed-by: kustomize
name: facebook-opt-13b
name: qwen-coder-7b-instruct
spec:
replicas: 1
selector:
matchLabels:
model.aibrix.ai/name: facebook-opt-13b
model.aibrix.ai/name: qwen-coder-7b-instruct
strategy:
rollingUpdate:
maxSurge: 25%
@@ -18,7 +18,7 @@ spec:
template:
metadata:
labels:
model.aibrix.ai/name: facebook-opt-13b
model.aibrix.ai/name: qwen-coder-7b-instruct
annotations:
ray.io/overwrite-container-cmd: "true"
spec:
@@ -44,7 +44,7 @@ spec:
# Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
# into the Ray container. This variable can be used to retrieve the generated Ray start command.
# Note that this environment variable does not include the `ulimit` command.
args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve Qwen/Qwen2.5-Coder-7B-Instruct --served-model-name qwen-coder-7b-instruct --tensor-parallel-size 2 --distributed-executor-backend ray"]
resources:
limits:
cpu: "8000m"
16 changes: 14 additions & 2 deletions samples/kvcache/deployment-tp.yaml
@@ -1,17 +1,24 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: aibrix-model-deepseek-coder-33b-instruct
name: deepseek-coder-33b-instruct
labels:
model.aibrix.ai/name: deepseek-coder-33b-instruct
model.aibrix.ai/port: "8000"
spec:
replicas: 1
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
type: RollingUpdate
selector:
matchLabels:
model.aibrix.ai/name: deepseek-coder-33b-instruct
template:
metadata:
labels:
model.aibrix.ai/name: deepseek-coder-33b-instruct
spec:
containers:
- name: vllm-openai
@@ -58,7 +65,12 @@ spec:
volumeMounts:
- mountPath: /var/run
name: kvcache-socket
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
volumes:
- name: kvcache-socket
hostPath:
path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache-rpc
path: /var/run/vineyard-kubernetes/default/deepseek-coder-33b-kvcache
20 changes: 16 additions & 4 deletions samples/kvcache/deployment.yaml
@@ -1,17 +1,24 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: aibrix-model-deepseek-coder-7b-instruct
name: deepseek-coder-7b-instruct
labels:
model.aibrix.ai/name: deepseek-coder-7b-instruct
model.aibrix.ai/port: "8000"
spec:
replicas: 1
strategy:
rollingUpdate:
maxSurge: 1
maxUnavailable: 1
type: RollingUpdate
selector:
matchLabels:
model.aibrix.ai/name: deepseek-coder-7b-instruct
template:
metadata:
labels:
model.aibrix.ai/name: deepseek-coder-7b-instruct
spec:
containers:
- name: vllm-openai
@@ -26,9 +33,9 @@ spec:
- --model
- deepseek-ai/deepseek-coder-6.7b-instruct
- --served-model-name
- deepseek-coder-6.7b-instruct
- deepseek-coder-7b-instruct
- --max-model-len
- "17000"
- "12288"
- --enable-prefix-caching
- --disable-fastapi-docs
env:
@@ -53,7 +60,12 @@ spec:
volumeMounts:
- mountPath: /var/run
name: kvcache-socket
resources:
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
volumes:
- name: kvcache-socket
hostPath:
path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-7b-kvcache-rpc
path: /var/run/vineyard-kubernetes/default/deepseek-coder-7b-kvcache
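One way to sanity-check the KV cache wiring, sketched under the assumption that the Deployment and container names above are used unchanged, is to confirm the vineyard socket directory is mounted inside the inference container:

.. code-block:: bash

   # The exact socket file name may vary; you should see a vineyard RPC socket listed here.
   kubectl exec deploy/deepseek-coder-7b-instruct -c vllm-openai -- ls /var/run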
4 changes: 2 additions & 2 deletions samples/kvcache/kvcache-tp.yaml
@@ -1,7 +1,7 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: KVCache
metadata:
name: aibrix-model-deepseek-coder-33b-kvcache
name: deepseek-coder-33b-kvcache
namespace: default
annotations:
# kvcache.orchestration.aibrix.ai/node-affinity-gpu-type: NVIDIA-L20
@@ -11,6 +11,6 @@ spec:
service:
type: ClusterIP
port: 9600
cache:
cacheSpec:
image: aibrix/vineyardd:20241120
imagePullPolicy: IfNotPresent
6 changes: 3 additions & 3 deletions samples/kvcache/kvcache.yaml
@@ -1,15 +1,15 @@
apiVersion: orchestration.aibrix.ai/v1alpha1
kind: KVCache
metadata:
name: aibrix-model-deepseek-coder-7b-kvcache
name: deepseek-coder-7b-kvcache
namespace: default
annotations:
kvcache.orchestration.aibrix.ai/pod-affinity-workload: aibrix-model-deepseek-coder-7b-instruct
kvcache.orchestration.aibrix.ai/pod-affinity-workload: deepseek-coder-7b-instruct
spec:
replicas: 1
service:
type: ClusterIP
port: 9600
cache:
cacheSpec:
image: aibrix/vineyardd:20241120
imagePullPolicy: IfNotPresent
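After applying this manifest you can check that the cache and its etcd pod come up. The fully qualified resource name below is an assumption based on the CRD group shown above; fall back to the plain pod listing if it differs.

.. code-block:: bash

   # Hedged: the CRD plural may not be "kvcaches" in your installation.
   kubectl get kvcaches.orchestration.aibrix.ai -n default
   kubectl get pods -n default | grep kvcache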
2 changes: 2 additions & 0 deletions samples/quickstart/model.yaml
@@ -30,6 +30,8 @@ spec:
- --served-model-name
# Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
- deepseek-r1-distill-llama-8b
- --max-model-len
- "12288" # 24k length, this is to avoid "The model's max seq len (131072) is larger than the maximum number of tokens that can be stored in KV cache" issue.
image: vllm/vllm-openai:v0.7.1
imagePullPolicy: Always
name: vllm-openai
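If you hit the max-seq-len error mentioned in the comment above, the vLLM logs are the quickest place to confirm it. This sketch assumes the quickstart pods carry the ``model.aibrix.ai/name`` label referenced above and that the container is named ``vllm-openai``.

.. code-block:: bash

   # Look for the "max seq len ... KV cache" message in the serving logs.
   kubectl logs -l model.aibrix.ai/name=deepseek-r1-distill-llama-8b -c vllm-openai --tail=200 | grep -i "max seq len"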
