Add vllm cpu alternative for local development (#721)
* Add vllm cpu image

Signed-off-by: Varun Gupta <[email protected]>

* local mac

Signed-off-by: Varun Gupta <[email protected]>

* squash commits

Signed-off-by: Varun Gupta <[email protected]>

---------

Signed-off-by: Varun Gupta <[email protected]>
varungup90 authored Feb 21, 2025
1 parent 41126e4 commit d37a507
Showing 5 changed files with 288 additions and 0 deletions.
83 changes: 83 additions & 0 deletions development/vllm/README.md
@@ -0,0 +1,83 @@
# vLLM CPU application

This document lists the commands to deploy a vLLM CPU application for local development.

## Deploy model
### Download the facebook/opt-125m model locally
```shell
huggingface-cli download facebook/opt-125m
```
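
Optionally, confirm the model landed in the local Hugging Face cache (this assumes the default cache location under `~/.cache/huggingface`):
```shell
huggingface-cli scan-cache | grep opt-125m
```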

### Set up the kind cluster
Update the host path for the Hugging Face cache in the kind config (`kind-config.yaml`), then create the cluster:
```shell
kind create cluster --config=./development/vllm/kind-config.yaml
```
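
A quick sanity check that the cluster is up (this assumes the default kind cluster name, so the kubectl context is `kind-kind`):
```shell
kubectl cluster-info --context kind-kind
kubectl get nodes
```
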
(Optional) Load the container image into the kind cluster

> Note: If you are using Docker Desktop on Mac, its built-in Kubernetes shares the local image repository with Docker,
> so the following command is not necessary. Only kind users need this step.
```shell
docker pull aibrix/vllm-cpu-env:macos
kind load docker-image aibrix/vllm-cpu-env:macos
```

Build the aibrix runtime component and load the image into the kind cluster:
```shell
make docker-build-all
kind load docker-image aibrix/runtime:nightly
```
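
Optionally, verify the images are present on the kind nodes (the node name below assumes the default cluster name `kind`; `crictl` is available inside kind node containers):
```shell
docker exec -it kind-worker crictl images | grep aibrix
```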

### Deploy model
```shell
# create the model deployment
kubectl create -k vllm/config

# clean up when you are done
kubectl delete -k vllm/config
```
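
The CPU engine can take a while to start. The label, deployment name, and container name below come from `config/deployment.yaml` in this change, so you can watch the pod and tail the engine logs:
```shell
kubectl get pods -l model.aibrix.ai/name=facebook-opt-125m -w
kubectl logs deployment/mock-facebook-opt-125m -c llm-engine -f
```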

### Set up port forwarding

```shell
kubectl port-forward svc/facebook-opt-125m 8000:8000 &
```
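
Before sending a completion, you can hit vLLM's OpenAI-compatible model listing endpoint to confirm the server is responding:
```shell
curl http://localhost:8000/v1/models
```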

### Inference request
```shell
curl -v http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-key-1234567890" \
-d '{
"model": "facebook-opt-125m",
"prompt": "Say this is a test",
"temperature": 0.5,
"max_tokens": 512
}'
```
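
Since the deployment also mounts a chat template (`chat-template.j2`), a chat-style request is a reasonable smoke test as well; this is a sketch using the standard OpenAI chat completions schema:
```shell
curl -v http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer test-key-1234567890" \
  -d '{
    "model": "facebook-opt-125m",
    "messages": [{"role": "user", "content": "Say this is a test"}],
    "temperature": 0.5,
    "max_tokens": 128
  }'
```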


## Deploy aibrix gateway
### Set up components
```shell
make docker-build-all
kubectl create -k config/dependency
kubectl create -k config/default
```
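
Check that the gateway components came up before port-forwarding (the namespaces below assume the defaults used by the aibrix manifests):
```shell
kubectl get pods -n aibrix-system
kubectl get pods -n envoy-gateway-system
```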

### Set up port forwarding for the envoy service
```shell
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
```
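
The service name suffix (`903790dc` here) is generated by Envoy Gateway and may differ in your cluster; list the services to find the exact name:
```shell
kubectl -n envoy-gateway-system get svc
```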

### Inference request
```shell
curl -v -X POST "http://localhost:8888/v1/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-key-1234567890" \
--data '{
"model": "facebook-opt-125m",
"prompt": "Once upon a time,",
"max_tokens": 512,
"temperature": 0.5
}'
```
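
When you are finished, the gateway stack can be torn down by mirroring the create commands, and the kind cluster can be deleted:
```shell
kubectl delete -k config/default
kubectl delete -k config/dependency
kind delete cluster
```
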
76 changes: 76 additions & 0 deletions development/vllm/config/components.yaml
@@ -0,0 +1,76 @@
# Debug only: exposes the pod as a NodePort so a controller running locally on the Mac can reach it.
apiVersion: v1
kind: Service
metadata:
  name: facebook-opt-125m
  namespace: default
  labels:
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8000"
spec:
  selector:
    model.aibrix.ai/name: "facebook-opt-125m"
  ports:
    - protocol: TCP
      name: metrics
      port: 8000
      targetPort: 8000
      nodePort: 30081
  type: NodePort
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: mocked-app-sa
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: mocked-app-pod-reader-role
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: mocked-app-pod-reader-role-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: mocked-app-sa
    namespace: default
roleRef:
  kind: Role
  name: mocked-app-pod-reader-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: mocked-app-deployment-reader-role
rules:
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: mocked-app-deployment-reader-role-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: mocked-app-sa
    namespace: default
roleRef:
  kind: Role
  name: mocked-app-deployment-reader-role
  apiGroup: rbac.authorization.k8s.io
116 changes: 116 additions & 0 deletions development/vllm/config/deployment.yaml
@@ -0,0 +1,116 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mock-facebook-opt-125m
  namespace: default
  labels:
    model.aibrix.ai/name: "facebook-opt-125m"
    model.aibrix.ai/port: "8000"
    adapter.model.aibrix.ai/enabled: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      adapter.model.aibrix.ai/enabled: "true"
      model.aibrix.ai/name: "facebook-opt-125m"
      app: "mock-facebook-opt-125m"
  template:
    metadata:
      labels:
        adapter.model.aibrix.ai/enabled: "true"
        model.aibrix.ai/name: "facebook-opt-125m"
        app: "mock-facebook-opt-125m"
    spec:
      serviceAccountName: mocked-app-sa
      containers:
        - name: llm-engine
          image: aibrix/vllm-cpu-env:macos
          ports:
            - containerPort: 8000
          command: ["/bin/sh", "-c"]
          args: ["vllm serve facebook/opt-125m --served-model-name facebook-opt-125m --chat-template /etc/chat-template-config/chat-template.j2 --trust-remote-code --device cpu --disable_async_output_proc --enforce-eager --dtype float16"]
          env:
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['app']
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          volumeMounts:
            - name: model
              mountPath: /root/.cache/huggingface
            - name: chat-template-volume
              mountPath: /etc/chat-template-config
        - name: aibrix-runtime
          image: aibrix/runtime:nightly
          command:
            - aibrix_runtime
            - --port
            - "8080"
          env:
            - name: INFERENCE_ENGINE
              value: vllm
            - name: INFERENCE_ENGINE_ENDPOINT
              value: http://localhost:8000
          ports:
            - containerPort: 8080
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 3
            periodSeconds: 2
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: model
          hostPath:
            path: /root/.cache/huggingface
        - name: chat-template-volume
          configMap:
            name: chat-template-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: chat-template-config
data:
  chat-template.j2: |
    {%- if messages[0]['role'] == 'system' -%}
    {%- set system_message = messages[0]['content'] -%}
    {%- set messages = messages[1:] -%}
    {%- else -%}
    {% set system_message = '' -%}
    {%- endif -%}
    {{ bos_token + system_message }}
    {%- for message in messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
    {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
    {%- endif -%}
    {%- if message['role'] == 'user' -%}
    {{ 'USER: ' + message['content'] + '\n' }}
    {%- elif message['role'] == 'assistant' -%}
    {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
    {%- endif -%}
    {%- endfor -%}
    {%- if add_generation_prompt -%}
    {{ 'ASSISTANT:' }}
    {% endif %}
5 changes: 5 additions & 0 deletions development/vllm/config/kustomization.yaml
@@ -0,0 +1,5 @@
kind: Kustomization

resources:
- deployment.yaml
- components.yaml
8 changes: 8 additions & 0 deletions development/vllm/kind-config.yaml
@@ -0,0 +1,8 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
  - role: worker
    extraMounts:
      # Replace /<path>/ with the directory that holds your local Hugging Face cache
      - hostPath: /<path>/.cache/huggingface
        containerPath: /root/.cache/huggingface
