Add vllm cpu alternative for local development (#721)
* Add vllm cpu image

Signed-off-by: Varun Gupta <[email protected]>

* local mac

Signed-off-by: Varun Gupta <[email protected]>

* squash commits

Signed-off-by: Varun Gupta <[email protected]>

---------

Signed-off-by: Varun Gupta <[email protected]>
varungup90 authored Feb 21, 2025
1 parent 41126e4 commit d37a507
Showing 5 changed files with 288 additions and 0 deletions.
83 changes: 83 additions & 0 deletions development/vllm/README.md
@@ -0,0 +1,83 @@
# vLLM CPU application

This document lists the commands to deploy a vLLM CPU application for local development.

## Deploy model
### Download the facebook/opt-125m model locally
```shell
huggingface-cli download facebook/opt-125m
```
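
Optionally, confirm the model landed in the local Hugging Face cache (this assumes the default cache location under `~/.cache/huggingface`):
```shell
huggingface-cli scan-cache | grep opt-125m
```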

### Set up the kind cluster
Update the host path for the Hugging Face cache in the kind config (`kind-config.yaml`), then create the cluster:
```shell
kind create cluster --config=./development/vllm/kind-config.yaml
```
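
A quick sanity check that the cluster is up (this assumes the default kind cluster name, so the kubectl context is `kind-kind`):
```shell
kubectl cluster-info --context kind-kind
kubectl get nodes
```
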
(Optional) Load the container image into the kind cluster

> Note: If you are using Docker Desktop on Mac, its built-in Kubernetes shares the local image repository with Docker,
> so the following command is not necessary. Only kind users need this step.
```shell
docker pull aibrix/vllm-cpu-env:macos
kind load docker-image aibrix/vllm-cpu-env:macos
```

Build the aibrix runtime component and load the image into the kind cluster:
```shell
make docker-build-all
kind load docker-image aibrix/runtime:nightly
```
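
Optionally, verify the images are present on the kind nodes (the node name below assumes the default cluster name `kind`; `crictl` is available inside kind node containers):
```shell
docker exec -it kind-worker crictl images | grep aibrix
```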

### Deploy model
```shell
# create the model deployment
kubectl create -k vllm/config

# clean up when you are done
kubectl delete -k vllm/config
```
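
The CPU engine can take a while to start. The label, deployment name, and container name below come from `config/deployment.yaml` in this change, so you can watch the pod and tail the engine logs:
```shell
kubectl get pods -l model.aibrix.ai/name=facebook-opt-125m -w
kubectl logs deployment/mock-facebook-opt-125m -c llm-engine -f
```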

### Set up port forwarding

```shell
kubectl port-forward svc/facebook-opt-125m 8000:8000 &
```
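
Before sending a completion, you can hit vLLM's OpenAI-compatible model listing endpoint to confirm the server is responding:
```shell
curl http://localhost:8000/v1/models
```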

### Inference request
```shell
curl -v http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-key-1234567890" \
-d '{
"model": "facebook-opt-125m",
"prompt": "Say this is a test",
"temperature": 0.5,
"max_tokens": 512
}'
```
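
Since the deployment also mounts a chat template (`chat-template.j2`), a chat-style request is a reasonable smoke test as well; this is a sketch using the standard OpenAI chat completions schema:
```shell
curl -v http://localhost:8000/v1/chat/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer test-key-1234567890" \
  -d '{
    "model": "facebook-opt-125m",
    "messages": [{"role": "user", "content": "Say this is a test"}],
    "temperature": 0.5,
    "max_tokens": 128
  }'
```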


## Deploy aibrix gateway
### Set up components
```shell
make docker-build-all
kubectl create -k config/dependency
kubectl create -k config/default
```
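
Check that the gateway components came up before port-forwarding (the namespaces below assume the defaults used by the aibrix manifests):
```shell
kubectl get pods -n aibrix-system
kubectl get pods -n envoy-gateway-system
```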

### Set up port forwarding for the envoy service
```shell
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 &
```
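
The service name suffix (`903790dc` here) is generated by Envoy Gateway and may differ in your cluster; list the services to find the exact name:
```shell
kubectl -n envoy-gateway-system get svc
```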

### Inference request
```shell
curl -v -X POST "http://localhost:8888/v1/completions" \
-H "Content-Type: application/json" \
-H "Authorization: Bearer test-key-1234567890" \
--data '{
"model": "facebook-opt-125m",
"prompt": "Once upon a time,",
"max_tokens": 512,
"temperature": 0.5
}'
```
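
When you are finished, the gateway stack can be torn down by mirroring the create commands, and the kind cluster can be deleted:
```shell
kubectl delete -k config/default
kubectl delete -k config/dependency
kind delete cluster
```
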
76 changes: 76 additions & 0 deletions development/vllm/config/components.yaml
@@ -0,0 +1,76 @@
# Debug only: exposes the pod as a NodePort so a controller running locally on the Mac can reach it.
apiVersion: v1
kind: Service
metadata:
  name: facebook-opt-125m
  namespace: default
  labels:
    prometheus-discovery: "true"
  annotations:
    prometheus.io/scrape: "true"
    prometheus.io/path: "/metrics"
    prometheus.io/port: "8000"
spec:
  selector:
    model.aibrix.ai/name: "facebook-opt-125m"
  ports:
    - protocol: TCP
      name: metrics
      port: 8000
      targetPort: 8000
      nodePort: 30081
  type: NodePort
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: mocked-app-sa
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  name: mocked-app-pod-reader-role
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["pods"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: mocked-app-pod-reader-role-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: mocked-app-sa
    namespace: default
roleRef:
  kind: Role
  name: mocked-app-pod-reader-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
  namespace: default
  name: mocked-app-deployment-reader-role
rules:
  - apiGroups: ["apps"]
    resources: ["deployments"]
    verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
  name: mocked-app-deployment-reader-role-binding
  namespace: default
subjects:
  - kind: ServiceAccount
    name: mocked-app-sa
    namespace: default
roleRef:
  kind: Role
  name: mocked-app-deployment-reader-role
  apiGroup: rbac.authorization.k8s.io
116 changes: 116 additions & 0 deletions development/vllm/config/deployment.yaml
@@ -0,0 +1,116 @@
apiVersion: apps/v1
kind: Deployment
metadata:
  name: mock-facebook-opt-125m
  namespace: default
  labels:
    model.aibrix.ai/name: "facebook-opt-125m"
    model.aibrix.ai/port: "8000"
    adapter.model.aibrix.ai/enabled: "true"
spec:
  replicas: 1
  selector:
    matchLabels:
      adapter.model.aibrix.ai/enabled: "true"
      model.aibrix.ai/name: "facebook-opt-125m"
      app: "mock-facebook-opt-125m"
  template:
    metadata:
      labels:
        adapter.model.aibrix.ai/enabled: "true"
        model.aibrix.ai/name: "facebook-opt-125m"
        app: "mock-facebook-opt-125m"
    spec:
      serviceAccountName: mocked-app-sa
      containers:
        - name: llm-engine
          image: aibrix/vllm-cpu-env:macos
          ports:
            - containerPort: 8000
          command: ["/bin/sh", "-c"]
          args: ["vllm serve facebook/opt-125m --served-model-name facebook-opt-125m --chat-template /etc/chat-template-config/chat-template.j2 --trust-remote-code --device cpu --disable_async_output_proc --enforce-eager --dtype float16"]
          env:
            - name: DEPLOYMENT_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.labels['app']
            - name: POD_NAME
              valueFrom:
                fieldRef:
                  fieldPath: metadata.name
            - name: POD_NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: metadata.namespace
            - name: MY_POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: status.podIP
          volumeMounts:
            - name: model
              mountPath: /root/.cache/huggingface
            - name: chat-template-volume
              mountPath: /etc/chat-template-config
        - name: aibrix-runtime
          image: aibrix/runtime:nightly
          command:
            - aibrix_runtime
            - --port
            - "8080"
          env:
            - name: INFERENCE_ENGINE
              value: vllm
            - name: INFERENCE_ENGINE_ENDPOINT
              value: http://localhost:8000
          ports:
            - containerPort: 8080
              protocol: TCP
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 3
            periodSeconds: 2
          readinessProbe:
            httpGet:
              path: /ready
              port: 8080
            initialDelaySeconds: 5
            periodSeconds: 10
      volumes:
        - name: model
          hostPath:
            path: /root/.cache/huggingface
        - name: chat-template-volume
          configMap:
            name: chat-template-config
---
apiVersion: v1
kind: ConfigMap
metadata:
  name: chat-template-config
data:
  chat-template.j2: |
    {%- if messages[0]['role'] == 'system' -%}
    {%- set system_message = messages[0]['content'] -%}
    {%- set messages = messages[1:] -%}
    {%- else -%}
    {% set system_message = '' -%}
    {%- endif -%}
    {{ bos_token + system_message }}
    {%- for message in messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
    {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
    {%- endif -%}
    {%- if message['role'] == 'user' -%}
    {{ 'USER: ' + message['content'] + '\n' }}
    {%- elif message['role'] == 'assistant' -%}
    {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
    {%- endif -%}
    {%- endfor -%}
    {%- if add_generation_prompt -%}
    {{ 'ASSISTANT:' }}
    {% endif %}
5 changes: 5 additions & 0 deletions development/vllm/config/kustomization.yaml
@@ -0,0 +1,5 @@
kind: Kustomization

resources:
- deployment.yaml
- components.yaml
8 changes: 8 additions & 0 deletions development/vllm/kind-config.yaml
@@ -0,0 +1,8 @@
kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
  - role: control-plane
  - role: worker
    extraMounts:
      # Replace /<path>/ with the directory that holds your local Hugging Face cache
      - hostPath: /<path>/.cache/huggingface
        containerPath: /root/.cache/huggingface
