-
Notifications
You must be signed in to change notification settings - Fork 258
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add vllm cpu alternative for local development (#721)
* Add vllm cpu image Signed-off-by: Varun Gupta <[email protected]> * local mac Signed-off-by: Varun Gupta <[email protected]> * squash commits Signed-off-by: Varun Gupta <[email protected]> --------- Signed-off-by: Varun Gupta <[email protected]>
- Loading branch information
1 parent
41126e4
commit d37a507
Showing
5 changed files
with
288 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,83 @@ | ||
# vLLM CPU application | ||
|
||
This document lists the commands for deploying the vLLM CPU application for local development. | ||
|
||
## Deploy model | ||
### Download facebook/opt-125m model locally | ||
```shell | ||
huggingface-cli download facebook/opt-125m | ||
``` | ||
|
||
### Setup kind cluster | ||
Before creating the cluster, update the `hostPath` in the kind config so that it points to your local Hugging Face cache directory. | ||
```shell | ||
kind create cluster --config=./development/vllm/kind-config.yaml | ||
``` | ||
(Optional) Load the container image into the kind cluster | ||
|
||
> Note: If you are using Docker Desktop on Mac, Kubernetes shares the local image repository with Docker. | ||
> Therefore, the following commands are not necessary. Only kind users need this step. | ||
```shell | ||
docker pull aibrix/vllm-cpu-env:macos | ||
kind load docker-image aibrix/vllm-cpu-env:macos | ||
``` | ||
|
||
Build the AIBrix runtime image and load it into the kind cluster | ||
```shell | ||
make docker-build-all | ||
kind load docker-image aibrix/runtime:nightly | ||
``` | ||
|
||
### Deploy model | ||
```shell | ||
kubectl create -k vllm/config | ||
|
||
# Clean up the deployment when you are finished: | ||
kubectl delete -k vllm/config | ||
``` | ||
|
||
### Setup port forwarding | ||
|
||
```shell | ||
kubectl port-forward svc/facebook-opt-125m 8000:8000 & | ||
``` | ||
|
||
### Inference request | ||
```shell | ||
curl -v http://localhost:8000/v1/completions \ | ||
-H "Content-Type: application/json" \ | ||
-H "Authorization: Bearer test-key-1234567890" \ | ||
-d '{ | ||
"model": "facebook-opt-125m", | ||
"prompt": "Say this is a test", | ||
"temperature": 0.5, | ||
"max_tokens": 512 | ||
}' | ||
``` | ||
|
||
|
||
## Deploy aibrix gateway | ||
### Setup components | ||
```shell | ||
make docker-build-all | ||
kubectl create -k config/dependency | ||
kubectl create -k config/default | ||
``` | ||
|
||
### Setup port forwarding for envoy service | ||
```shell | ||
kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 & | ||
``` | ||
|
||
### Inference request | ||
```shell | ||
curl -v -X POST "http://localhost:8888/v1/completions" \ | ||
-H "Content-Type: application/json" \ | ||
-H "Authorization: Bearer test-key-1234567890" \ | ||
--data '{ | ||
"model": "facebook-opt-125m", | ||
"prompt": "Once upon a time,", | ||
"max_tokens": 512, | ||
"temperature": 0.5 | ||
}' | ||
``` |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,76 @@ | ||
# Debug only: make sure the pod can be reached from a controller that is deployed on a Mac. | ||
apiVersion: v1 | ||
kind: Service | ||
metadata: | ||
name: facebook-opt-125m | ||
namespace: default | ||
labels: | ||
prometheus-discovery: "true" | ||
annotations: | ||
prometheus.io/scrape: "true" | ||
prometheus.io/path: "/metrics" | ||
prometheus.io/port: "8000" | ||
spec: | ||
selector: | ||
model.aibrix.ai/name: "facebook-opt-125m" | ||
ports: | ||
- protocol: TCP | ||
name: metrics | ||
port: 8000 | ||
targetPort: 8000 | ||
nodePort: 30081 | ||
type: NodePort | ||
--- | ||
apiVersion: v1 | ||
kind: ServiceAccount | ||
metadata: | ||
name: mocked-app-sa | ||
namespace: default | ||
--- | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: Role | ||
metadata: | ||
name: mocked-app-pod-reader-role | ||
namespace: default | ||
rules: | ||
- apiGroups: [""] | ||
resources: ["pods"] | ||
verbs: ["get", "list", "watch"] | ||
--- | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: RoleBinding | ||
metadata: | ||
name: mocked-app-pod-reader-role-binding | ||
namespace: default | ||
subjects: | ||
- kind: ServiceAccount | ||
name: mocked-app-sa | ||
namespace: default | ||
roleRef: | ||
kind: Role | ||
name: mocked-app-pod-reader-role | ||
apiGroup: rbac.authorization.k8s.io | ||
--- | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: Role | ||
metadata: | ||
namespace: default | ||
name: mocked-app-deployment-reader-role | ||
rules: | ||
- apiGroups: ["apps"] | ||
resources: ["deployments"] | ||
verbs: ["get", "list", "watch"] | ||
--- | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: RoleBinding | ||
metadata: | ||
name: mocked-app-deployment-reader-role-binding | ||
namespace: default | ||
subjects: | ||
- kind: ServiceAccount | ||
name: mocked-app-sa | ||
namespace: default | ||
roleRef: | ||
kind: Role | ||
name: mocked-app-deployment-reader-role | ||
apiGroup: rbac.authorization.k8s.io |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,116 @@ | ||
apiVersion: apps/v1 | ||
kind: Deployment | ||
metadata: | ||
name: mock-facebook-opt-125m | ||
namespace: default | ||
labels: | ||
model.aibrix.ai/name: "facebook-opt-125m" | ||
model.aibrix.ai/port: "8000" | ||
adapter.model.aibrix.ai/enabled: "true" | ||
spec: | ||
replicas: 1 | ||
selector: | ||
matchLabels: | ||
adapter.model.aibrix.ai/enabled: "true" | ||
model.aibrix.ai/name: "facebook-opt-125m" | ||
app: "mock-facebook-opt-125m" | ||
template: | ||
metadata: | ||
labels: | ||
adapter.model.aibrix.ai/enabled: "true" | ||
model.aibrix.ai/name: "facebook-opt-125m" | ||
app: "mock-facebook-opt-125m" | ||
spec: | ||
serviceAccountName: mocked-app-sa | ||
containers: | ||
- name: llm-engine | ||
image: aibrix/vllm-cpu-env:macos | ||
ports: | ||
- containerPort: 8000 | ||
command: ["/bin/sh", "-c"] | ||
args: ["vllm serve facebook/opt-125m --served-model-name facebook-opt-125m --chat-template /etc/chat-template-config/chat-template.j2 --trust-remote-code --device cpu --disable_async_output_proc --enforce-eager --dtype float16"] | ||
env: | ||
- name: DEPLOYMENT_NAME | ||
valueFrom: | ||
fieldRef: | ||
fieldPath: metadata.labels['app'] | ||
- name: POD_NAME | ||
valueFrom: | ||
fieldRef: | ||
fieldPath: metadata.name | ||
- name: POD_NAMESPACE | ||
valueFrom: | ||
fieldRef: | ||
fieldPath: metadata.namespace | ||
- name: MY_POD_IP | ||
valueFrom: | ||
fieldRef: | ||
fieldPath: status.podIP | ||
volumeMounts: | ||
- name: model | ||
mountPath: /root/.cache/huggingface | ||
- name: chat-template-volume | ||
mountPath: /etc/chat-template-config | ||
- name: aibrix-runtime | ||
image: aibrix/runtime:nightly | ||
command: | ||
- aibrix_runtime | ||
- --port | ||
- "8080" | ||
env: | ||
- name: INFERENCE_ENGINE | ||
value: vllm | ||
- name: INFERENCE_ENGINE_ENDPOINT | ||
value: http://localhost:8000 | ||
ports: | ||
- containerPort: 8080 | ||
protocol: TCP | ||
livenessProbe: | ||
httpGet: | ||
path: /healthz | ||
port: 8080 | ||
initialDelaySeconds: 3 | ||
periodSeconds: 2 | ||
readinessProbe: | ||
httpGet: | ||
path: /ready | ||
port: 8080 | ||
initialDelaySeconds: 5 | ||
periodSeconds: 10 | ||
volumes: | ||
- name: model | ||
hostPath: | ||
path: /root/.cache/huggingface | ||
- name: chat-template-volume | ||
configMap: | ||
name: chat-template-config | ||
--- | ||
apiVersion: v1 | ||
kind: ConfigMap | ||
metadata: | ||
name: chat-template-config | ||
data: | ||
chat-template.j2: | | ||
{%- if messages[0]['role'] == 'system' -%} | ||
{%- set system_message = messages[0]['content'] -%} | ||
{%- set messages = messages[1:] -%} | ||
{%- else -%} | ||
{% set system_message = '' -%} | ||
{%- endif -%} | ||
{{ bos_token + system_message }} | ||
{%- for message in messages -%} | ||
{%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%} | ||
{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }} | ||
{%- endif -%} | ||
{%- if message['role'] == 'user' -%} | ||
{{ 'USER: ' + message['content'] + '\n' }} | ||
{%- elif message['role'] == 'assistant' -%} | ||
{{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }} | ||
{%- endif -%} | ||
{%- endfor -%} | ||
{%- if add_generation_prompt -%} | ||
{{ 'ASSISTANT:' }} | ||
{% endif %} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
kind: Kustomization | ||
|
||
resources: | ||
- deployment.yaml | ||
- components.yaml |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,8 @@ | ||
kind: Cluster | ||
apiVersion: kind.x-k8s.io/v1alpha4 | ||
nodes: | ||
- role: control-plane | ||
- role: worker | ||
extraMounts: | ||
- hostPath: /<path>/.cache/huggingface | ||
containerPath: /root/.cache/huggingface |