From d081b3cc73708bac3c49522f33232b249dfdf30e Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Tue, 3 Dec 2024 16:54:10 -0800 Subject: [PATCH 1/3] Merge simulator to app --- development/app/Dockerfile | 20 +- development/{simulator => app}/Makefile | 62 ++-- development/app/README.md | 23 +- development/app/app.py | 266 ++++++++++---- .../config/heterogeneous/kustomization.yaml | 7 + .../simulator_a40/kustomization.yaml | 25 ++ .../simulator_a40/patch_deployment_a40.yaml | 25 ++ .../patch_podautoscaler_a40.yaml | 15 + .../simulator_a40/rename_deployment_a40.json | 7 + .../rename_podautoscaler_a40.json | 7 + .../mock/components.yaml} | 77 +--- .../app/config/mock/kustomization.yaml | 3 + .../app/config/simulator/kustomization.yaml | 25 ++ .../simulator/patch_deployment_a100.yaml | 26 ++ .../simulator/patch_podautoscaler_a100.yaml | 15 + .../simulator/rename_deployment_a100.json | 7 + .../simulator/rename_podautoscaler_a100.json | 7 + .../templates/deployment/deployment.yaml | 71 ++++ .../templates/deployment/kustomization.yaml | 2 + .../podautoscaler/kustomization.yaml | 2 + .../podautoscaler/podautoscaler.yaml | 22 ++ development/app/requirements.txt | 14 +- development/{simulator => app}/simulator.py | 0 development/simulator/Dockerfile | 33 -- development/simulator/README.md | 75 ---- development/simulator/app.py | 336 ------------------ development/simulator/deployment-a100.yaml | 161 --------- development/simulator/deployment-a40.yaml | 160 --------- development/simulator/environment.yml | 29 -- development/simulator/requirements.txt | 14 - 30 files changed, 557 insertions(+), 979 deletions(-) rename development/{simulator => app}/Makefile (93%) create mode 100644 development/app/config/heterogeneous/kustomization.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/kustomization.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json create mode 100644 development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json rename development/app/{deployment.yaml => config/mock/components.yaml} (50%) create mode 100644 development/app/config/mock/kustomization.yaml create mode 100644 development/app/config/simulator/kustomization.yaml create mode 100644 development/app/config/simulator/patch_deployment_a100.yaml create mode 100644 development/app/config/simulator/patch_podautoscaler_a100.yaml create mode 100644 development/app/config/simulator/rename_deployment_a100.json create mode 100644 development/app/config/simulator/rename_podautoscaler_a100.json create mode 100644 development/app/config/templates/deployment/deployment.yaml create mode 100644 development/app/config/templates/deployment/kustomization.yaml create mode 100644 development/app/config/templates/podautoscaler/kustomization.yaml create mode 100644 development/app/config/templates/podautoscaler/podautoscaler.yaml rename development/{simulator => app}/simulator.py (100%) delete mode 100644 development/simulator/Dockerfile delete mode 100644 development/simulator/README.md delete mode 100644 development/simulator/app.py delete mode 100644 development/simulator/deployment-a100.yaml delete mode 100644 development/simulator/deployment-a40.yaml delete mode 100644 development/simulator/environment.yml delete mode 100644 development/simulator/requirements.txt 
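For reviewers who want to try the restructured layout, the diffstat above boils down to three kustomize entry points under `development/app/config`. A minimal sketch of how they are exercised, based on the Makefile targets introduced in this patch (the service and gateway names are the ones used by those targets and may differ in other clusters):

```shell
# Mock engine only, no vidur simulation
kubectl create -k config/mock

# Single simulated A100 deployment
kubectl create -k config/simulator

# A100 plus A40 heterogeneous deployment
kubectl create -k config/heterogeneous

# Tear down whichever overlay was applied
kubectl delete -k config/mock   # or config/simulator / config/heterogeneous
```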
diff --git a/development/app/Dockerfile b/development/app/Dockerfile index cec8d075..314808ef 100644 --- a/development/app/Dockerfile +++ b/development/app/Dockerfile @@ -1,9 +1,10 @@ # Use the official Python base image -FROM python:3.9-slim +FROM python:3.10-slim # Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 +ENV WANDB_MODE=disabled # Set the working directory WORKDIR /app @@ -12,15 +13,24 @@ WORKDIR /app COPY requirements.txt /app/ # Install dependencies -RUN apt update && apt install -y curl jq +RUN apt update && apt install -y curl jq git RUN pip install --no-cache-dir -r requirements.txt # Copy the rest of the application code into the container -COPY . /app/ +COPY ./*.py /app/ + +ENV MODEL_NAME=llama2-7b +ARG GPU_TYPE=disabled + +# Trigger profiling +RUN if [ "$GPU_TYPE" != "disabled" ]; then \ + python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE}; \ + fi # Expose the port the app runs on EXPOSE 8000 -# Run the application -CMD ["python", "app.py"] +# Run the application, environment variable is necessary to apply ARG +ENV GPU_TYPE=$GPU_TYPE +CMD python app.py --replica_config_device ${GPU_TYPE} diff --git a/development/simulator/Makefile b/development/app/Makefile similarity index 93% rename from development/simulator/Makefile rename to development/app/Makefile index 2805f99d..909a775c 100644 --- a/development/simulator/Makefile +++ b/development/app/Makefile @@ -1,28 +1,55 @@ all: build -build-a100: - docker build -t aibrix/vllm-simulator:nightly -f Dockerfile . +docker-build-mock: + docker build -t aibrix/vllm-mock:nightly -f Dockerfile . -build-a40: +docker-build-simulator: + docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . + +docker-build-simulator-a40: docker build -t aibrix/vllm-simulator-a40:nightly --build-arg GPU_TYPE=a40 -f Dockerfile . 
-build: build-a100 +docker-build: docker-build-mock -deploy-a100: - kubectl apply -f deployment-a100.yaml +deploy-mock: + kubectl create -k config/mock + sleep 2 + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -deploy-a40: - kubectl apply -f deployment-a40.yaml +deploy-simulator: + kubectl create -k config/simulator + sleep 2 + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -deploy: deploy-a100 +deploy-heterogeneous: + kubectl create -k config/heterogeneous sleep 2 - kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & + +deploy: deploy-mock + +clean-mock: + kubectl delete -k config/mock + sleep 1 + curl http://localhost:8000/metrics + curl http://localhost:8888/metrics -clean: - kubectl delete -f deployment-a100.yaml - kubectl delete -f deployment-a40.yaml +clean-simulator: + kubectl delete -k config/simulator sleep 1 curl http://localhost:8000/metrics + curl http://localhost:8888/metrics + +clean-heterogeneous: + kubectl delete -k config/heterogeneous + sleep 1 + curl http://localhost:8000/metrics + curl http://localhost:8888/metrics + +clean: clean-mock test: curl http://localhost:8000/v1/chat/completions \ @@ -56,17 +83,8 @@ test-long: "max_tokens": 50 \ }' -init-local-gateway-call: - kubectl -n aibrix-system port-forward svc/aibrix-gateway-users 8090:8090 1>/dev/null 2>&1 & - kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & - sleep 1 - curl http://localhost:8090/CreateUser \ - -H "Content-Type: application/json" \ - -d '{"name": "your-user-name","rpm": 1000,"tpm": 100000}' - test-gateway: curl -v http://localhost:8888/v1/chat/completions \ - -H "user: your-user-name" \ -H "model: llama2-7b" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer any_key" \ -d '{ diff --git a/development/app/README.md b/development/app/README.md index 3b87c2bf..c533fb20 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -13,7 +13,7 @@ docker build -t aibrix/vllm-mock:nightly -f Dockerfile . ``` -1. (Optional) Load container image to docker context +1.b (Optional) Load container image to docker context > Note: If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. > Therefore, the following command is not necessary. Only kind user need this step. @@ -22,12 +22,27 @@ docker build -t aibrix/vllm-mock:nightly -f Dockerfile . kind load docker-image aibrix/vllm-mock:nightly ``` -1. Deploy mocked model image +2. Deploy mocked model image ```shell -kubectl apply -f deployment.yaml +kubectl create -k config/mock # you can run following command to delete the deployment -kubectl delete -f deployment.yaml +kubectl delete -k config/mock +``` + +### Deploy the simulator app +Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for high-fidelity vLLM simulation: +1. Build simulator base model image +```shell +docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . +``` + +2. 
Deploy simulator model image +```shell +kubectl create -k config/simulator + +# you can run following command to delete the deployment +kubectl delete -k config/simulator ``` ### Test the metric invocation diff --git a/development/app/app.py b/development/app/app.py index 07e2aec4..a2a420b6 100644 --- a/development/app/app.py +++ b/development/app/app.py @@ -2,23 +2,54 @@ from werkzeug import serving import random import re +import logging +import sys import time +from datetime import datetime from random import randint import os +from typing import Optional try: from kubernetes import client, config except Exception as e: print(f"Failed to import kubernetes, skip: {e}") +from simulator import Simulator +from vidur.config import SimulationConfig +from vidur.entities import Request + # Global storage for overridden values overrides = {} -MODEL_NAME = 'llama2-70b' +MODEL_NAME = os.getenv('MODEL_NAME', 'llama2-70b') DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') NAMESPACE = os.getenv('POD_NAMESPACE', 'default') DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) +modelMaps = { + "llama2-7b": "meta-llama/Llama-2-7b-hf", + "llama2-70b": "meta-llama/Llama-2-70b-hf" +} +sys.argv.append(f"--replica_config_model_name={modelMaps.get(MODEL_NAME, MODEL_NAME)}") + +tokenizer = None +simulator: Optional[Simulator] = None + +logger = logging.getLogger(__name__) + +def get_token_count(text): + try: + # Encode the text + encoded_input = tokenizer(text) + + # Get the number of tokens + return len(encoded_input['input_ids']) + except Exception as e: + logger.error(f"Failed to get number of tokens: {e}") + + return 1 + models = [ { "id": "meta-llama/Llama-2-7b-hf", @@ -127,73 +158,128 @@ def unload_model(): @app.route('/v1/completions', methods=['POST']) def completion(): - prompt = request.json.get('prompt') - model = request.json.get('model') - if not prompt or not model: - return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 - - prompt_tokens = randint(1, 100) - completion_tokens = randint(1, 100) - - # Simulated response - response = { - "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", - "object": "text_completion", - "created": 1589478378, - "model": model, - "system_fingerprint": "fp_44709d6fcb", - "choices": [ - { - "text": f"This is indeed a test from model {model}!", - "index": 0, - "logprobs": None, - "finish_reason": "length" + try: + prompt = request.json.get('prompt') + model = request.json.get('model') + max_tokens = request.json.get('max_tokens') + if not prompt or not model: + return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 + + arrived_at = datetime.now().timestamp() + input_tokens = get_token_count(prompt) + output_tokens = max_tokens if max_tokens else randint(10, 500) + arrived_next = request.json.get('next_in') + if not arrived_next: + arrived_next = 0.0 + else: + arrived_next += arrived_at + + start = datetime.now().timestamp() + latency = 0.0 + if simulator is not None: + latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) + + # Simulated response + response = { + "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", + "object": "text_completion", + "created": int(arrived_at), + "model": model, + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "text": f"This is simulated message from {model}!", + "index": 0, + "logprobs": None, + "finish_reason": "length" + } + ], + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + 
"total_tokens": input_tokens + output_tokens, + "time": latency } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens } - } - return jsonify(response), 200 + overhead = datetime.now().timestamp()-start + if latency > overhead: + time.sleep(latency-overhead) + elif latency > 0.0: + logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") + + return jsonify(response), 200 + except Exception as e: + err = { + "error": { + "message": f"The server had an error while processing your request: {e}", + "type": "server_error" + } + } + return jsonify(err), 500 @app.route('/v1/chat/completions', methods=['POST']) def chat_completions(): - messages = request.json.get('messages') - model = request.json.get('model') - if not messages or not model: - return jsonify({"status": "error", "message": "Messages and model are required"}), 400 - - prompt_tokens = randint(1, 100) - completion_tokens = randint(1, 100) - - # Simulated response - response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": 1677858242, - "model": model, - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - }, - "choices": [ - { - "message": { - "role": "assistant", - "content": f"\n\nThis is a test from{model}!" - }, - "logprobs": None, - "finish_reason": "stop", - "index": 0 - } - ] - } - return jsonify(response), 200 + try: + messages = request.json.get('messages') + model = request.json.get('model') + max_tokens = request.json.get('max_tokens') + if not messages or not model: + return jsonify({"status": "error", "message": "Messages and model are required"}), 400 + + arrived_at = datetime.now().timestamp() + input_tokens = sum(get_token_count(message["content"]) for message in messages) + output_tokens = max_tokens if max_tokens else randint(10, 500) + arrived_next = request.json.get('next_in') + if not arrived_next: + arrived_next = 0.0 + else: + arrived_next += arrived_at + + start = datetime.now().timestamp() + latency = 0.0 + if simulator is not None: + latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) + + # Simulated response + response = { + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": int(arrived_at), + "model": model, + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + "time": latency + }, + "choices": [ + { + "message": { + "role": "assistant", + "content": f"\n\nThis is simulated message from {model}!" + }, + "logprobs": None, + "finish_reason": "stop", + "index": 0 + } + ] + } + overhead = datetime.now().timestamp()-start + if latency > overhead: + time.sleep(latency-overhead) + else: + logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") + return jsonify(response), 200 + except Exception as e: + err = { + "error": { + "message": f"The server had an error while processing your request: {e}", + "type": "server_error" + } + } + return jsonify(err), 500 @app.route('/set_metrics', methods=['POST']) def set_metrics(): @@ -481,11 +567,57 @@ def metrics(): if __name__ == '__main__': + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR) # Suppress kubenetes logs + + print(f"Starting app. 
DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}") + + # Extract gpu_device without call argparse + gpu_device = "disabled" try: - # config.load_kube_config() - config.load_incluster_config() - except Exception as e: - print(f"Failed to load k8s config: {e}") + index = sys.argv.index("--replica_config_device") + if index + 1 < len(sys.argv): + gpu_device = sys.argv[index + 1] + except ValueError: + pass + + # Restore -h functionality + if '-h' in sys.argv: + SimulationConfig.create_from_cli_args() + + # Launch simulator + if gpu_device != "disabled": + # Load the tokenizer for your model + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + 'bert-base-uncased', + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) + + simulator = Simulator(SimulationConfig.create_from_cli_args()) + overrides = { + "total": 100.0, + "running": 0, + "waiting": 0, + "swapped": 0 + } + + thread = None + if simulator is not None: + thread = simulator.start() + + # Perform profiling and skip actual run + if '--time_limit' not in sys.argv: + try: + # config.load_kube_config() + config.load_incluster_config() + except Exception as e: + print(f"Failed to load k8s config: {e}") + + app.run(host='0.0.0.0', port=8000) + + if simulator is not None: + simulator.stop() - print(f"Starting app. DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}") - app.run(host='0.0.0.0', port=8000) + if thread is not None: + thread.join() \ No newline at end of file diff --git a/development/app/config/heterogeneous/kustomization.yaml b/development/app/config/heterogeneous/kustomization.yaml new file mode 100644 index 00000000..91c78376 --- /dev/null +++ b/development/app/config/heterogeneous/kustomization.yaml @@ -0,0 +1,7 @@ +kind: Kustomization + +resources: +- ../simulator +- simulator_a40 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/heterogeneous/simulator_a40/kustomization.yaml b/development/app/config/heterogeneous/simulator_a40/kustomization.yaml new file mode 100644 index 00000000..e593f011 --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/kustomization.yaml @@ -0,0 +1,25 @@ +kind: Kustomization + +resources: +- ../../templates/deployment +- ../../templates/podautoscaler + +patches: +- path: rename_deployment_a40.json + target: + kind: Deployment + name: mock-llama2-7b +- path: rename_podautoscaler_a40.json + target: + kind: PodAutoscaler + name: podautoscaler-mock-llama2-7b +- path: patch_deployment_a40.yaml + target: + kind: Deployment + name: mock-llama2-7b +- path: patch_podautoscaler_a40.yaml + target: + kind: PodAutoscaler + name: podautoscaler-simulator-llama2-7b-a40 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml new file mode 100644 index 00000000..5daa884c --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml @@ -0,0 +1,25 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simulator-llama2-7b-a40 + labels: + model.aibrix.ai/name: "llama2-7b" +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + model.aibrix.ai/name: "llama2-7b" + app: "simulator-llama2-7b-a40" + spec: + containers: + - name: llm-engine + image: aibrix/vllm-simulator-a40:nightly + env: + - name: MODEL_NAME + 
valueFrom: + fieldRef: + fieldPath: metadata.labels['model.aibrix.ai/name'] \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml new file mode 100644 index 00000000..93b2d37d --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml @@ -0,0 +1,15 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-simulator-llama2-7b-a40 +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: simulator-llama2-7b-a40 + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b-a40 + metric: "vllm:deployment_replicas" + targetValue: "1" \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json b/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json new file mode 100644 index 00000000..bf964cbd --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "simulator-llama2-7b-a40" + } +] \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json b/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json new file mode 100644 index 00000000..99d1ed4d --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "podautoscaler-simulator-llama2-7b-a40" + } +] \ No newline at end of file diff --git a/development/app/deployment.yaml b/development/app/config/mock/components.yaml similarity index 50% rename from development/app/deployment.yaml rename to development/app/config/mock/components.yaml index 7e4fc772..e66c2f8e 100644 --- a/development/app/deployment.yaml +++ b/development/app/config/mock/components.yaml @@ -1,75 +1,8 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llama2-70b - namespace: default - labels: - model.aibrix.ai/name: "llama2-70b" - model.aibrix.ai/port: "8000" - adapter.model.aibrix.ai/enabled: "true" -spec: - replicas: 3 - selector: - matchLabels: - adapter.model.aibrix.ai/enabled: "true" - model.aibrix.ai/name: "llama2-70b" - template: - metadata: - labels: - adapter.model.aibrix.ai/enabled: "true" - model.aibrix.ai/name: "llama2-70b" - spec: - serviceAccountName: mocked-app-sa - containers: - - name: llm-engine - image: aibrix/vllm-mock:nightly - ports: - - containerPort: 8000 - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: aibrix-runtime - image: aibrix/runtime:nightly - command: - - aibrix_runtime - - --port - - "8080" - env: - - name: INFERENCE_ENGINE - value: vllm - - name: INFERENCE_ENGINE_ENDPOINT - value: http://localhost:8000 - ports: - - containerPort: 8080 - protocol: TCP - livenessProbe: - httpGet: - path: /healthz - port: 8080 - initialDelaySeconds: 3 - periodSeconds: 2 - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 5 - periodSeconds: 10 ---- # Debug only: Make sure pod can be 
visited from controller that deployed in mac. apiVersion: v1 kind: Service metadata: - name: llama2-70b + name: llama2-7b namespace: default labels: prometheus-discovery: "true" @@ -79,7 +12,7 @@ metadata: prometheus.io/port: "8000" spec: selector: - model.aibrix.ai/name: "llama2-70b" + model.aibrix.ai/name: "llama2-7b" ports: - protocol: TCP name: metrics @@ -146,7 +79,7 @@ roleRef: # apiVersion: gateway.networking.k8s.io/v1 # kind: HTTPRoute # metadata: -# name: llama2-70b-router +# name: llama2-7b-router # namespace: default # spec: # parentRefs: @@ -156,7 +89,7 @@ roleRef: # - headers: # - type: Exact # name: model -# value: llama2-70b +# value: llama2-7b # backendRefs: -# - name: llama2-70b +# - name: llama2-7b # port: 8000 \ No newline at end of file diff --git a/development/app/config/mock/kustomization.yaml b/development/app/config/mock/kustomization.yaml new file mode 100644 index 00000000..e0947b6f --- /dev/null +++ b/development/app/config/mock/kustomization.yaml @@ -0,0 +1,3 @@ +resources: + - ../templates/deployment + - components.yaml diff --git a/development/app/config/simulator/kustomization.yaml b/development/app/config/simulator/kustomization.yaml new file mode 100644 index 00000000..6dff0bb1 --- /dev/null +++ b/development/app/config/simulator/kustomization.yaml @@ -0,0 +1,25 @@ +kind: Kustomization + +resources: +- ../mock +- ../templates/podautoscaler + +patches: +- path: rename_deployment_a100.json + target: + kind: Deployment + name: mock-llama2-7b +- path: rename_podautoscaler_a100.json + target: + kind: PodAutoscaler + name: podautoscaler-mock-llama2-7b +- path: patch_deployment_a100.yaml + target: + kind: Deployment + name: simulator-llama2-7b-a100 +- path: patch_podautoscaler_a100.yaml + target: + kind: PodAutoscaler + name: podautoscaler-simulator-llama2-7b-a100 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/simulator/patch_deployment_a100.yaml b/development/app/config/simulator/patch_deployment_a100.yaml new file mode 100644 index 00000000..01b38a96 --- /dev/null +++ b/development/app/config/simulator/patch_deployment_a100.yaml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simulator-llama2-7b-a100 + labels: + model.aibrix.ai/name: "llama2-7b" + model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads. 
+spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + model.aibrix.ai/name: "llama2-7b" + app: "simulator-llama2-7b-a100" + spec: + containers: + - name: llm-engine + image: aibrix/vllm-simulator:nightly + env: + - name: MODEL_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['model.aibrix.ai/name'] \ No newline at end of file diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml new file mode 100644 index 00000000..c04a2c19 --- /dev/null +++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml @@ -0,0 +1,15 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-simulator-llama2-7b-a100 +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: simulator-llama2-7b-a100 + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b-a100 + metric: "vllm:deployment_replicas" + targetValue: "1" \ No newline at end of file diff --git a/development/app/config/simulator/rename_deployment_a100.json b/development/app/config/simulator/rename_deployment_a100.json new file mode 100644 index 00000000..7788ba4f --- /dev/null +++ b/development/app/config/simulator/rename_deployment_a100.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "simulator-llama2-7b-a100" + } +] \ No newline at end of file diff --git a/development/app/config/simulator/rename_podautoscaler_a100.json b/development/app/config/simulator/rename_podautoscaler_a100.json new file mode 100644 index 00000000..fc58fce7 --- /dev/null +++ b/development/app/config/simulator/rename_podautoscaler_a100.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "podautoscaler-simulator-llama2-7b-a100" + } +] \ No newline at end of file diff --git a/development/app/config/templates/deployment/deployment.yaml b/development/app/config/templates/deployment/deployment.yaml new file mode 100644 index 00000000..153d2c17 --- /dev/null +++ b/development/app/config/templates/deployment/deployment.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mock-llama2-7b + namespace: default + labels: + model.aibrix.ai/name: "llama2-7b" + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" +spec: + replicas: 3 + selector: + matchLabels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "llama2-7b" + app: "mock-llama2-7b" + spec: + serviceAccountName: mocked-app-sa + containers: + - name: llm-engine + image: aibrix/vllm-mock:nightly + ports: + - containerPort: 8000 + env: + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app'] + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: aibrix-runtime + image: aibrix/runtime:nightly + command: + - aibrix_runtime + - --port + - "8080" + env: + - name: INFERENCE_ENGINE + value: vllm + - name: INFERENCE_ENGINE_ENDPOINT + value: http://localhost:8000 + ports: + - containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 
8080 + initialDelaySeconds: 3 + periodSeconds: 2 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 \ No newline at end of file diff --git a/development/app/config/templates/deployment/kustomization.yaml b/development/app/config/templates/deployment/kustomization.yaml new file mode 100644 index 00000000..9519a26d --- /dev/null +++ b/development/app/config/templates/deployment/kustomization.yaml @@ -0,0 +1,2 @@ +resources: + - deployment.yaml diff --git a/development/app/config/templates/podautoscaler/kustomization.yaml b/development/app/config/templates/podautoscaler/kustomization.yaml new file mode 100644 index 00000000..77628acc --- /dev/null +++ b/development/app/config/templates/podautoscaler/kustomization.yaml @@ -0,0 +1,2 @@ +resources: + - podautoscaler.yaml diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml new file mode 100644 index 00000000..5f114688 --- /dev/null +++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml @@ -0,0 +1,22 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-mock-llama2-7b + labels: + app.kubernetes.io/name: aibrix + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: mock-llama2-7b + minReplicas: 0 + maxReplicas: 10 + targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b + metric: "vllm:deployment_replicas" + targetValue: "1" + scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/app/requirements.txt b/development/app/requirements.txt index 13814d63..7386688c 100644 --- a/development/app/requirements.txt +++ b/development/app/requirements.txt @@ -1,2 +1,14 @@ flask -kubernetes \ No newline at end of file +kubernetes +numpy +pandas +scikit-learn +wandb +kaleido +ddsketch +plotly_express +matplotlib +seaborn +fasteners +transformers +git+https://github.com/zhangjyr/vidur.git \ No newline at end of file diff --git a/development/simulator/simulator.py b/development/app/simulator.py similarity index 100% rename from development/simulator/simulator.py rename to development/app/simulator.py diff --git a/development/simulator/Dockerfile b/development/simulator/Dockerfile deleted file mode 100644 index a0777710..00000000 --- a/development/simulator/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# Use the official Python base image -FROM python:3.10-slim - -# Set environment variables -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV WANDB_MODE=disabled - -# Set the working directory -WORKDIR /simulator - -# Copy the requirements file into the container -COPY requirements.txt /simulator/ - -# Install dependencies -RUN apt update && apt install -y curl jq git - -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application code into the container -COPY ./*.py /simulator/ -# COPY ./model_cache /simulator/model_cache - -ENV MODEL_NAME=llama2-7b -ARG GPU_TYPE=a100 - # Trigger profiling -RUN python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE} - -# Expose the port the app runs on -EXPOSE 8000 - -# Run the application -CMD ["python", "app.py"] diff --git a/development/simulator/README.md b/development/simulator/README.md deleted file mode 100644 
index b337a053..00000000 --- a/development/simulator/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# vLLM application simulator - -## Run locally - -Ensure that you have Python 3.10 installed on your system. Refer https://www.bitecode.dev/p/installing-python-the-bare-minimum -Create a virtual environment using venv module using python3.10 -m venv .venv -Activate the virtual environment using source .venv/bin/activate -Install the dependencies using python -m pip install -r requirements.txt -Run python app.py to start the server. -Run deactivate to deactivate the virtual environment - -## Run in kubernetes - -1. Build simulated base model image -```dockerfile -docker build -t aibrix/vllm-simulator:nightly -f Dockerfile . - -# If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. -# Therefore, the following command is not necessary. -kind load docker-image aibrix/vllm-simulator:nightly -``` - -2. Deploy simulated model image -```shell -kubectl apply -f docs/development/simulator/deployment.yaml -kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & -``` - -## Test python app separately - -```shell -curl http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer any_key" \ - -d '{ - "model": "llama2-7b", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - -```shell -kubectl delete -f docs/development/simulator/deployment.yaml -``` - -## Test with envoy gateway - -Add User: - - -Port forward to the User and Envoy service: -```shell -kubectl -n aibrix-system port-forward svc/aibrix-gateway-users 8090:8090 1>/dev/null 2>&1 & -kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -``` - -Add User -```shell -curl http://localhost:8090/CreateUser \ - -H "Content-Type: application/json" \ - -d '{"name": "your-user-name","rpm": 100,"tpm": 1000}' -``` - -Test request (ensure header model name matches with deployment's model name for routing) -```shell -curl -v http://localhost:8888/v1/chat/completions \ - -H "user: your-user-name" \ - -H "model: llama2-7b" \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer any_key" \ - -d '{ - "model": "llama2-7b", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' & \ No newline at end of file diff --git a/development/simulator/app.py b/development/simulator/app.py deleted file mode 100644 index 604f10a7..00000000 --- a/development/simulator/app.py +++ /dev/null @@ -1,336 +0,0 @@ -import logging -import os -import sys -import time -from datetime import datetime -from random import randint - -from flask import Flask, Response, jsonify, request - -try: - from kubernetes import client, config -except Exception as e: - print(f"Failed to import kubernetes, skip: {e}") - -from simulator import Simulator -from transformers import AutoTokenizer -from vidur.config import SimulationConfig -from vidur.config_optimizer.config_explorer.config import ModelConfig -from vidur.entities import Request - -MODEL_NAME = os.getenv('MODEL_NAME', 'llama2-70b') -DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') -NAMESPACE = os.getenv('NAMESPACE', 'default') -DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) - -# Load the tokenizer for your model -tokenizer = AutoTokenizer.from_pretrained( - 'bert-base-uncased', - model_max_length=16384, # Suppress warning - 
clean_up_tokenization_spaces=True) - -app = Flask(__name__) -modelMaps = { - "llama2-7b": "meta-llama/Llama-2-7b-hf", - "llama2-70b": "meta-llama/Llama-2-70b-hf" -} -sys.argv.append(f"--replica_config_model_name={modelMaps.get(MODEL_NAME, MODEL_NAME)}") -simulator_config: SimulationConfig = SimulationConfig.create_from_cli_args() -simulator = Simulator(simulator_config) -v1 = None - -# Global storage for overridden values -overrides = {} - -logger = logging.getLogger(__name__) - -def get_token_count(text): - try: - # Encode the text - encoded_input = tokenizer(text) - - # Get the number of tokens - return len(encoded_input['input_ids']) - except Exception as e: - logger.error(f"Failed to get number of tokens: {e}") - - return 1 - -models = [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "meta-llama/Llama-2-7b-hf", - "parent": None, - "permission": [ - { - "id": "modelperm-cb1adf4457b2417e8c7770aadcffe4cc", - "object": "model_permission", - "created": 1715644056, - "allow_create_engine": False, - "allow_sampling": True, - "allow_logprobs": True, - "allow_search_indices": False, - "allow_view": True, - "allow_fine_tuning": False, - "organization": "*", - "group": None, - "is_blocking": False - } - ] - }, - { - "id": "startup-default-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "meta-llama/Llama-2-7b-hf", - "parent": None, - "permission": [ - { - "id": "modelperm-6a01d79e4d0e452b94d52d2c2e8c8562", - "object": "model_permission", - "created": 1715644056, - "allow_create_engine": False, - "allow_sampling": True, - "allow_logprobs": True, - "allow_search_indices": False, - "allow_view": True, - "allow_fine_tuning": False, - "organization": "*", - "group": None, - "is_blocking": False - } - ] - } -] - -@app.route('/v1/models', methods=['GET']) -def get_models(): - return jsonify({ - "object": "list", - "data": models - }) - -@app.route('/v1/load_lora_adapter', methods=['POST']) -def load_model(): - lora_name = request.json.get('lora_name') - # Check if the model already exists - if any(model['id'] == lora_name for model in models): - return jsonify({"status": "success", "message": "Model already loaded"}), 200 - - new_model = { - 'id': lora_name, - 'created': int(time.time()), - 'object': "model", - 'owned_by': "vllm", - 'parent': None, - 'root': request.json.get('lora_path') - } - - models.append(new_model) - return jsonify({"status": "success", "message": "Model loaded successfully"}), 200 - - -@app.route('/v1/unload_lora_adapter', methods=['POST']) -def unload_model(): - model_id = request.json.get('lora_name') - global models - models = [model for model in models if model['id'] != model_id] - return jsonify({"status": "success", "message": "Model unloaded successfully"}), 200 - - -@app.route('/v1/completions', methods=['POST']) -def completion(): - try: - prompt = request.json.get('prompt') - model = request.json.get('model') - max_tokens = request.json.get('max_tokens') - if not prompt or not model: - return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 - - arrived_at = datetime.now().timestamp() - input_tokens = get_token_count(prompt) - output_tokens = max_tokens if max_tokens else randint(10, 500) - arrived_next = request.json.get('next_in') - if not arrived_next: - arrived_next = 0.0 - else: - arrived_next += arrived_at - - start = datetime.now().timestamp() - latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, 
arrived_next=arrived_next)) - - # Simulated response - response = { - "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", - "object": "text_completion", - "created": int(arrived_at), - "model": model, - "system_fingerprint": "fp_44709d6fcb", - "choices": [ - { - "text": f"This is simulated message from {model}!", - "index": 0, - "logprobs": None, - "finish_reason": "length" - } - ], - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": input_tokens + output_tokens, - "time": latency - } - } - overhead = datetime.now().timestamp()-start - if latency > overhead: - time.sleep(latency-overhead) - else: - logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") - - return jsonify(response), 200 - except Exception as e: - import traceback - traceback.print_exc() - - -@app.route('/v1/chat/completions', methods=['POST']) -def chat_completions(): - messages = request.json.get('messages') - model = request.json.get('model') - max_tokens = request.json.get('max_tokens') - if not messages or not model: - return jsonify({"status": "error", "message": "Messages and model are required"}), 400 - - arrived_at = datetime.now().timestamp() - input_tokens = sum(get_token_count(message["content"]) for message in messages) - output_tokens = max_tokens if max_tokens else randint(10, 500) - arrived_next = request.json.get('next_in') - if not arrived_next: - arrived_next = 0.0 - else: - arrived_next += arrived_at - - start = datetime.now().timestamp() - latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) - - # Simulated response - response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": int(arrived_at), - "model": model, - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": input_tokens + output_tokens, - "time": latency - }, - "choices": [ - { - "message": { - "role": "assistant", - "content": f"\n\nThis is simulated message from {model}!" 
- }, - "logprobs": None, - "finish_reason": "stop", - "index": 0 - } - ] - } - overhead = datetime.now().timestamp()-start - if latency > overhead: - time.sleep(latency-overhead) - else: - logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") - return jsonify(response), 200 - -@app.route('/set_metrics', methods=['POST']) -def set_metrics(): - global overrides - # Get JSON data from the request - data = request.json - if data: - # Update overrides with new key-value pairs - overrides.update(data) - return {"status": "success", "message": "Overrides updated"}, 200 - else: - return {"status": "error", "message": "No data provided"}, 400 - -@app.route('/metrics') -def metrics(): - # get deployment information - try: - apps_v1 = client.AppsV1Api() - resp = apps_v1.read_namespaced_deployment(DEPLOYMENT_NAME, NAMESPACE) - replicas = resp.spec.replicas if resp.spec.replicas is not None else 1 - except Exception as e: - print(f"Failed to get deployment information: {DEPLOYMENT_NAME=} {NAMESPACE=} {e=}, set replicas to {DEFAULT_REPLICAS}") - replicas = DEFAULT_REPLICAS - - # a reasonable mock total value - total = overrides.get("total", 0) - model_name = overrides.get("model_name", MODEL_NAME) - # calculate metrics with potential overrides - success_total = overrides.get("success_total", total / replicas) - avg_prompt_throughput = overrides.get("avg_prompt_throughput", total / replicas if replicas > 0 else 0) - avg_generation_throughput = overrides.get("avg_generation_throughput", total / replicas if replicas > 0 else 0) - running = overrides.get("running", 0) - waiting = overrides.get("waiting", 0) - swapped = overrides.get("swapped", 0) - max_running_capacity = 100 - gpu_cache_usage_perc = overrides.get("gpu_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)) - - # construct Prometheus-style Metrics - metrics_output = f"""# HELP vllm:request_success_total Count of successfully processed requests. -# TYPE vllm:request_success_total counter -vllm:request_success_total{{finished_reason="stop",model_name="{model_name}"}} {success_total} -# HELP vllm:num_requests_running Number of requests currently running on GPU. -# TYPE vllm:num_requests_running gauge -vllm:num_requests_running{{model_name="{model_name}"}} {running} -# HELP vllm:num_requests_swapped Number of requests swapped to CPU. -# TYPE vllm:num_requests_swapped gauge -vllm:num_requests_swapped{{model_name="{model_name}"}} {swapped} -# HELP vllm:num_requests_waiting Number of requests waiting to be processed. -# TYPE vllm:num_requests_waiting gauge -vllm:num_requests_waiting{{model_name="{model_name}"}} {waiting} -# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s. -# TYPE vllm:avg_prompt_throughput_toks_per_s gauge -vllm:avg_prompt_throughput_toks_per_s{{model_name="{model_name}"}} {avg_prompt_throughput} -# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s. -# TYPE vllm:avg_generation_throughput_toks_per_s gauge -vllm:avg_generation_throughput_toks_per_s{{model_name="{model_name}"}} {avg_generation_throughput} -# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage. 
-# TYPE vllm:gpu_cache_usage_perc gauge -vllm:gpu_cache_usage_perc{{model_name="model_name"}} {gpu_cache_usage_perc} -""" - return Response(metrics_output, mimetype='text/plain') - -if __name__ == '__main__': - logging.basicConfig(level=logging.DEBUG) - logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR) # Suppress kubenetes logs - - print(f"Starting app. DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}") - - thread = simulator.start() - - import sys - if '--time_limit' not in sys.argv: - try: - # config.load_kube_config() - config.load_incluster_config() - except Exception as e: - print(f"Failed to load k8s config: {e}") - - # Perform profiling and skip actual run - app.run(host='0.0.0.0', port=8000) - - # latency = simulator.execute(Request(0, 25, 100)) - # print(f"request latency: {latency}") - - simulator.stop() - - thread.join() diff --git a/development/simulator/deployment-a100.yaml b/development/simulator/deployment-a100.yaml deleted file mode 100644 index 8436f5ef..00000000 --- a/development/simulator/deployment-a100.yaml +++ /dev/null @@ -1,161 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simulator-llama2-7b-a100 - namespace: aibrix-system - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - model.aibrix.ai/port: "8000" - model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads. -spec: - replicas: 1 - selector: - matchLabels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - template: - metadata: - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - app: "simulator-llama2-7b-a100" - spec: - serviceAccountName: pod-autoscaler - automountServiceAccountToken: true # Important! - containers: - - name: llmengine-simulator - image: aibrix/vllm-simulator:nightly - command: ["python", "app.py", "--replica_config_device", "a100"] - ports: - - containerPort: 8000 - env: - - name: DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app'] - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: MODEL_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['model.aibrix.ai/name'] ---- -# Debug only: Make sure pod can be visited from controller that deployed in mac. 
-apiVersion: v1 -kind: Service -metadata: - name: llama2-7b - namespace: aibrix-system -spec: - selector: - model.aibrix.ai/name: "llama2-7b" - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - nodePort: 30081 - type: NodePort ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pod-autoscaler - namespace: aibrix-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pod-reader - namespace: aibrix-system -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: read-pods - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: pod-reader - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: aibrix-system - name: deployment-reader -rules: - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: deployment-reader-binding - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: deployment-reader - apiGroup: rbac.authorization.k8s.io -# --- -# for test-purpose, if need to create HTTPRoute object manually -# apiVersion: gateway.networking.k8s.io/v1 -# kind: HTTPRoute -# metadata: -# name: llama2-7b-router -# namespace: aibrix-system -# spec: -# parentRefs: -# - name: aibrix-eg -# rules: -# - matches: -# - headers: -# - type: Exact -# name: model -# value: llama2-7b -# backendRefs: -# - name: llama2-7b -# port: 8000 ---- -# Pod autoscaler works with gpu-optimizer -apiVersion: autoscaling.aibrix.ai/v1alpha1 -kind: PodAutoscaler -metadata: - name: podautoscaler-simulator-llama2-7b-a100 - labels: - app.kubernetes.io/name: aibrix - app.kubernetes.io/managed-by: kustomize - namespace: aibrix-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: simulator-llama2-7b-a100 - minReplicas: 0 - maxReplicas: 10 - targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured - metricsSources: - - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 - path: /metrics/aibrix-system/simulator-llama2-7b-a100 - metric: "vllm:deployment_replicas" - targetValue: "1" - scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/simulator/deployment-a40.yaml b/development/simulator/deployment-a40.yaml deleted file mode 100644 index 7d172142..00000000 --- a/development/simulator/deployment-a40.yaml +++ /dev/null @@ -1,160 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simulator-llama2-7b-a40 - namespace: aibrix-system - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - model.aibrix.ai/port: "8000" -spec: - replicas: 1 - selector: - matchLabels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - template: - metadata: - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - app: "simulator-llama2-7b-a40" - spec: - serviceAccountName: pod-autoscaler - automountServiceAccountToken: true # Important! 
- containers: - - name: llmengine-simulator - image: aibrix/vllm-simulator-a40:nightly - command: ["python", "app.py", "--replica_config_device", "a40"] - ports: - - containerPort: 8000 - env: - - name: DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app'] - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: MODEL_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['model.aibrix.ai/name'] ---- -# Debug only: Make sure pod can be visited from controller that deployed in mac. -apiVersion: v1 -kind: Service -metadata: - name: llama2-7b - namespace: aibrix-system -spec: - selector: - model.aibrix.ai/name: "llama2-7b" - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - nodePort: 30081 - type: NodePort ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pod-autoscaler - namespace: aibrix-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pod-reader - namespace: aibrix-system -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: read-pods - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: pod-reader - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: aibrix-system - name: deployment-reader -rules: - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: deployment-reader-binding - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: deployment-reader - apiGroup: rbac.authorization.k8s.io -# --- -# for test-purpose, if need to create HTTPRoute object manually -# apiVersion: gateway.networking.k8s.io/v1 -# kind: HTTPRoute -# metadata: -# name: llama2-7b-router -# namespace: aibrix-system -# spec: -# parentRefs: -# - name: aibrix-eg -# rules: -# - matches: -# - headers: -# - type: Exact -# name: model -# value: llama2-7b -# backendRefs: -# - name: llama2-7b -# port: 8000 ---- -# Pod autoscaler works with gpu-optimizer -apiVersion: autoscaling.aibrix.ai/v1alpha1 -kind: PodAutoscaler -metadata: - name: podautoscaler-simulator-llama2-7b-a40 - labels: - app.kubernetes.io/name: aibrix - app.kubernetes.io/managed-by: kustomize - namespace: aibrix-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: simulator-llama2-7b-a40 - minReplicas: 0 - maxReplicas: 10 - targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured - metricsSources: - - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 - path: /metrics/aibrix-system/simulator-llama2-7b-a40 - metric: "vllm:deployment_replicas" - targetValue: "1" - scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/simulator/environment.yml b/development/simulator/environment.yml deleted file mode 100644 index 60f4b6d7..00000000 --- a/development/simulator/environment.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: simulator -channels: - - conda-forge - - plotly -dependencies: - - python>=3.10 - - setuptools - - pip - - numpy - - plotly_express - - jupyterlab - - matplotlib - - pyyaml - - snakeviz - - scikit-learn - - 
python-kaleido - - wandb - - fasteners - - ray-all - - streamlit - - randomname - - flask - - kubernetes - - transformers - - pip: - - kaleido - - ddsketch - - paretoset - - git+https://github.com/zhangjyr/vidur.git \ No newline at end of file diff --git a/development/simulator/requirements.txt b/development/simulator/requirements.txt deleted file mode 100644 index 7386688c..00000000 --- a/development/simulator/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -flask -kubernetes -numpy -pandas -scikit-learn -wandb -kaleido -ddsketch -plotly_express -matplotlib -seaborn -fasteners -transformers -git+https://github.com/zhangjyr/vidur.git \ No newline at end of file From f5e9abd2a1591fdedde37cae16f8e01c7edb57f2 Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Tue, 3 Dec 2024 16:56:48 -0800 Subject: [PATCH 2/3] Fix namespace. --- development/app/README.md | 9 +++++++++ .../config/templates/podautoscaler/podautoscaler.yaml | 1 + 2 files changed, 10 insertions(+) diff --git a/development/app/README.md b/development/app/README.md index c533fb20..6a6d86df 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -37,6 +37,15 @@ Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for hig docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . ``` +1.b (Optional) Load container image to docker context + +> Note: If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. +> Therefore, the following command is not necessary. Only kind user need this step. + +```shell +kind load docker-image aibrix/vllm-simulator:nightly +``` + 2. Deploy simulator model image ```shell kubectl create -k config/simulator diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml index 5f114688..945fb13c 100644 --- a/development/app/config/templates/podautoscaler/podautoscaler.yaml +++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml @@ -5,6 +5,7 @@ metadata: name: podautoscaler-mock-llama2-7b labels: app.kubernetes.io/name: aibrix + app.kubernetes.io/managed-by: kustomize namespace: default spec: scaleTargetRef: From 43d204c8f28b51176407dafdbc6d3347fa4ce287 Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Wed, 4 Dec 2024 11:54:39 -0800 Subject: [PATCH 3/3] Using model tokenizer if possible. Token is needed for gated model. 
Change build arg from GPU_TYPE to SIMULATION --- development/app/Dockerfile | 11 ++++++----- development/app/Makefile | 4 ++-- development/app/README.md | 5 +++++ development/app/app.py | 19 +++++++++++++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/development/app/Dockerfile b/development/app/Dockerfile index 314808ef..c5419e1f 100644 --- a/development/app/Dockerfile +++ b/development/app/Dockerfile @@ -21,16 +21,17 @@ RUN pip install --no-cache-dir -r requirements.txt COPY ./*.py /app/ ENV MODEL_NAME=llama2-7b -ARG GPU_TYPE=disabled +ENV HUGGINGFACE_TOKEN="your huggingface token" +ARG SIMULATION=disabled # Trigger profiling -RUN if [ "$GPU_TYPE" != "disabled" ]; then \ - python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE}; \ +RUN if [ "$SIMULATION" != "disabled" ]; then \ + python app.py --time_limit 1000 --replica_config_device ${SIMULATION}; \ fi # Expose the port the app runs on EXPOSE 8000 # Run the application, environment variable is necessary to apply ARG -ENV GPU_TYPE=$GPU_TYPE -CMD python app.py --replica_config_device ${GPU_TYPE} +ENV SIMULATION=$SIMULATION +CMD python app.py --replica_config_device ${SIMULATION} diff --git a/development/app/Makefile b/development/app/Makefile index 909a775c..42edae1b 100644 --- a/development/app/Makefile +++ b/development/app/Makefile @@ -4,10 +4,10 @@ docker-build-mock: docker build -t aibrix/vllm-mock:nightly -f Dockerfile . docker-build-simulator: - docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . + docker build -t aibrix/vllm-simulator:nightly --build-arg SIMULATION=a100 -f Dockerfile . docker-build-simulator-a40: - docker build -t aibrix/vllm-simulator-a40:nightly --build-arg GPU_TYPE=a40 -f Dockerfile . + docker build -t aibrix/vllm-simulator-a40:nightly --build-arg SIMULATION=a40 -f Dockerfile . docker-build: docker-build-mock diff --git a/development/app/README.md b/development/app/README.md index 6a6d86df..8f455006 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -32,6 +32,11 @@ kubectl delete -k config/mock ### Deploy the simulator app Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for high-fidelity vLLM simulation: +0. Configure the HuggingFace token for the model tokenizer by changing HUGGINGFACE_TOKEN in the Dockerfile +``` +ENV HUGGINGFACE_TOKEN="your huggingface token" +``` + 1. Build simulator base model image ```shell docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . 
diff --git a/development/app/app.py b/development/app/app.py index a2a420b6..c9b2b503 100644 --- a/development/app/app.py +++ b/development/app/app.py @@ -26,6 +26,7 @@ DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') NAMESPACE = os.getenv('POD_NAMESPACE', 'default') DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) +HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN', "your huggingface token") modelMaps = { "llama2-7b": "meta-llama/Llama-2-7b-hf", @@ -589,10 +590,20 @@ def metrics(): if gpu_device != "disabled": # Load the tokenizer for your model from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained( - 'bert-base-uncased', - model_max_length=16384, # Suppress warning - clean_up_tokenization_spaces=True) + default_model = 'bert-base-uncased' + try: + token_model = modelMaps.get(MODEL_NAME, default_model) + tokenizer = AutoTokenizer.from_pretrained( + token_model, + token=HUGGINGFACE_TOKEN, + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) + except Exception as e: + logger.error(f"Failed to initialize tokenizer, will use default tokenizer model: {e}") + tokenizer = AutoTokenizer.from_pretrained( + default_model, + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) simulator = Simulator(SimulationConfig.create_from_cli_args()) overrides = {
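With the series applied, a quick smoke test of the merged app is possible once the service is port-forwarded to localhost:8000 as in the Makefile targets. This is a sketch only: `max_tokens` bounds the simulated output length, `next_in` is the optional inter-arrival hint read by the completion handlers, and the prompt and values are illustrative.

```shell
# Request a simulated completion from the llama2-7b deployment (illustrative values).
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer any_key" \
  -d '{
    "model": "llama2-7b",
    "prompt": "Say this is a test",
    "max_tokens": 50,
    "next_in": 0.5
  }'
```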