From d081b3cc73708bac3c49522f33232b249dfdf30e Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Tue, 3 Dec 2024 16:54:10 -0800 Subject: [PATCH 1/3] Merge simulator to app --- development/app/Dockerfile | 20 +- development/{simulator => app}/Makefile | 62 ++-- development/app/README.md | 23 +- development/app/app.py | 266 ++++++++++---- .../config/heterogeneous/kustomization.yaml | 7 + .../simulator_a40/kustomization.yaml | 25 ++ .../simulator_a40/patch_deployment_a40.yaml | 25 ++ .../patch_podautoscaler_a40.yaml | 15 + .../simulator_a40/rename_deployment_a40.json | 7 + .../rename_podautoscaler_a40.json | 7 + .../mock/components.yaml} | 77 +--- .../app/config/mock/kustomization.yaml | 3 + .../app/config/simulator/kustomization.yaml | 25 ++ .../simulator/patch_deployment_a100.yaml | 26 ++ .../simulator/patch_podautoscaler_a100.yaml | 15 + .../simulator/rename_deployment_a100.json | 7 + .../simulator/rename_podautoscaler_a100.json | 7 + .../templates/deployment/deployment.yaml | 71 ++++ .../templates/deployment/kustomization.yaml | 2 + .../podautoscaler/kustomization.yaml | 2 + .../podautoscaler/podautoscaler.yaml | 22 ++ development/app/requirements.txt | 14 +- development/{simulator => app}/simulator.py | 0 development/simulator/Dockerfile | 33 -- development/simulator/README.md | 75 ---- development/simulator/app.py | 336 ------------------ development/simulator/deployment-a100.yaml | 161 --------- development/simulator/deployment-a40.yaml | 160 --------- development/simulator/environment.yml | 29 -- development/simulator/requirements.txt | 14 - 30 files changed, 557 insertions(+), 979 deletions(-) rename development/{simulator => app}/Makefile (93%) create mode 100644 development/app/config/heterogeneous/kustomization.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/kustomization.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml create mode 100644 development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json create mode 100644 development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json rename development/app/{deployment.yaml => config/mock/components.yaml} (50%) create mode 100644 development/app/config/mock/kustomization.yaml create mode 100644 development/app/config/simulator/kustomization.yaml create mode 100644 development/app/config/simulator/patch_deployment_a100.yaml create mode 100644 development/app/config/simulator/patch_podautoscaler_a100.yaml create mode 100644 development/app/config/simulator/rename_deployment_a100.json create mode 100644 development/app/config/simulator/rename_podautoscaler_a100.json create mode 100644 development/app/config/templates/deployment/deployment.yaml create mode 100644 development/app/config/templates/deployment/kustomization.yaml create mode 100644 development/app/config/templates/podautoscaler/kustomization.yaml create mode 100644 development/app/config/templates/podautoscaler/podautoscaler.yaml rename development/{simulator => app}/simulator.py (100%) delete mode 100644 development/simulator/Dockerfile delete mode 100644 development/simulator/README.md delete mode 100644 development/simulator/app.py delete mode 100644 development/simulator/deployment-a100.yaml delete mode 100644 development/simulator/deployment-a40.yaml delete mode 100644 development/simulator/environment.yml delete mode 100644 development/simulator/requirements.txt 
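For reviewers who want to try the restructured layout, the diffstat above boils down to three kustomize entry points under `development/app/config`. A minimal sketch of how they are exercised, based on the Makefile targets introduced in this patch (the service and gateway names are the ones used by those targets and may differ in other clusters):

```shell
# Mock engine only, no vidur simulation
kubectl create -k config/mock

# Single simulated A100 deployment
kubectl create -k config/simulator

# A100 plus A40 heterogeneous deployment
kubectl create -k config/heterogeneous

# Tear down whichever overlay was applied
kubectl delete -k config/mock   # or config/simulator / config/heterogeneous
```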
diff --git a/development/app/Dockerfile b/development/app/Dockerfile index cec8d075..314808ef 100644 --- a/development/app/Dockerfile +++ b/development/app/Dockerfile @@ -1,9 +1,10 @@ # Use the official Python base image -FROM python:3.9-slim +FROM python:3.10-slim # Set environment variables ENV PYTHONDONTWRITEBYTECODE=1 ENV PYTHONUNBUFFERED=1 +ENV WANDB_MODE=disabled # Set the working directory WORKDIR /app @@ -12,15 +13,24 @@ WORKDIR /app COPY requirements.txt /app/ # Install dependencies -RUN apt update && apt install -y curl jq +RUN apt update && apt install -y curl jq git RUN pip install --no-cache-dir -r requirements.txt # Copy the rest of the application code into the container -COPY . /app/ +COPY ./*.py /app/ + +ENV MODEL_NAME=llama2-7b +ARG GPU_TYPE=disabled + +# Trigger profiling +RUN if [ "$GPU_TYPE" != "disabled" ]; then \ + python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE}; \ + fi # Expose the port the app runs on EXPOSE 8000 -# Run the application -CMD ["python", "app.py"] +# Run the application, environment variable is necessary to apply ARG +ENV GPU_TYPE=$GPU_TYPE +CMD python app.py --replica_config_device ${GPU_TYPE} diff --git a/development/simulator/Makefile b/development/app/Makefile similarity index 93% rename from development/simulator/Makefile rename to development/app/Makefile index 2805f99d..909a775c 100644 --- a/development/simulator/Makefile +++ b/development/app/Makefile @@ -1,28 +1,55 @@ all: build -build-a100: - docker build -t aibrix/vllm-simulator:nightly -f Dockerfile . +docker-build-mock: + docker build -t aibrix/vllm-mock:nightly -f Dockerfile . -build-a40: +docker-build-simulator: + docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . + +docker-build-simulator-a40: docker build -t aibrix/vllm-simulator-a40:nightly --build-arg GPU_TYPE=a40 -f Dockerfile . 
-build: build-a100 +docker-build: docker-build-mock -deploy-a100: - kubectl apply -f deployment-a100.yaml +deploy-mock: + kubectl create -k config/mock + sleep 2 + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -deploy-a40: - kubectl apply -f deployment-a40.yaml +deploy-simulator: + kubectl create -k config/simulator + sleep 2 + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -deploy: deploy-a100 +deploy-heterogeneous: + kubectl create -k config/heterogeneous sleep 2 - kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & + kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & + +deploy: deploy-mock + +clean-mock: + kubectl delete -k config/mock + sleep 1 + curl http://localhost:8000/metrics + curl http://localhost:8888/metrics -clean: - kubectl delete -f deployment-a100.yaml - kubectl delete -f deployment-a40.yaml +clean-simulator: + kubectl delete -k config/simulator sleep 1 curl http://localhost:8000/metrics + curl http://localhost:8888/metrics + +clean-heterogeneous: + kubectl delete -k config/heterogeneous + sleep 1 + curl http://localhost:8000/metrics + curl http://localhost:8888/metrics + +clean: clean-mock test: curl http://localhost:8000/v1/chat/completions \ @@ -56,17 +83,8 @@ test-long: "max_tokens": 50 \ }' -init-local-gateway-call: - kubectl -n aibrix-system port-forward svc/aibrix-gateway-users 8090:8090 1>/dev/null 2>&1 & - kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & - sleep 1 - curl http://localhost:8090/CreateUser \ - -H "Content-Type: application/json" \ - -d '{"name": "your-user-name","rpm": 1000,"tpm": 100000}' - test-gateway: curl -v http://localhost:8888/v1/chat/completions \ - -H "user: your-user-name" \ -H "model: llama2-7b" \ -H "Content-Type: application/json" \ -H "Authorization: Bearer any_key" \ -d '{ diff --git a/development/app/README.md b/development/app/README.md index 3b87c2bf..c533fb20 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -13,7 +13,7 @@ docker build -t aibrix/vllm-mock:nightly -f Dockerfile . ``` -1. (Optional) Load container image to docker context +1.b (Optional) Load container image to docker context > Note: If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. > Therefore, the following command is not necessary. Only kind user need this step. @@ -22,12 +22,27 @@ docker build -t aibrix/vllm-mock:nightly -f Dockerfile . kind load docker-image aibrix/vllm-mock:nightly ``` -1. Deploy mocked model image +2. Deploy mocked model image ```shell -kubectl apply -f deployment.yaml +kubectl create -k config/mock # you can run following command to delete the deployment -kubectl delete -f deployment.yaml +kubectl delete -k config/mock +``` + +### Deploy the simulator app +Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for high-fidelity vLLM simulation: +1. Build simulator base model image +```shell +docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . +``` + +2. 
Deploy simulator model image +```shell +kubectl create -k config/simulator + +# you can run following command to delete the deployment +kubectl delete -k config/simulator ``` ### Test the metric invocation diff --git a/development/app/app.py b/development/app/app.py index 07e2aec4..a2a420b6 100644 --- a/development/app/app.py +++ b/development/app/app.py @@ -2,23 +2,54 @@ from werkzeug import serving import random import re +import logging +import sys import time +from datetime import datetime from random import randint import os +from typing import Optional try: from kubernetes import client, config except Exception as e: print(f"Failed to import kubernetes, skip: {e}") +from simulator import Simulator +from vidur.config import SimulationConfig +from vidur.entities import Request + # Global storage for overridden values overrides = {} -MODEL_NAME = 'llama2-70b' +MODEL_NAME = os.getenv('MODEL_NAME', 'llama2-70b') DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') NAMESPACE = os.getenv('POD_NAMESPACE', 'default') DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) +modelMaps = { + "llama2-7b": "meta-llama/Llama-2-7b-hf", + "llama2-70b": "meta-llama/Llama-2-70b-hf" +} +sys.argv.append(f"--replica_config_model_name={modelMaps.get(MODEL_NAME, MODEL_NAME)}") + +tokenizer = None +simulator: Optional[Simulator] = None + +logger = logging.getLogger(__name__) + +def get_token_count(text): + try: + # Encode the text + encoded_input = tokenizer(text) + + # Get the number of tokens + return len(encoded_input['input_ids']) + except Exception as e: + logger.error(f"Failed to get number of tokens: {e}") + + return 1 + models = [ { "id": "meta-llama/Llama-2-7b-hf", @@ -127,73 +158,128 @@ def unload_model(): @app.route('/v1/completions', methods=['POST']) def completion(): - prompt = request.json.get('prompt') - model = request.json.get('model') - if not prompt or not model: - return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 - - prompt_tokens = randint(1, 100) - completion_tokens = randint(1, 100) - - # Simulated response - response = { - "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", - "object": "text_completion", - "created": 1589478378, - "model": model, - "system_fingerprint": "fp_44709d6fcb", - "choices": [ - { - "text": f"This is indeed a test from model {model}!", - "index": 0, - "logprobs": None, - "finish_reason": "length" + try: + prompt = request.json.get('prompt') + model = request.json.get('model') + max_tokens = request.json.get('max_tokens') + if not prompt or not model: + return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 + + arrived_at = datetime.now().timestamp() + input_tokens = get_token_count(prompt) + output_tokens = max_tokens if max_tokens else randint(10, 500) + arrived_next = request.json.get('next_in') + if not arrived_next: + arrived_next = 0.0 + else: + arrived_next += arrived_at + + start = datetime.now().timestamp() + latency = 0.0 + if simulator is not None: + latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) + + # Simulated response + response = { + "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", + "object": "text_completion", + "created": int(arrived_at), + "model": model, + "system_fingerprint": "fp_44709d6fcb", + "choices": [ + { + "text": f"This is simulated message from {model}!", + "index": 0, + "logprobs": None, + "finish_reason": "length" + } + ], + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + 
"total_tokens": input_tokens + output_tokens, + "time": latency } - ], - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens } - } - return jsonify(response), 200 + overhead = datetime.now().timestamp()-start + if latency > overhead: + time.sleep(latency-overhead) + elif latency > 0.0: + logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") + + return jsonify(response), 200 + except Exception as e: + err = { + "error": { + "message": f"The server had an error while processing your request: {e}", + "type": "server_error" + } + } + return jsonify(err), 500 @app.route('/v1/chat/completions', methods=['POST']) def chat_completions(): - messages = request.json.get('messages') - model = request.json.get('model') - if not messages or not model: - return jsonify({"status": "error", "message": "Messages and model are required"}), 400 - - prompt_tokens = randint(1, 100) - completion_tokens = randint(1, 100) - - # Simulated response - response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": 1677858242, - "model": model, - "usage": { - "prompt_tokens": prompt_tokens, - "completion_tokens": completion_tokens, - "total_tokens": prompt_tokens + completion_tokens - }, - "choices": [ - { - "message": { - "role": "assistant", - "content": f"\n\nThis is a test from{model}!" - }, - "logprobs": None, - "finish_reason": "stop", - "index": 0 - } - ] - } - return jsonify(response), 200 + try: + messages = request.json.get('messages') + model = request.json.get('model') + max_tokens = request.json.get('max_tokens') + if not messages or not model: + return jsonify({"status": "error", "message": "Messages and model are required"}), 400 + + arrived_at = datetime.now().timestamp() + input_tokens = sum(get_token_count(message["content"]) for message in messages) + output_tokens = max_tokens if max_tokens else randint(10, 500) + arrived_next = request.json.get('next_in') + if not arrived_next: + arrived_next = 0.0 + else: + arrived_next += arrived_at + + start = datetime.now().timestamp() + latency = 0.0 + if simulator is not None: + latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) + + # Simulated response + response = { + "id": "chatcmpl-abc123", + "object": "chat.completion", + "created": int(arrived_at), + "model": model, + "usage": { + "prompt_tokens": input_tokens, + "completion_tokens": output_tokens, + "total_tokens": input_tokens + output_tokens, + "time": latency + }, + "choices": [ + { + "message": { + "role": "assistant", + "content": f"\n\nThis is simulated message from {model}!" + }, + "logprobs": None, + "finish_reason": "stop", + "index": 0 + } + ] + } + overhead = datetime.now().timestamp()-start + if latency > overhead: + time.sleep(latency-overhead) + else: + logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") + return jsonify(response), 200 + except Exception as e: + err = { + "error": { + "message": f"The server had an error while processing your request: {e}", + "type": "server_error" + } + } + return jsonify(err), 500 @app.route('/set_metrics', methods=['POST']) def set_metrics(): @@ -481,11 +567,57 @@ def metrics(): if __name__ == '__main__': + logging.basicConfig(level=logging.DEBUG) + logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR) # Suppress kubenetes logs + + print(f"Starting app. 
DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}") + + # Extract gpu_device without call argparse + gpu_device = "disabled" try: - # config.load_kube_config() - config.load_incluster_config() - except Exception as e: - print(f"Failed to load k8s config: {e}") + index = sys.argv.index("--replica_config_device") + if index + 1 < len(sys.argv): + gpu_device = sys.argv[index + 1] + except ValueError: + pass + + # Restore -h functionality + if '-h' in sys.argv: + SimulationConfig.create_from_cli_args() + + # Launch simulator + if gpu_device != "disabled": + # Load the tokenizer for your model + from transformers import AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained( + 'bert-base-uncased', + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) + + simulator = Simulator(SimulationConfig.create_from_cli_args()) + overrides = { + "total": 100.0, + "running": 0, + "waiting": 0, + "swapped": 0 + } + + thread = None + if simulator is not None: + thread = simulator.start() + + # Perform profiling and skip actual run + if '--time_limit' not in sys.argv: + try: + # config.load_kube_config() + config.load_incluster_config() + except Exception as e: + print(f"Failed to load k8s config: {e}") + + app.run(host='0.0.0.0', port=8000) + + if simulator is not None: + simulator.stop() - print(f"Starting app. DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}") - app.run(host='0.0.0.0', port=8000) + if thread is not None: + thread.join() \ No newline at end of file diff --git a/development/app/config/heterogeneous/kustomization.yaml b/development/app/config/heterogeneous/kustomization.yaml new file mode 100644 index 00000000..91c78376 --- /dev/null +++ b/development/app/config/heterogeneous/kustomization.yaml @@ -0,0 +1,7 @@ +kind: Kustomization + +resources: +- ../simulator +- simulator_a40 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/heterogeneous/simulator_a40/kustomization.yaml b/development/app/config/heterogeneous/simulator_a40/kustomization.yaml new file mode 100644 index 00000000..e593f011 --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/kustomization.yaml @@ -0,0 +1,25 @@ +kind: Kustomization + +resources: +- ../../templates/deployment +- ../../templates/podautoscaler + +patches: +- path: rename_deployment_a40.json + target: + kind: Deployment + name: mock-llama2-7b +- path: rename_podautoscaler_a40.json + target: + kind: PodAutoscaler + name: podautoscaler-mock-llama2-7b +- path: patch_deployment_a40.yaml + target: + kind: Deployment + name: mock-llama2-7b +- path: patch_podautoscaler_a40.yaml + target: + kind: PodAutoscaler + name: podautoscaler-simulator-llama2-7b-a40 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml new file mode 100644 index 00000000..5daa884c --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/patch_deployment_a40.yaml @@ -0,0 +1,25 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simulator-llama2-7b-a40 + labels: + model.aibrix.ai/name: "llama2-7b" +spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + model.aibrix.ai/name: "llama2-7b" + app: "simulator-llama2-7b-a40" + spec: + containers: + - name: llm-engine + image: aibrix/vllm-simulator-a40:nightly + env: + - name: MODEL_NAME + 
valueFrom: + fieldRef: + fieldPath: metadata.labels['model.aibrix.ai/name'] \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml new file mode 100644 index 00000000..93b2d37d --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/patch_podautoscaler_a40.yaml @@ -0,0 +1,15 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-simulator-llama2-7b-a40 +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: simulator-llama2-7b-a40 + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b-a40 + metric: "vllm:deployment_replicas" + targetValue: "1" \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json b/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json new file mode 100644 index 00000000..bf964cbd --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/rename_deployment_a40.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "simulator-llama2-7b-a40" + } +] \ No newline at end of file diff --git a/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json b/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json new file mode 100644 index 00000000..99d1ed4d --- /dev/null +++ b/development/app/config/heterogeneous/simulator_a40/rename_podautoscaler_a40.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "podautoscaler-simulator-llama2-7b-a40" + } +] \ No newline at end of file diff --git a/development/app/deployment.yaml b/development/app/config/mock/components.yaml similarity index 50% rename from development/app/deployment.yaml rename to development/app/config/mock/components.yaml index 7e4fc772..e66c2f8e 100644 --- a/development/app/deployment.yaml +++ b/development/app/config/mock/components.yaml @@ -1,75 +1,8 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: llama2-70b - namespace: default - labels: - model.aibrix.ai/name: "llama2-70b" - model.aibrix.ai/port: "8000" - adapter.model.aibrix.ai/enabled: "true" -spec: - replicas: 3 - selector: - matchLabels: - adapter.model.aibrix.ai/enabled: "true" - model.aibrix.ai/name: "llama2-70b" - template: - metadata: - labels: - adapter.model.aibrix.ai/enabled: "true" - model.aibrix.ai/name: "llama2-70b" - spec: - serviceAccountName: mocked-app-sa - containers: - - name: llm-engine - image: aibrix/vllm-mock:nightly - ports: - - containerPort: 8000 - env: - - name: POD_NAME - valueFrom: - fieldRef: - fieldPath: metadata.name - - name: POD_NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: aibrix-runtime - image: aibrix/runtime:nightly - command: - - aibrix_runtime - - --port - - "8080" - env: - - name: INFERENCE_ENGINE - value: vllm - - name: INFERENCE_ENGINE_ENDPOINT - value: http://localhost:8000 - ports: - - containerPort: 8080 - protocol: TCP - livenessProbe: - httpGet: - path: /healthz - port: 8080 - initialDelaySeconds: 3 - periodSeconds: 2 - readinessProbe: - httpGet: - path: /ready - port: 8080 - initialDelaySeconds: 5 - periodSeconds: 10 ---- # Debug only: Make sure pod can be 
visited from controller that deployed in mac. apiVersion: v1 kind: Service metadata: - name: llama2-70b + name: llama2-7b namespace: default labels: prometheus-discovery: "true" @@ -79,7 +12,7 @@ metadata: prometheus.io/port: "8000" spec: selector: - model.aibrix.ai/name: "llama2-70b" + model.aibrix.ai/name: "llama2-7b" ports: - protocol: TCP name: metrics @@ -146,7 +79,7 @@ roleRef: # apiVersion: gateway.networking.k8s.io/v1 # kind: HTTPRoute # metadata: -# name: llama2-70b-router +# name: llama2-7b-router # namespace: default # spec: # parentRefs: @@ -156,7 +89,7 @@ roleRef: # - headers: # - type: Exact # name: model -# value: llama2-70b +# value: llama2-7b # backendRefs: -# - name: llama2-70b +# - name: llama2-7b # port: 8000 \ No newline at end of file diff --git a/development/app/config/mock/kustomization.yaml b/development/app/config/mock/kustomization.yaml new file mode 100644 index 00000000..e0947b6f --- /dev/null +++ b/development/app/config/mock/kustomization.yaml @@ -0,0 +1,3 @@ +resources: + - ../templates/deployment + - components.yaml diff --git a/development/app/config/simulator/kustomization.yaml b/development/app/config/simulator/kustomization.yaml new file mode 100644 index 00000000..6dff0bb1 --- /dev/null +++ b/development/app/config/simulator/kustomization.yaml @@ -0,0 +1,25 @@ +kind: Kustomization + +resources: +- ../mock +- ../templates/podautoscaler + +patches: +- path: rename_deployment_a100.json + target: + kind: Deployment + name: mock-llama2-7b +- path: rename_podautoscaler_a100.json + target: + kind: PodAutoscaler + name: podautoscaler-mock-llama2-7b +- path: patch_deployment_a100.yaml + target: + kind: Deployment + name: simulator-llama2-7b-a100 +- path: patch_podautoscaler_a100.yaml + target: + kind: PodAutoscaler + name: podautoscaler-simulator-llama2-7b-a100 + +apiVersion: kustomize.config.k8s.io/v1beta1 diff --git a/development/app/config/simulator/patch_deployment_a100.yaml b/development/app/config/simulator/patch_deployment_a100.yaml new file mode 100644 index 00000000..01b38a96 --- /dev/null +++ b/development/app/config/simulator/patch_deployment_a100.yaml @@ -0,0 +1,26 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: simulator-llama2-7b-a100 + labels: + model.aibrix.ai/name: "llama2-7b" + model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads. 
+spec: + replicas: 1 + selector: + matchLabels: + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + model.aibrix.ai/name: "llama2-7b" + app: "simulator-llama2-7b-a100" + spec: + containers: + - name: llm-engine + image: aibrix/vllm-simulator:nightly + env: + - name: MODEL_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['model.aibrix.ai/name'] \ No newline at end of file diff --git a/development/app/config/simulator/patch_podautoscaler_a100.yaml b/development/app/config/simulator/patch_podautoscaler_a100.yaml new file mode 100644 index 00000000..c04a2c19 --- /dev/null +++ b/development/app/config/simulator/patch_podautoscaler_a100.yaml @@ -0,0 +1,15 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-simulator-llama2-7b-a100 +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: simulator-llama2-7b-a100 + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b-a100 + metric: "vllm:deployment_replicas" + targetValue: "1" \ No newline at end of file diff --git a/development/app/config/simulator/rename_deployment_a100.json b/development/app/config/simulator/rename_deployment_a100.json new file mode 100644 index 00000000..7788ba4f --- /dev/null +++ b/development/app/config/simulator/rename_deployment_a100.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "simulator-llama2-7b-a100" + } +] \ No newline at end of file diff --git a/development/app/config/simulator/rename_podautoscaler_a100.json b/development/app/config/simulator/rename_podautoscaler_a100.json new file mode 100644 index 00000000..fc58fce7 --- /dev/null +++ b/development/app/config/simulator/rename_podautoscaler_a100.json @@ -0,0 +1,7 @@ +[ + { + "op": "replace", + "path": "/metadata/name", + "value": "podautoscaler-simulator-llama2-7b-a100" + } +] \ No newline at end of file diff --git a/development/app/config/templates/deployment/deployment.yaml b/development/app/config/templates/deployment/deployment.yaml new file mode 100644 index 00000000..153d2c17 --- /dev/null +++ b/development/app/config/templates/deployment/deployment.yaml @@ -0,0 +1,71 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: mock-llama2-7b + namespace: default + labels: + model.aibrix.ai/name: "llama2-7b" + model.aibrix.ai/port: "8000" + adapter.model.aibrix.ai/enabled: "true" +spec: + replicas: 3 + selector: + matchLabels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "llama2-7b" + template: + metadata: + labels: + adapter.model.aibrix.ai/enabled: "true" + model.aibrix.ai/name: "llama2-7b" + app: "mock-llama2-7b" + spec: + serviceAccountName: mocked-app-sa + containers: + - name: llm-engine + image: aibrix/vllm-mock:nightly + ports: + - containerPort: 8000 + env: + - name: DEPLOYMENT_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['app'] + - name: POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: aibrix-runtime + image: aibrix/runtime:nightly + command: + - aibrix_runtime + - --port + - "8080" + env: + - name: INFERENCE_ENGINE + value: vllm + - name: INFERENCE_ENGINE_ENDPOINT + value: http://localhost:8000 + ports: + - containerPort: 8080 + protocol: TCP + livenessProbe: + httpGet: + path: /healthz + port: 
8080 + initialDelaySeconds: 3 + periodSeconds: 2 + readinessProbe: + httpGet: + path: /ready + port: 8080 + initialDelaySeconds: 5 + periodSeconds: 10 \ No newline at end of file diff --git a/development/app/config/templates/deployment/kustomization.yaml b/development/app/config/templates/deployment/kustomization.yaml new file mode 100644 index 00000000..9519a26d --- /dev/null +++ b/development/app/config/templates/deployment/kustomization.yaml @@ -0,0 +1,2 @@ +resources: + - deployment.yaml diff --git a/development/app/config/templates/podautoscaler/kustomization.yaml b/development/app/config/templates/podautoscaler/kustomization.yaml new file mode 100644 index 00000000..77628acc --- /dev/null +++ b/development/app/config/templates/podautoscaler/kustomization.yaml @@ -0,0 +1,2 @@ +resources: + - podautoscaler.yaml diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml new file mode 100644 index 00000000..5f114688 --- /dev/null +++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml @@ -0,0 +1,22 @@ +# Pod autoscaler works with gpu-optimizer +apiVersion: autoscaling.aibrix.ai/v1alpha1 +kind: PodAutoscaler +metadata: + name: podautoscaler-mock-llama2-7b + labels: + app.kubernetes.io/name: aibrix + namespace: default +spec: + scaleTargetRef: + apiVersion: apps/v1 + kind: Deployment + name: mock-llama2-7b + minReplicas: 0 + maxReplicas: 10 + targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured + metricsSources: + - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 + path: /metrics/default/simulator-llama2-7b + metric: "vllm:deployment_replicas" + targetValue: "1" + scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/app/requirements.txt b/development/app/requirements.txt index 13814d63..7386688c 100644 --- a/development/app/requirements.txt +++ b/development/app/requirements.txt @@ -1,2 +1,14 @@ flask -kubernetes \ No newline at end of file +kubernetes +numpy +pandas +scikit-learn +wandb +kaleido +ddsketch +plotly_express +matplotlib +seaborn +fasteners +transformers +git+https://github.com/zhangjyr/vidur.git \ No newline at end of file diff --git a/development/simulator/simulator.py b/development/app/simulator.py similarity index 100% rename from development/simulator/simulator.py rename to development/app/simulator.py diff --git a/development/simulator/Dockerfile b/development/simulator/Dockerfile deleted file mode 100644 index a0777710..00000000 --- a/development/simulator/Dockerfile +++ /dev/null @@ -1,33 +0,0 @@ -# Use the official Python base image -FROM python:3.10-slim - -# Set environment variables -ENV PYTHONDONTWRITEBYTECODE=1 -ENV PYTHONUNBUFFERED=1 -ENV WANDB_MODE=disabled - -# Set the working directory -WORKDIR /simulator - -# Copy the requirements file into the container -COPY requirements.txt /simulator/ - -# Install dependencies -RUN apt update && apt install -y curl jq git - -RUN pip install --no-cache-dir -r requirements.txt - -# Copy the rest of the application code into the container -COPY ./*.py /simulator/ -# COPY ./model_cache /simulator/model_cache - -ENV MODEL_NAME=llama2-7b -ARG GPU_TYPE=a100 - # Trigger profiling -RUN python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE} - -# Expose the port the app runs on -EXPOSE 8000 - -# Run the application -CMD ["python", "app.py"] diff --git a/development/simulator/README.md b/development/simulator/README.md deleted file mode 100644 
index b337a053..00000000 --- a/development/simulator/README.md +++ /dev/null @@ -1,75 +0,0 @@ -# vLLM application simulator - -## Run locally - -Ensure that you have Python 3.10 installed on your system. Refer https://www.bitecode.dev/p/installing-python-the-bare-minimum -Create a virtual environment using venv module using python3.10 -m venv .venv -Activate the virtual environment using source .venv/bin/activate -Install the dependencies using python -m pip install -r requirements.txt -Run python app.py to start the server. -Run deactivate to deactivate the virtual environment - -## Run in kubernetes - -1. Build simulated base model image -```dockerfile -docker build -t aibrix/vllm-simulator:nightly -f Dockerfile . - -# If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. -# Therefore, the following command is not necessary. -kind load docker-image aibrix/vllm-simulator:nightly -``` - -2. Deploy simulated model image -```shell -kubectl apply -f docs/development/simulator/deployment.yaml -kubectl -n aibrix-system port-forward svc/llama2-7b 8000:8000 1>/dev/null 2>&1 & -``` - -## Test python app separately - -```shell -curl http://localhost:8000/v1/chat/completions \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer any_key" \ - -d '{ - "model": "llama2-7b", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' -``` - -```shell -kubectl delete -f docs/development/simulator/deployment.yaml -``` - -## Test with envoy gateway - -Add User: - - -Port forward to the User and Envoy service: -```shell -kubectl -n aibrix-system port-forward svc/aibrix-gateway-users 8090:8090 1>/dev/null 2>&1 & -kubectl -n envoy-gateway-system port-forward service/envoy-aibrix-system-aibrix-eg-903790dc 8888:80 1>/dev/null 2>&1 & -``` - -Add User -```shell -curl http://localhost:8090/CreateUser \ - -H "Content-Type: application/json" \ - -d '{"name": "your-user-name","rpm": 100,"tpm": 1000}' -``` - -Test request (ensure header model name matches with deployment's model name for routing) -```shell -curl -v http://localhost:8888/v1/chat/completions \ - -H "user: your-user-name" \ - -H "model: llama2-7b" \ - -H "Content-Type: application/json" \ - -H "Authorization: Bearer any_key" \ - -d '{ - "model": "llama2-7b", - "messages": [{"role": "user", "content": "Say this is a test!"}], - "temperature": 0.7 - }' & \ No newline at end of file diff --git a/development/simulator/app.py b/development/simulator/app.py deleted file mode 100644 index 604f10a7..00000000 --- a/development/simulator/app.py +++ /dev/null @@ -1,336 +0,0 @@ -import logging -import os -import sys -import time -from datetime import datetime -from random import randint - -from flask import Flask, Response, jsonify, request - -try: - from kubernetes import client, config -except Exception as e: - print(f"Failed to import kubernetes, skip: {e}") - -from simulator import Simulator -from transformers import AutoTokenizer -from vidur.config import SimulationConfig -from vidur.config_optimizer.config_explorer.config import ModelConfig -from vidur.entities import Request - -MODEL_NAME = os.getenv('MODEL_NAME', 'llama2-70b') -DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') -NAMESPACE = os.getenv('NAMESPACE', 'default') -DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) - -# Load the tokenizer for your model -tokenizer = AutoTokenizer.from_pretrained( - 'bert-base-uncased', - model_max_length=16384, # Suppress warning - 
clean_up_tokenization_spaces=True) - -app = Flask(__name__) -modelMaps = { - "llama2-7b": "meta-llama/Llama-2-7b-hf", - "llama2-70b": "meta-llama/Llama-2-70b-hf" -} -sys.argv.append(f"--replica_config_model_name={modelMaps.get(MODEL_NAME, MODEL_NAME)}") -simulator_config: SimulationConfig = SimulationConfig.create_from_cli_args() -simulator = Simulator(simulator_config) -v1 = None - -# Global storage for overridden values -overrides = {} - -logger = logging.getLogger(__name__) - -def get_token_count(text): - try: - # Encode the text - encoded_input = tokenizer(text) - - # Get the number of tokens - return len(encoded_input['input_ids']) - except Exception as e: - logger.error(f"Failed to get number of tokens: {e}") - - return 1 - -models = [ - { - "id": "meta-llama/Llama-2-7b-hf", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "meta-llama/Llama-2-7b-hf", - "parent": None, - "permission": [ - { - "id": "modelperm-cb1adf4457b2417e8c7770aadcffe4cc", - "object": "model_permission", - "created": 1715644056, - "allow_create_engine": False, - "allow_sampling": True, - "allow_logprobs": True, - "allow_search_indices": False, - "allow_view": True, - "allow_fine_tuning": False, - "organization": "*", - "group": None, - "is_blocking": False - } - ] - }, - { - "id": "startup-default-lora", - "object": "model", - "created": 1715644056, - "owned_by": "vllm", - "root": "meta-llama/Llama-2-7b-hf", - "parent": None, - "permission": [ - { - "id": "modelperm-6a01d79e4d0e452b94d52d2c2e8c8562", - "object": "model_permission", - "created": 1715644056, - "allow_create_engine": False, - "allow_sampling": True, - "allow_logprobs": True, - "allow_search_indices": False, - "allow_view": True, - "allow_fine_tuning": False, - "organization": "*", - "group": None, - "is_blocking": False - } - ] - } -] - -@app.route('/v1/models', methods=['GET']) -def get_models(): - return jsonify({ - "object": "list", - "data": models - }) - -@app.route('/v1/load_lora_adapter', methods=['POST']) -def load_model(): - lora_name = request.json.get('lora_name') - # Check if the model already exists - if any(model['id'] == lora_name for model in models): - return jsonify({"status": "success", "message": "Model already loaded"}), 200 - - new_model = { - 'id': lora_name, - 'created': int(time.time()), - 'object': "model", - 'owned_by': "vllm", - 'parent': None, - 'root': request.json.get('lora_path') - } - - models.append(new_model) - return jsonify({"status": "success", "message": "Model loaded successfully"}), 200 - - -@app.route('/v1/unload_lora_adapter', methods=['POST']) -def unload_model(): - model_id = request.json.get('lora_name') - global models - models = [model for model in models if model['id'] != model_id] - return jsonify({"status": "success", "message": "Model unloaded successfully"}), 200 - - -@app.route('/v1/completions', methods=['POST']) -def completion(): - try: - prompt = request.json.get('prompt') - model = request.json.get('model') - max_tokens = request.json.get('max_tokens') - if not prompt or not model: - return jsonify({"status": "error", "message": "Prompt and model are required"}), 400 - - arrived_at = datetime.now().timestamp() - input_tokens = get_token_count(prompt) - output_tokens = max_tokens if max_tokens else randint(10, 500) - arrived_next = request.json.get('next_in') - if not arrived_next: - arrived_next = 0.0 - else: - arrived_next += arrived_at - - start = datetime.now().timestamp() - latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, 
arrived_next=arrived_next)) - - # Simulated response - response = { - "id": "cmpl-uqkvlQyYK7bGYrRHQ0eXlWi7", - "object": "text_completion", - "created": int(arrived_at), - "model": model, - "system_fingerprint": "fp_44709d6fcb", - "choices": [ - { - "text": f"This is simulated message from {model}!", - "index": 0, - "logprobs": None, - "finish_reason": "length" - } - ], - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": input_tokens + output_tokens, - "time": latency - } - } - overhead = datetime.now().timestamp()-start - if latency > overhead: - time.sleep(latency-overhead) - else: - logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") - - return jsonify(response), 200 - except Exception as e: - import traceback - traceback.print_exc() - - -@app.route('/v1/chat/completions', methods=['POST']) -def chat_completions(): - messages = request.json.get('messages') - model = request.json.get('model') - max_tokens = request.json.get('max_tokens') - if not messages or not model: - return jsonify({"status": "error", "message": "Messages and model are required"}), 400 - - arrived_at = datetime.now().timestamp() - input_tokens = sum(get_token_count(message["content"]) for message in messages) - output_tokens = max_tokens if max_tokens else randint(10, 500) - arrived_next = request.json.get('next_in') - if not arrived_next: - arrived_next = 0.0 - else: - arrived_next += arrived_at - - start = datetime.now().timestamp() - latency = simulator.execute(Request(arrived_at, input_tokens, output_tokens, arrived_next=arrived_next)) - - # Simulated response - response = { - "id": "chatcmpl-abc123", - "object": "chat.completion", - "created": int(arrived_at), - "model": model, - "usage": { - "prompt_tokens": input_tokens, - "completion_tokens": output_tokens, - "total_tokens": input_tokens + output_tokens, - "time": latency - }, - "choices": [ - { - "message": { - "role": "assistant", - "content": f"\n\nThis is simulated message from {model}!" 
- }, - "logprobs": None, - "finish_reason": "stop", - "index": 0 - } - ] - } - overhead = datetime.now().timestamp()-start - if latency > overhead: - time.sleep(latency-overhead) - else: - logger.warning(f"Latency is less than overhead: L{latency} - O{overhead}") - return jsonify(response), 200 - -@app.route('/set_metrics', methods=['POST']) -def set_metrics(): - global overrides - # Get JSON data from the request - data = request.json - if data: - # Update overrides with new key-value pairs - overrides.update(data) - return {"status": "success", "message": "Overrides updated"}, 200 - else: - return {"status": "error", "message": "No data provided"}, 400 - -@app.route('/metrics') -def metrics(): - # get deployment information - try: - apps_v1 = client.AppsV1Api() - resp = apps_v1.read_namespaced_deployment(DEPLOYMENT_NAME, NAMESPACE) - replicas = resp.spec.replicas if resp.spec.replicas is not None else 1 - except Exception as e: - print(f"Failed to get deployment information: {DEPLOYMENT_NAME=} {NAMESPACE=} {e=}, set replicas to {DEFAULT_REPLICAS}") - replicas = DEFAULT_REPLICAS - - # a reasonable mock total value - total = overrides.get("total", 0) - model_name = overrides.get("model_name", MODEL_NAME) - # calculate metrics with potential overrides - success_total = overrides.get("success_total", total / replicas) - avg_prompt_throughput = overrides.get("avg_prompt_throughput", total / replicas if replicas > 0 else 0) - avg_generation_throughput = overrides.get("avg_generation_throughput", total / replicas if replicas > 0 else 0) - running = overrides.get("running", 0) - waiting = overrides.get("waiting", 0) - swapped = overrides.get("swapped", 0) - max_running_capacity = 100 - gpu_cache_usage_perc = overrides.get("gpu_cache_usage_perc", min(100.0, (running / max_running_capacity) * 100)) - - # construct Prometheus-style Metrics - metrics_output = f"""# HELP vllm:request_success_total Count of successfully processed requests. -# TYPE vllm:request_success_total counter -vllm:request_success_total{{finished_reason="stop",model_name="{model_name}"}} {success_total} -# HELP vllm:num_requests_running Number of requests currently running on GPU. -# TYPE vllm:num_requests_running gauge -vllm:num_requests_running{{model_name="{model_name}"}} {running} -# HELP vllm:num_requests_swapped Number of requests swapped to CPU. -# TYPE vllm:num_requests_swapped gauge -vllm:num_requests_swapped{{model_name="{model_name}"}} {swapped} -# HELP vllm:num_requests_waiting Number of requests waiting to be processed. -# TYPE vllm:num_requests_waiting gauge -vllm:num_requests_waiting{{model_name="{model_name}"}} {waiting} -# HELP vllm:avg_prompt_throughput_toks_per_s Average prefill throughput in tokens/s. -# TYPE vllm:avg_prompt_throughput_toks_per_s gauge -vllm:avg_prompt_throughput_toks_per_s{{model_name="{model_name}"}} {avg_prompt_throughput} -# HELP vllm:avg_generation_throughput_toks_per_s Average generation throughput in tokens/s. -# TYPE vllm:avg_generation_throughput_toks_per_s gauge -vllm:avg_generation_throughput_toks_per_s{{model_name="{model_name}"}} {avg_generation_throughput} -# HELP vllm:gpu_cache_usage_perc GPU KV-cache usage. 1 means 100 percent usage. 
-# TYPE vllm:gpu_cache_usage_perc gauge -vllm:gpu_cache_usage_perc{{model_name="model_name"}} {gpu_cache_usage_perc} -""" - return Response(metrics_output, mimetype='text/plain') - -if __name__ == '__main__': - logging.basicConfig(level=logging.DEBUG) - logging.getLogger("kubernetes.client.rest").setLevel(logging.ERROR) # Suppress kubenetes logs - - print(f"Starting app. DEPLOYMENT_NAME: {DEPLOYMENT_NAME}, NAMESPACE: {NAMESPACE}, MODEL: {MODEL_NAME}") - - thread = simulator.start() - - import sys - if '--time_limit' not in sys.argv: - try: - # config.load_kube_config() - config.load_incluster_config() - except Exception as e: - print(f"Failed to load k8s config: {e}") - - # Perform profiling and skip actual run - app.run(host='0.0.0.0', port=8000) - - # latency = simulator.execute(Request(0, 25, 100)) - # print(f"request latency: {latency}") - - simulator.stop() - - thread.join() diff --git a/development/simulator/deployment-a100.yaml b/development/simulator/deployment-a100.yaml deleted file mode 100644 index 8436f5ef..00000000 --- a/development/simulator/deployment-a100.yaml +++ /dev/null @@ -1,161 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simulator-llama2-7b-a100 - namespace: aibrix-system - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - model.aibrix.ai/port: "8000" - model.aibrix.ai/min_replicas: "1" # min replica for gpu optimizer when no workloads. -spec: - replicas: 1 - selector: - matchLabels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - template: - metadata: - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - app: "simulator-llama2-7b-a100" - spec: - serviceAccountName: pod-autoscaler - automountServiceAccountToken: true # Important! - containers: - - name: llmengine-simulator - image: aibrix/vllm-simulator:nightly - command: ["python", "app.py", "--replica_config_device", "a100"] - ports: - - containerPort: 8000 - env: - - name: DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app'] - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: MODEL_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['model.aibrix.ai/name'] ---- -# Debug only: Make sure pod can be visited from controller that deployed in mac. 
-apiVersion: v1 -kind: Service -metadata: - name: llama2-7b - namespace: aibrix-system -spec: - selector: - model.aibrix.ai/name: "llama2-7b" - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - nodePort: 30081 - type: NodePort ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pod-autoscaler - namespace: aibrix-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pod-reader - namespace: aibrix-system -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: read-pods - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: pod-reader - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: aibrix-system - name: deployment-reader -rules: - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: deployment-reader-binding - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: deployment-reader - apiGroup: rbac.authorization.k8s.io -# --- -# for test-purpose, if need to create HTTPRoute object manually -# apiVersion: gateway.networking.k8s.io/v1 -# kind: HTTPRoute -# metadata: -# name: llama2-7b-router -# namespace: aibrix-system -# spec: -# parentRefs: -# - name: aibrix-eg -# rules: -# - matches: -# - headers: -# - type: Exact -# name: model -# value: llama2-7b -# backendRefs: -# - name: llama2-7b -# port: 8000 ---- -# Pod autoscaler works with gpu-optimizer -apiVersion: autoscaling.aibrix.ai/v1alpha1 -kind: PodAutoscaler -metadata: - name: podautoscaler-simulator-llama2-7b-a100 - labels: - app.kubernetes.io/name: aibrix - app.kubernetes.io/managed-by: kustomize - namespace: aibrix-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: simulator-llama2-7b-a100 - minReplicas: 0 - maxReplicas: 10 - targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured - metricsSources: - - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 - path: /metrics/aibrix-system/simulator-llama2-7b-a100 - metric: "vllm:deployment_replicas" - targetValue: "1" - scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/simulator/deployment-a40.yaml b/development/simulator/deployment-a40.yaml deleted file mode 100644 index 7d172142..00000000 --- a/development/simulator/deployment-a40.yaml +++ /dev/null @@ -1,160 +0,0 @@ -apiVersion: apps/v1 -kind: Deployment -metadata: - name: simulator-llama2-7b-a40 - namespace: aibrix-system - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - model.aibrix.ai/port: "8000" -spec: - replicas: 1 - selector: - matchLabels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - template: - metadata: - labels: - modeladapter.aibricks.ai/enabled: "true" - model.aibrix.ai/name: "llama2-7b" - app: "simulator-llama2-7b-a40" - spec: - serviceAccountName: pod-autoscaler - automountServiceAccountToken: true # Important! 
- containers: - - name: llmengine-simulator - image: aibrix/vllm-simulator-a40:nightly - command: ["python", "app.py", "--replica_config_device", "a40"] - ports: - - containerPort: 8000 - env: - - name: DEPLOYMENT_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['app'] - - name: NAMESPACE - valueFrom: - fieldRef: - fieldPath: metadata.namespace - - name: MY_POD_IP - valueFrom: - fieldRef: - fieldPath: status.podIP - - name: MODEL_NAME - valueFrom: - fieldRef: - fieldPath: metadata.labels['model.aibrix.ai/name'] ---- -# Debug only: Make sure pod can be visited from controller that deployed in mac. -apiVersion: v1 -kind: Service -metadata: - name: llama2-7b - namespace: aibrix-system -spec: - selector: - model.aibrix.ai/name: "llama2-7b" - ports: - - protocol: TCP - port: 8000 - targetPort: 8000 - nodePort: 30081 - type: NodePort ---- -apiVersion: v1 -kind: ServiceAccount -metadata: - name: pod-autoscaler - namespace: aibrix-system ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - name: pod-reader - namespace: aibrix-system -rules: - - apiGroups: [""] - resources: ["pods"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: read-pods - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: pod-reader - apiGroup: rbac.authorization.k8s.io ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: Role -metadata: - namespace: aibrix-system - name: deployment-reader -rules: - - apiGroups: ["apps"] - resources: ["deployments"] - verbs: ["get", "list", "watch"] ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: RoleBinding -metadata: - name: deployment-reader-binding - namespace: aibrix-system -subjects: - - kind: ServiceAccount - name: pod-autoscaler - namespace: aibrix-system -roleRef: - kind: Role - name: deployment-reader - apiGroup: rbac.authorization.k8s.io -# --- -# for test-purpose, if need to create HTTPRoute object manually -# apiVersion: gateway.networking.k8s.io/v1 -# kind: HTTPRoute -# metadata: -# name: llama2-7b-router -# namespace: aibrix-system -# spec: -# parentRefs: -# - name: aibrix-eg -# rules: -# - matches: -# - headers: -# - type: Exact -# name: model -# value: llama2-7b -# backendRefs: -# - name: llama2-7b -# port: 8000 ---- -# Pod autoscaler works with gpu-optimizer -apiVersion: autoscaling.aibrix.ai/v1alpha1 -kind: PodAutoscaler -metadata: - name: podautoscaler-simulator-llama2-7b-a40 - labels: - app.kubernetes.io/name: aibrix - app.kubernetes.io/managed-by: kustomize - namespace: aibrix-system -spec: - scaleTargetRef: - apiVersion: apps/v1 - kind: Deployment - name: simulator-llama2-7b-a40 - minReplicas: 0 - maxReplicas: 10 - targetMetric: "avg_prompt_throughput_toks_per_s" # Ignore if metricsSources is configured - metricsSources: - - endpoint: gpu-optimizer.aibrix-system.svc.cluster.local:8080 - path: /metrics/aibrix-system/simulator-llama2-7b-a40 - metric: "vllm:deployment_replicas" - targetValue: "1" - scalingStrategy: "KPA" \ No newline at end of file diff --git a/development/simulator/environment.yml b/development/simulator/environment.yml deleted file mode 100644 index 60f4b6d7..00000000 --- a/development/simulator/environment.yml +++ /dev/null @@ -1,29 +0,0 @@ -name: simulator -channels: - - conda-forge - - plotly -dependencies: - - python>=3.10 - - setuptools - - pip - - numpy - - plotly_express - - jupyterlab - - matplotlib - - pyyaml - - snakeviz - - scikit-learn - - 
python-kaleido - - wandb - - fasteners - - ray-all - - streamlit - - randomname - - flask - - kubernetes - - transformers - - pip: - - kaleido - - ddsketch - - paretoset - - git+https://github.com/zhangjyr/vidur.git \ No newline at end of file diff --git a/development/simulator/requirements.txt b/development/simulator/requirements.txt deleted file mode 100644 index 7386688c..00000000 --- a/development/simulator/requirements.txt +++ /dev/null @@ -1,14 +0,0 @@ -flask -kubernetes -numpy -pandas -scikit-learn -wandb -kaleido -ddsketch -plotly_express -matplotlib -seaborn -fasteners -transformers -git+https://github.com/zhangjyr/vidur.git \ No newline at end of file From f5e9abd2a1591fdedde37cae16f8e01c7edb57f2 Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Tue, 3 Dec 2024 16:56:48 -0800 Subject: [PATCH 2/3] Fix namespace. --- development/app/README.md | 9 +++++++++ .../config/templates/podautoscaler/podautoscaler.yaml | 1 + 2 files changed, 10 insertions(+) diff --git a/development/app/README.md b/development/app/README.md index c533fb20..6a6d86df 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -37,6 +37,15 @@ Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for hig docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . ``` +1.b (Optional) Load container image to docker context + +> Note: If you are using Docker-Desktop on Mac, Kubernetes shares the local image repository with Docker. +> Therefore, the following command is not necessary. Only kind user need this step. + +```shell +kind load docker-image aibrix/vllm-simulator:nightly +``` + 2. Deploy simulator model image ```shell kubectl create -k config/simulator diff --git a/development/app/config/templates/podautoscaler/podautoscaler.yaml b/development/app/config/templates/podautoscaler/podautoscaler.yaml index 5f114688..945fb13c 100644 --- a/development/app/config/templates/podautoscaler/podautoscaler.yaml +++ b/development/app/config/templates/podautoscaler/podautoscaler.yaml @@ -5,6 +5,7 @@ metadata: name: podautoscaler-mock-llama2-7b labels: app.kubernetes.io/name: aibrix + app.kubernetes.io/managed-by: kustomize namespace: default spec: scaleTargetRef: From 43d204c8f28b51176407dafdbc6d3347fa4ce287 Mon Sep 17 00:00:00 2001 From: Jingyuan Zhang Date: Wed, 4 Dec 2024 11:54:39 -0800 Subject: [PATCH 3/3] Using model tokenizer if possible. Token is needed for gated model. 
Change build arg from GPU_TYPE to SIMULATION --- development/app/Dockerfile | 11 ++++++----- development/app/Makefile | 4 ++-- development/app/README.md | 5 +++++ development/app/app.py | 19 +++++++++++++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/development/app/Dockerfile b/development/app/Dockerfile index 314808ef..c5419e1f 100644 --- a/development/app/Dockerfile +++ b/development/app/Dockerfile @@ -21,16 +21,17 @@ RUN pip install --no-cache-dir -r requirements.txt COPY ./*.py /app/ ENV MODEL_NAME=llama2-7b -ARG GPU_TYPE=disabled +ENV HUGGINGFACE_TOKEN="your huggingface token" +ARG SIMULATION=disabled # Trigger profiling -RUN if [ "$GPU_TYPE" != "disabled" ]; then \ - python app.py --time_limit 1000 --replica_config_device ${GPU_TYPE}; \ +RUN if [ "$SIMULATION" != "disabled" ]; then \ + python app.py --time_limit 1000 --replica_config_device ${SIMULATION}; \ fi # Expose the port the app runs on EXPOSE 8000 # Run the application, environment variable is necessary to apply ARG -ENV GPU_TYPE=$GPU_TYPE -CMD python app.py --replica_config_device ${GPU_TYPE} +ENV SIMULATION=$SIMULATION +CMD python app.py --replica_config_device ${SIMULATION} diff --git a/development/app/Makefile b/development/app/Makefile index 909a775c..42edae1b 100644 --- a/development/app/Makefile +++ b/development/app/Makefile @@ -4,10 +4,10 @@ docker-build-mock: docker build -t aibrix/vllm-mock:nightly -f Dockerfile . docker-build-simulator: - docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . + docker build -t aibrix/vllm-simulator:nightly --build-arg SIMULATION=a100 -f Dockerfile . docker-build-simulator-a40: - docker build -t aibrix/vllm-simulator-a40:nightly --build-arg GPU_TYPE=a40 -f Dockerfile . + docker build -t aibrix/vllm-simulator-a40:nightly --build-arg SIMULATION=a40 -f Dockerfile . docker-build: docker-build-mock diff --git a/development/app/README.md b/development/app/README.md index 6a6d86df..8f455006 100644 --- a/development/app/README.md +++ b/development/app/README.md @@ -32,6 +32,11 @@ kubectl delete -k config/mock ### Deploy the simulator app Alternatively, [vidur](https://github.com/microsoft/vidur) is integrated for high-fidelity vLLM simulation: +0. Configure the HuggingFace token for the model tokenizer by changing HUGGINGFACE_TOKEN in the Dockerfile +``` +ENV HUGGINGFACE_TOKEN="your huggingface token" +``` + 1. Build simulator base model image ```shell docker build -t aibrix/vllm-simulator:nightly --build-arg GPU_TYPE=a100 -f Dockerfile . 
diff --git a/development/app/app.py b/development/app/app.py index a2a420b6..c9b2b503 100644 --- a/development/app/app.py +++ b/development/app/app.py @@ -26,6 +26,7 @@ DEPLOYMENT_NAME = os.getenv('DEPLOYMENT_NAME', 'llama2-70b') NAMESPACE = os.getenv('POD_NAMESPACE', 'default') DEFAULT_REPLICAS = int(os.getenv('DEFAULT_REPLICAS', '1')) +HUGGINGFACE_TOKEN = os.getenv('HUGGINGFACE_TOKEN', "your huggingface token") modelMaps = { "llama2-7b": "meta-llama/Llama-2-7b-hf", @@ -589,10 +590,20 @@ def metrics(): if gpu_device != "disabled": # Load the tokenizer for your model from transformers import AutoTokenizer - tokenizer = AutoTokenizer.from_pretrained( - 'bert-base-uncased', - model_max_length=16384, # Suppress warning - clean_up_tokenization_spaces=True) + default_model = 'bert-base-uncased' + try: + token_model = modelMaps.get(MODEL_NAME, default_model) + tokenizer = AutoTokenizer.from_pretrained( + token_model, + token=HUGGINGFACE_TOKEN, + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) + except Exception as e: + logger.error(f"Failed to initialize tokenizer, will use default tokenizer model: {e}") + tokenizer = AutoTokenizer.from_pretrained( + default_model, + model_max_length=16384, # Suppress warning + clean_up_tokenization_spaces=True) simulator = Simulator(SimulationConfig.create_from_cli_args()) overrides = {
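With the series applied, a quick smoke test of the merged app is possible once the service is port-forwarded to localhost:8000 as in the Makefile targets. This is a sketch only: `max_tokens` bounds the simulated output length, `next_in` is the optional inter-arrival hint read by the completion handlers, and the prompt and values are illustrative.

```shell
# Request a simulated completion from the llama2-7b deployment (illustrative values).
curl http://localhost:8000/v1/completions \
  -H "Content-Type: application/json" \
  -H "Authorization: Bearer any_key" \
  -d '{
    "model": "llama2-7b",
    "prompt": "Say this is a test",
    "max_tokens": 50,
    "next_in": 0.5
  }'
```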