diff --git a/benchmarks/autoscaling/README.md b/benchmarks/autoscaling/README.md
index d71bb093..827ea337 100644
--- a/benchmarks/autoscaling/README.md
+++ b/benchmarks/autoscaling/README.md
@@ -7,7 +7,7 @@ You can run autoscaling benchmark experiment by simply running the command below

 What you have to check before running it
-- run deployment for your application (refer to `deepseek-llm-7b-chat-v100/deploy.yaml`)
+- run deployment for your application (refer to `deepseek-llm-7b-chat/deploy.yaml`)
 - change the name field under scaleTargetRef in all autoscaling yaml files.
 - check the deployment name in run-test.py

@@ -17,7 +17,7 @@ For example,
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100 (*this one)
+    name: deepseek-llm-7b-chat (*this one)
   ...
 ```
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
deleted file mode 100644
index 40fa8918..00000000
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/apa.yaml
+++ /dev/null
@@ -1,26 +0,0 @@
-apiVersion: autoscaling.aibrix.ai/v1alpha1
-kind: PodAutoscaler
-metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-apa
-  namespace: default
-  labels:
-    app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
-    autoscaling.aibrix.ai/up-fluctuation-tolerance: "0.1"
-    autoscaling.aibrix.ai/down-fluctuation-tolerance: "0.2"
-    apa.autoscaling.aibrix.ai/window: "30s"
-spec:
-  scalingStrategy: "APA"
-  minReplicas: 1
-  maxReplicas: 10
-  metricsSources:
-    - metricSourceType: "pod"
-      protocolType: "http"
-      port: "8000"
-      path: "metrics"
-      targetMetric: "gpu_cache_usage_perc"
-      targetValue: "0.5"
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: deepseek-llm-7b-chat-v100
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
deleted file mode 100644
index 97e83fb5..00000000
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/hpa.yaml
+++ /dev/null
@@ -1,23 +0,0 @@
-apiVersion: autoscaling.aibrix.ai/v1alpha1
-kind: PodAutoscaler
-metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-hpa
-  namespace: default
-  labels:
-    app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
-spec:
-  scalingStrategy: "HPA"
-  minReplicas: 1
-  maxReplicas: 10
-  metricsSources:
-    - metricSourceType: "pod"
-      protocolType: "http"
-      port: "8000"
-      path: "/metrics"
-      targetMetric: "gpu_cache_usage_perc"
-      targetValue: "50"
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: deepseek-llm-7b-chat-v100
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
deleted file mode 100644
index a2bf4ee8..00000000
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/kpa.yaml
+++ /dev/null
@@ -1,24 +0,0 @@
-apiVersion: autoscaling.aibrix.ai/v1alpha1
-kind: PodAutoscaler
-metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-kpa
-  namespace: default
-  labels:
-    app.kubernetes.io/name: aibrix
-    app.kubernetes.io/managed-by: kustomize
-    kpa.autoscaling.aibrix.ai/scale-down-delay: "3m"
-spec:
-  scalingStrategy: "KPA"
-  minReplicas: 1
-  maxReplicas: 10
-  metricsSources:
-    - metricSourceType: "pod"
-      protocolType: "http"
-      port: "8000"
-      path: "metrics"
-      targetMetric: "gpu_cache_usage_perc"
-      targetValue: "0.5"
-  scaleTargetRef:
-    apiVersion: apps/v1
-    kind: Deployment
-    name: deepseek-llm-7b-chat-v100
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
new file mode 100644
index 00000000..a81d1381
--- /dev/null
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/apa.yaml
@@ -0,0 +1,27 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-v100-apa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  annotations:
+    autoscaling.aibrix.ai/up-fluctuation-tolerance: '0.1'
+    autoscaling.aibrix.ai/down-fluctuation-tolerance: '0.2'
+    apa.autoscaling.aibrix.ai/window: 30s
+spec:
+  scalingStrategy: APA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '0.5'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
similarity index 83%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
index b4b107cd..c503357e 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/deploy.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/deploy.yaml
@@ -4,7 +4,7 @@ metadata:
   labels:
     model.aibrix.ai/name: deepseek-llm-7b-chat
     model.aibrix.ai/port: "8000"
-  name: deepseek-llm-7b-chat-v100
+  name: deepseek-llm-7b-chat
   namespace: default
 spec:
   replicas: 1
@@ -54,23 +54,23 @@ spec:
         successThreshold: 1
         timeoutSeconds: 1
         lifecycle:
-        preStop:
-          exec:
-            command:
-            - /bin/sh
-            - -c
-            - |
-              while true; do
-                RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
-                WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
-                if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
-                  echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
-                  exit 0
-                else
-                  echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
-                  sleep 5
-                fi
-              done
+          preStop:
+            exec:
+              command:
+              - /bin/sh
+              - -c
+              - |
+                while true; do
+                  RUNNING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_running' | grep -v '#' | awk '{print $2}')
+                  WAITING=$(curl -s http://localhost:8000/metrics | grep 'vllm:num_requests_waiting' | grep -v '#' | awk '{print $2}')
+                  if [ "$RUNNING" = "0.0" ] && [ "$WAITING" = "0.0" ]; then
+                    echo "Terminating: No active or waiting requests, safe to terminate" >> /proc/1/fd/1
+                    exit 0
+                  else
+                    echo "Terminating: Running: $RUNNING, Waiting: $WAITING" >> /proc/1/fd/1
+                    sleep 5
+                  fi
+                done
         name: vllm-openai
         ports:
         - containerPort: 8000
@@ -127,7 +127,7 @@
           periodSeconds: 10
       initContainers:
       - name: init-model
-        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc2
+        image: aibrix-container-registry-cn-beijing.cr.volces.com/aibrix/runtime:v0.2.0-rc.2
         command:
         - aibrix_download
        - --model-uri
@@ -158,7 +158,7 @@
         volumeMounts:
         - mountPath: /models
           name: model-hostpath
-      terminationGracePeriodSeconds: 300
+      terminationGracePeriodSeconds: 60
       volumes:
      - name: model-hostpath
        hostPath:
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
new file mode 100644
index 00000000..55adfc00
--- /dev/null
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/hpa.yaml
@@ -0,0 +1,23 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-hpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+spec:
+  scalingStrategy: HPA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: /metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '50'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
new file mode 100644
index 00000000..c49d4546
--- /dev/null
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/kpa.yaml
@@ -0,0 +1,25 @@
+apiVersion: autoscaling.aibrix.ai/v1alpha1
+kind: PodAutoscaler
+metadata:
+  name: deepseek-llm-7b-chat-kpa
+  namespace: default
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  annotations:
+    kpa.autoscaling.aibrix.ai/scale-down-delay: 3m
+spec:
+  scalingStrategy: KPA
+  minReplicas: 1
+  maxReplicas: 8
+  metricsSources:
+    - metricSourceType: pod
+      protocolType: http
+      port: '8000'
+      path: metrics
+      targetMetric: gpu_cache_usage_perc
+      targetValue: '0.5'
+  scaleTargetRef:
+    apiVersion: apps/v1
+    kind: Deployment
+    name: deepseek-llm-7b-chat
\ No newline at end of file
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
similarity index 50%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
index 2a4cd76a..e26bd7d8 100644
--- a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/optimizer-kpa.yaml
+++ b/benchmarks/autoscaling/deepseek-llm-7b-chat/optimizer-kpa.yaml
@@ -1,24 +1,25 @@
 apiVersion: autoscaling.aibrix.ai/v1alpha1
 kind: PodAutoscaler
 metadata:
-  name: podautoscaler-deepseek-llm-7b-chat-v100-gpu-optimizer
+  name: deepseek-llm-7b-chat-gpu-optimizer
   namespace: default
   labels:
     app.kubernetes.io/name: aibrix
     app.kubernetes.io/managed-by: kustomize
+  annotations:
     kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
 spec:
   scalingStrategy: KPA
   minReplicas: 1
-  maxReplicas: 10
+  maxReplicas: 8
   metricsSources:
-  - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-    metricSourceType: domain
-    path: /metrics/default/deepseek-llm-7b-chat-v100
-    protocolType: http
-    targetMetric: vllm:deployment_replicas
-    targetValue: "1"
+    - endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
+      metricSourceType: domain
+      path: /metrics/default/deepseek-llm-7b-chat
+      protocolType: http
+      targetMetric: vllm:deployment_replicas
+      targetValue: "1"
   scaleTargetRef:
     apiVersion: apps/v1
     kind: Deployment
-    name: deepseek-llm-7b-chat-v100
+    name: deepseek-llm-7b-chat
diff --git a/benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml b/benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
similarity index 100%
rename from benchmarks/autoscaling/deepseek-llm-7b-chat-v100/svc.yaml
rename to benchmarks/autoscaling/deepseek-llm-7b-chat/svc.yaml
diff --git a/benchmarks/autoscaling/overnight_run.sh b/benchmarks/autoscaling/overnight_run.sh
index b553f25e..3870dfaf 100755
--- a/benchmarks/autoscaling/overnight_run.sh
+++ b/benchmarks/autoscaling/overnight_run.sh
@@ -7,8 +7,7 @@ if [ -z "${workload_path}" ]; then
     exit 1
 fi

-# autoscalers="hpa kpa apa optimizer-kpa"
-autoscalers="apa optimizer-kpa"
+autoscalers="hpa kpa apa optimizer-kpa"
 for autoscaler in ${autoscalers}; do
     start_time=$(date +%s)
     echo "--------------------------------"
diff --git a/benchmarks/autoscaling/run-test.sh b/benchmarks/autoscaling/run-test.sh
index 64e3ffb7..229ca3cc 100755
--- a/benchmarks/autoscaling/run-test.sh
+++ b/benchmarks/autoscaling/run-test.sh
@@ -4,8 +4,8 @@ input_workload_path=$1
 autoscaler=$2
 aibrix_repo="/Users/bytedance/projects/aibrix-2" # root dir of aibrix repo
 api_key="sk-kFJ12nKsFVfVmGpj3QzX65s4RbN2xJqWzPYCjYu7wT3BlbLi" # set your api key
-k8s_yaml_dir="deepseek-llm-7b-chat-v100"
-target_deployment="deepseek-llm-7b-chat-v100" # "aibrix-model-deepseek-llm-7b-chat"
+k8s_yaml_dir="deepseek-llm-7b-chat"
+target_deployment="deepseek-llm-7b-chat" # "aibrix-model-deepseek-llm-7b-chat"
 target_ai_model=deepseek-llm-7b-chat

 echo "Make sure ${target_deployment} is the right deployment."
@@ -60,8 +60,10 @@ echo "started port-forwarding with PID: $PORT_FORWARD_PID"
 # Clean up any existing autoscalers
 kubectl delete podautoscaler --all --all-namespaces
 kubectl delete hpa --all --all-namespaces
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml

 # Apply new autoscaler
+kubectl apply -f ${k8s_yaml_dir}/deploy.yaml
 kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml
 echo "kubectl apply -f ${k8s_yaml_dir}/${autoscaler}.yaml"
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
@@ -122,6 +124,7 @@ sleep 1
 # Cleanup
 kubectl delete podautoscaler --all --all-namespaces
 python set_num_replicas.py --deployment ${target_deployment} --replicas 1
+kubectl delete -f ${k8s_yaml_dir}/deploy.yaml

 # Stop monitoring processes
 echo "Stopping monitoring processes..."
diff --git a/benchmarks/autoscaling/streaming_pod_log_to_file.py b/benchmarks/autoscaling/streaming_pod_log_to_file.py
index fd48b0e9..47b098ec 100644
--- a/benchmarks/autoscaling/streaming_pod_log_to_file.py
+++ b/benchmarks/autoscaling/streaming_pod_log_to_file.py
@@ -11,20 +11,22 @@ def get_all_pods(namespace):
     pod_list = pod_list_output.decode('utf-8').split()
     return pod_list

-def write_logs(keyword, fname, process):
+def write_logs(keywords, fname, process):
     with open(fname, 'w') as log_file:
         while True:
             line = process.stdout.readline()
             if not line:
                 break
-            if keyword is None:
-                # If there is no keyword, write all logs
-                log_file.write(line)
-                log_file.flush()
-            if keyword and keyword in line:
-                # If there is keyword, write only the lines containing the keyword
+            if len(keywords) == 0: # If there is no keyword, write all logs
                 log_file.write(line)
                 log_file.flush()
+            else:
+                for keyword in keywords:
+                    if keyword in line:
+                        # If there is keyword, write only the lines containing the keyword
+                        log_file.write(line)
+                        log_file.flush()
+                        break

 def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
     if not os.path.exists(pod_log_dir):
@@ -38,12 +40,11 @@ def save_proxy_logs_streaming(pod_log_dir, pod_name, namespace):
         stderr=subprocess.PIPE,
         universal_newlines=True
     )
-    # if namespace == "default":
-    #     keyword = "Avg prompt throughput:"
-    # else:
-    #     keyword = None
-    keyword = None # you can specify keyword here to filter logs
-    log_thread = threading.Thread(target=write_logs, args=(keyword, fname, process))
+    if namespace == "default":
+        keywords = ["Avg prompt throughput:", "logger.py", "engine.py"]
+    else:
+        keywords = []
+    log_thread = threading.Thread(target=write_logs, args=(keywords, fname, process))
     log_thread.start()
     return process, log_thread
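For reference (not part of the patch above): a minimal, standalone sketch of the keyword-list filtering that the updated `write_logs(keywords, ...)` applies, so the behaviour can be checked without kubectl or a live pod. The helper `filter_lines` and the sample log lines are illustrative only.

```python
# Standalone sketch: mirrors the filtering rule of the patched write_logs().
# filter_lines and sample_lines are illustrative, not part of the repository.
def filter_lines(keywords, lines):
    # Empty keyword list -> keep every line (the len(keywords) == 0 branch).
    if len(keywords) == 0:
        return list(lines)
    # Otherwise keep a line as soon as any keyword matches (the for/break loop).
    return [line for line in lines if any(keyword in line for keyword in keywords)]

sample_lines = [
    "INFO logger.py: Avg prompt throughput: 42.0 tokens/s",
    "INFO engine.py: step finished",
    "DEBUG unrelated noise",
]

print(filter_lines([], sample_lines))  # keeps all three lines
print(filter_lines(["Avg prompt throughput:", "logger.py", "engine.py"], sample_lines))  # keeps the first two
```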