diff --git a/config/default/kustomization.yaml b/config/default/kustomization.yaml
index f472003f..2402b1b1 100644
--- a/config/default/kustomization.yaml
+++ b/config/default/kustomization.yaml
@@ -23,6 +23,7 @@ resources:
 - ../rbac
 - ../manager
 - ../gateway
+- ../gpu-optimizer
 - ../dependency/kuberay-operator
 # [WEBHOOK] To enable webhook, uncomment all the sections with [WEBHOOK] prefix including the one in
 # crd/kustomization.yaml
diff --git a/config/gpu-optimizer/deployment.yaml b/config/gpu-optimizer/deployment.yaml
new file mode 100644
index 00000000..796912e4
--- /dev/null
+++ b/config/gpu-optimizer/deployment.yaml
@@ -0,0 +1,26 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: gpu-optimizer
+  template:
+    metadata:
+      labels:
+        app: gpu-optimizer
+    spec:
+      serviceAccountName: gpu-optimizer-sa
+      automountServiceAccountToken: true
+      containers:
+        - name: gpu-optimizer
+          image: aibrix/runtime:nightly
+          command: ["python", "-m", "aibrix.gpu_optimizer.app"]
+          ports:
+            - containerPort: 8080
+          env:
+            - name: REDIS_HOST
+              value: aibrix-redis-master.aibrix-system.svc.cluster.local
diff --git a/config/gpu-optimizer/kustomization.yaml b/config/gpu-optimizer/kustomization.yaml
new file mode 100644
index 00000000..bb0c7530
--- /dev/null
+++ b/config/gpu-optimizer/kustomization.yaml
@@ -0,0 +1,4 @@
+resources:
+- deployment.yaml
+- service.yaml
+- rbac.yaml
diff --git a/config/gpu-optimizer/rbac.yaml b/config/gpu-optimizer/rbac.yaml
new file mode 100644
index 00000000..dd56e2e1
--- /dev/null
+++ b/config/gpu-optimizer/rbac.yaml
@@ -0,0 +1,27 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: gpu-optimizer-sa
+  namespace: aibrix-system
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: gpu-optimizer-clusterrole
+rules:
+  - apiGroups: ["apps"]
+    resources: ["deployments"]
+    verbs: ["get", "list", "watch"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: gpu-optimizer-clusterrole-binding
+subjects:
+  - kind: ServiceAccount
+    name: gpu-optimizer-sa
+    namespace: aibrix-system
+roleRef:
+  kind: ClusterRole
+  name: gpu-optimizer-clusterrole
+  apiGroup: rbac.authorization.k8s.io
diff --git a/config/gpu-optimizer/service.yaml b/config/gpu-optimizer/service.yaml
new file mode 100644
index 00000000..6968aeed
--- /dev/null
+++ b/config/gpu-optimizer/service.yaml
@@ -0,0 +1,13 @@
+apiVersion: v1
+kind: Service
+metadata:
+  name: gpu-optimizer
+  namespace: aibrix-system
+spec:
+  selector:
+    app: gpu-optimizer
+  ports:
+    - protocol: TCP
+      port: 8080
+      targetPort: 8080
+  type: ClusterIP
diff --git a/config/overlays/vke/default/kustomization.yaml b/config/overlays/vke/default/kustomization.yaml
index 4598d51d..e75188a6 100644
--- a/config/overlays/vke/default/kustomization.yaml
+++ b/config/overlays/vke/default/kustomization.yaml
@@ -7,6 +7,7 @@ resources:
 - ../../../rbac
 - manager
 - gateway
+- ../../../gpu-optimizer
 - ../../../dependency/kuberay-operator
 
 
diff --git a/python/aibrix/aibrix/gpu_optimizer/README.md b/python/aibrix/aibrix/gpu_optimizer/README.md
index 347a2603..74965805 100644
--- a/python/aibrix/aibrix/gpu_optimizer/README.md
+++ b/python/aibrix/aibrix/gpu_optimizer/README.md
@@ -28,7 +28,7 @@ kubectl -n aibrix-system port-forward svc/aibrix-redis-master 6379:6379 1>/dev/n
 # Or use make
 make debug-init
 
-python optimizer/profiling/gen-profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
+python optimizer/profiling/gen_profile.py simulator-llama2-7b-a100 -o "redis://localhost:6379/?model=llama2-7b"
 # Or use make
 make DP=simulator-llama2-7b-a100 gen-profile
 ```
@@ -36,7 +36,7 @@ make DP=simulator-llama2-7b-a100 gen-profile
 5. Deploy GPU Optimizer
 ```shell
 kubectl apply -f deployment.yaml
-kubectl -n aibrix-system port-forward svc/gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
+kubectl -n aibrix-system port-forward svc/aibrix-gpu-optimizer 8080:8080 1>/dev/null 2>&1 &
 
 # Or use make
 make deploy
@@ -47,7 +47,7 @@ make deploy
 5. Start workload and see how model scale. Benchmark toolkit can be used to generate workload as:
 ```shell
 # Make sure gateway's local access, see docs/development/simulator/README.md for details.
-python optimizer/profiling/gpu-benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
+python optimizer/profiling/gpu_benchmark.py --backend=vllm --port 8888 --request-rate=10 --num-prompts=100 --input_len 2000 --output_len 128 --model=llama2-7b
 ```
 
 6. Observability: visit http://localhost:8080/dash/llama2-7b for workload pattern visualization. A independent visualization demo can access by: