diff --git a/development/tutorials/lora/model_adapter.yaml b/development/tutorials/lora/model_adapter.yaml
index fd18aca9..81df65a3 100644
--- a/development/tutorials/lora/model_adapter.yaml
+++ b/development/tutorials/lora/model_adapter.yaml
@@ -13,22 +13,3 @@ spec:
       model.aibrix.ai/name: llama2-7b
   artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
   schedulerName: default
-# ---
-# # for test-purpose, if need to create HTTPRoute object manually
-# apiVersion: gateway.networking.k8s.io/v1
-# kind: HTTPRoute
-# metadata:
-#   name: lora-1-router
-#   namespace: aibrix-system
-# spec:
-#   parentRefs:
-#     - name: aibrix-eg
-#   rules:
-#     - matches:
-#         - headers:
-#             - type: Exact
-#               name: model
-#               value: lora-1
-#       backendRefs:
-#         - name: lora-1
-#           port: 8000
\ No newline at end of file
diff --git a/docs/source/features/autoscaling.rst b/docs/source/features/autoscaling.rst
index 395e1fd0..31a65a79 100644
--- a/docs/source/features/autoscaling.rst
+++ b/docs/source/features/autoscaling.rst
@@ -15,7 +15,7 @@ In the following sections, we will demonstrate how users can create various type
 
 Supported Autoscaling Mechanism
-------------------------------
+-------------------------------
 
 - HPA: it is same as vanilla K8s HPA. HPA, the native Kubernetes autoscaler, is utilized when users deploy a specification with AIBrix that calls for an HPA. This setup scales the replicas of a demo deployment based on CPU utilization.
 - KPA: it is from Knative. KPA has panic mode which scales up more quickly based on short term history. More rapid scaling is possible. The KPA, inspired by Knative, maintains two time windows: a longer ``stable window`` and a shorter ``panic window``. It rapidly scales up resources in response to sudden spikes in traffic based on the panic window measurements. Unlike other solutions that might rely on Prometheus for gathering deployment metrics, AIBrix fetches and maintains metrics internally, enabling faster response times.
 
 Example of a KPA scaling operation using a mocked vllm-based Llama2-7b deployment
diff --git a/docs/source/features/distributed-kv-cache.rst b/docs/source/features/distributed-kv-cache.rst
index 4dd01210..ee40cb3e 100644
--- a/docs/source/features/distributed-kv-cache.rst
+++ b/docs/source/features/distributed-kv-cache.rst
@@ -58,73 +58,9 @@ After deployment, we can see all the components by using ``kubectl get pods -n a
 
 After all components are running, we can use the following yaml to deploy the inference service:
 
-.. code-block:: yaml
-   :emphasize-lines: 39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
+.. literalinclude:: ../../../samples/kvcache/deployment.yaml
+   :language: yaml
 
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     name: aibrix-model-deepseek-coder-33b-instruct
-     labels:
-       model.aibrix.ai/name: deepseek-coder-33b-instruct
-       model.aibrix.ai/port: "8000"
-   spec:
-     strategy:
-       rollingUpdate:
-         maxSurge: 1
-         maxUnavailable: 1
-       type: RollingUpdate
-     template:
-       spec:
-         containers:
-           - name: vllm-openai
-             image: aibrix/vllm-openai:v0.6.1-edb07092-20250118
-             imagePullPolicy: Always
-             command:
-               - python3
-               - -m
-               - vllm.entrypoints.openai.api_server
-               - --port
-               - "8000"
-               - --model
-               - deepseek-ai/deepseek-coder-33b-instruct
-               - --served-model-name
-               - deepseek-coder-33b-instruct
-               - --distributed-executor-backend
-               - ray
-               - --trust-remote-code
-               - --tensor-parallel-size
-               - "4"
-               - --max-model-len
-               - "17000"
-               - --enable-prefix-caching
-               - --disable-fastapi-docs
-             env:
-               - name: VLLM_USE_VINEYARD_CACHE
-                 value: "1"
-               - name: VINEYARD_CACHE_CPU_MEM_LIMIT_GB
-                 value: "70"
-               - name: AIBRIX_LLM_KV_CACHE
-                 value: "1"
-               - name: AIBRIX_LLM_KV_CACHE_KV_CACHE_NS
-                 value: "aibrix"
-               - name: AIBRIX_LLM_KV_CACHE_CHUNK_SIZE
-                 value: "16"
-               - name: AIBRIX_LLM_KV_CACHE_SOCKET
-                 value: /var/run/vineyard.sock
-               - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT
-                 value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600"
-               - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE
-                 value: "1"
-               - name: "VINEYARD_CACHE_METRICS_ENABLED"
-                 value: "1"
-             volumeMounts:
-               - mountPath: /var/run
-                 name: kvcache-socket
-         volumes:
-           - name: kvcache-socket
-             hostPath:
-               path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache
 
 .. note::
     * ``metadata.name`` MUST match with ``kvcache.orchestration.aibrix.ai/pod-affinity-workload`` in the kv cache deployment
diff --git a/docs/source/features/heterogeneous-gpu.rst b/docs/source/features/heterogeneous-gpu.rst
index d131a0b7..ed583254 100644
--- a/docs/source/features/heterogeneous-gpu.rst
+++ b/docs/source/features/heterogeneous-gpu.rst
@@ -81,32 +81,8 @@ Now the GPU Optimizer is ready to work. You should observe that the number of wo
 
 A simple example of PodAutoscaler spec for a10 GPU is as follows:
 
-.. code-block:: yaml
-
-   apiVersion: autoscaling.aibrix.ai/v1alpha1
-   kind: PodAutoscaler
-   metadata:
-     name: podautoscaler-deepseek-coder-7b-a10
-     labels:
-       app.kubernetes.io/name: aibrix
-       app.kubernetes.io/managed-by: kustomize
-       kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
-     namespace: default
-   spec:
-     scaleTargetRef:
-       apiVersion: apps/v1
-       kind: Deployment
-       name: deepseek-coder-7b-a10 # replace with corresponding deployment name
-     minReplicas: 0 # Note that minReplicas must be set to be 0, otherwise it will prevent the gpu optimizer to scale down to 0.
-     maxReplicas: 10 # replace with max number of nodes in the cluster
-     metricsSources:
-       - metricSourceType: domain
-         protocolType: http
-         endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-         path: /metrics/default/deepseek-coder-7b-a10 # replace with /metrics/default/[deployment name]
-         targetMetric: "vllm:deployment_replicas"
-         targetValue: "1"
-     scalingStrategy: "KPA"
+.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml
+   :language: yaml
 
 
 Miscellaneous
diff --git a/docs/source/features/lora-dynamic-loading.rst b/docs/source/features/lora-dynamic-loading.rst
index 9ce743ab..2a02e106 100644
--- a/docs/source/features/lora-dynamic-loading.rst
+++ b/docs/source/features/lora-dynamic-loading.rst
@@ -56,84 +56,16 @@ Prerequisites
 
 Create base model
 ^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     name: llama2-7b
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "llama2-7b"
-       model.aibrix.ai/port: "8000"
-       adapter.model.aibrix.ai/enabled: "true"
-   spec:
-     replicas: 3
-     selector:
-       matchLabels:
-         adapter.model.aibrix.ai/enabled: "true"
-         model.aibrix.ai/name: "llama2-7b"
-     template:
-       metadata:
-         labels:
-           adapter.model.aibrix.ai/enabled: "true"
-           model.aibrix.ai/name: "llama2-7b"
-       spec:
-         serviceAccountName: mocked-app-sa
-         containers:
-           - name: llm-engine
-             # TODO: update
-             image: aibrix/vllm-mock:nightly
-             ports:
-               - containerPort: 8000
-           - name: aibrix-runtime
-             image: aibrix/runtime:nightly
-             command:
-               - aibrix_runtime
-               - --port
-               - "8080"
-             env:
-               - name: INFERENCE_ENGINE
-                 value: vllm
-               - name: INFERENCE_ENGINE_ENDPOINT
-                 value: http://localhost:8000
-             ports:
-               - containerPort: 8080
-                 protocol: TCP
-             livenessProbe:
-               httpGet:
-                 path: /healthz
-                 port: 8080
-               initialDelaySeconds: 3
-               periodSeconds: 2
-             readinessProbe:
-               httpGet:
-                 path: /ready
-                 port: 8080
-               initialDelaySeconds: 5
-               periodSeconds: 10
+.. literalinclude:: ../../../samples/adapter/base.yaml
+   :language: yaml
+
 
 Create lora model adapter
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: model.aibrix.ai/v1alpha1
-   kind: ModelAdapter
-   metadata:
-     name: llama-2-7b-sql-lora
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "llama-2-7b-sql-lora"
-       model.aibrix.ai/port: "8000"
-   spec:
-     baseModel: llama2-7b
-     podSelector:
-       matchLabels:
-         model.aibrix.ai/name: llama2-7b
-     artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
-     schedulerName: default
+.. literalinclude:: ../../../samples/adapter/adapter.yaml
+   :language: yaml
 
 If you run ```kubectl describe modeladapter llama-2-7b-sql-lora``, you will see the status of the lora adapter.
@@ -187,22 +119,5 @@ User may pass in the argument ``--api-key`` or environment variable ``VLLM_API_K
 In that case, lora model adapter can not query the vLLM server correctly, showing ``{"error":"Unauthorized"}`` error.
 You need to update ``additionalConfig`` field to pass in the API key.
 
-.. code-block:: yaml
-
-   apiVersion: model.aibrix.ai/v1alpha1
-   kind: ModelAdapter
-   metadata:
-     name: text2sql-lora
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "text2sql-lora"
-       model.aibrix.ai/port: "8000"
-   spec:
-     baseModel: llama2-7b
-     podSelector:
-       matchLabels:
-         model.aibrix.ai/name: llama2-7b
-     artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
-     additionalConfig:
-       api-key: test-key-1234567890
-     schedulerName: default
+.. literalinclude:: ../../../samples/adapter/adapter-api-key.yaml
+   :language: yaml
diff --git a/docs/source/features/multi-node-inference.rst b/docs/source/features/multi-node-inference.rst
index 996fa7ef..304ea49a 100644
--- a/docs/source/features/multi-node-inference.rst
+++ b/docs/source/features/multi-node-inference.rst
@@ -70,59 +70,5 @@ If you are using vLLM earlier version, you have two options.
 
 RayClusterReplicaSet
 ^^^^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: orchestration.aibrix.ai/v1alpha1
-   kind: RayClusterFleet
-   metadata:
-     labels:
-       app.kubernetes.io/name: aibrix
-       app.kubernetes.io/managed-by: kustomize
-     name: facebook-opt-13b
-   spec:
-     replicas: 1
-     selector:
-       matchLabels:
-         model.aibrix.ai/name: facebook-opt-13b
-     strategy:
-       rollingUpdate:
-         maxSurge: 25%
-         maxUnavailable: 25%
-       type: RollingUpdate
-     template:
-       metadata:
-         labels:
-           model.aibrix.ai/name: facebook-opt-13b
-         annotations:
-           ray.io/overwrite-container-cmd: "true"
-       spec:
-         rayVersion: '2.10.0' # should match the Ray version in the image of the containers
-         headGroupSpec:
-           rayStartParams:
-             dashboard-host: '0.0.0.0'
-           template:
-             spec:
-               containers:
-                 - name: ray-head
-                   image: aibrix/vllm-openai:v0.6.1.post2-distributed
-                   ports:
-                     - containerPort: 6379
-                       name: gcs-server
-                     - containerPort: 8265
-                       name: dashboard
-                     - containerPort: 10001
-                       name: client
-                     - containerPort: 8000
-                       name: service
-                   command: ["/bin/bash", "-lc", "--"]
-                   # Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
-                   # into the Ray container. This variable can be used to retrieve the generated Ray start command.
-                   # Note that this environment variable does not include the `ulimit` command.
-                   args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
-                   resources:
-                     limits:
-                       cpu: "8000m"
-                       nvidia.com/gpu: 2
-                     requests:
-                       cpu: "8000m"
-                       nvidia.com/gpu: 2
+.. literalinclude:: ../../../samples/distributed/multi-host.yaml
+   :language: yaml
\ No newline at end of file
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation/installation.rst
similarity index 90%
rename from docs/source/getting_started/installation.rst
rename to docs/source/getting_started/installation/installation.rst
index 93c16430..fedbb660 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation/installation.rst
@@ -43,6 +43,17 @@ Nightly Version
 
     kubectl create -k config/default
 
+Install AIBrix in Testing Environments
+--------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   lambda.rst
+   mac-for-desktop.rst
+
+
 Install Individual AIBrix Components
 ------------------------------------
 
diff --git a/docs/source/getting_started/lambda.rst b/docs/source/getting_started/installation/lambda.rst
similarity index 89%
rename from docs/source/getting_started/lambda.rst
rename to docs/source/getting_started/installation/lambda.rst
index 2b6dd8fe..066b7890 100644
--- a/docs/source/getting_started/lambda.rst
+++ b/docs/source/getting_started/installation/lambda.rst
@@ -1,8 +1,8 @@
-.. _lambda_cloud_installation:
+.. _lambda_cloud:
 
-=================================================
-AIBrix Single-Node Deployment on Lambda Instances
-=================================================
+============
+Lambda Cloud
+============
 
 This guide provides a step-by-step tutorial to deploy AIBrix on a single-node Lambda instance for testing purposes. The setup includes installing dependencies, verifying the installation, setting up the cluster, and deploying AIBrix components.
 
@@ -15,14 +15,14 @@ Before you begin, ensure you have the following:
 
 You can follow `lambda cloud docs `_ to launch an instance.
 
-.. figure:: ../assets/images/cloud/lambda-cloud-instance.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-instance.png
    :alt: lambda-cloud-instance
    :width: 70%
    :align: center
 
 After launching the instance, you can get the instance's IP address and ssh into the instance.
 
-.. figure::../assets/images/cloud/lambda-cloud-ssh.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-ssh.png
    :alt: lambda-cloud-ssh
    :width: 70%
    :align: center
@@ -46,7 +46,7 @@ Run the following script to install the necessary dependencies including `nvkind
    - Configures the NVIDIA Container Toolkit
    - Updates Docker settings for GPU compatibility
 
-.. figure::../assets/images/cloud/lambda-cloud-installation.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-installation.png
    :alt: lambda-cloud-installation
    :width: 70%
    :align: center
@@ -72,7 +72,7 @@ Run the following script to ensure that the NVIDIA drivers and Docker integratio
 
 If all checks pass successfully like below, proceed to the next step.
 
-.. figure::../assets/images/cloud/lambda-cloud-verify-installation.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-verify-installation.png
    :alt: lambda-cloud-verify-installation
    :width: 70%
    :align: center
diff --git a/docs/source/getting_started/installation/mac-for-desktop.rst b/docs/source/getting_started/installation/mac-for-desktop.rst
new file mode 100644
index 00000000..01a2b931
--- /dev/null
+++ b/docs/source/getting_started/installation/mac-for-desktop.rst
@@ -0,0 +1,6 @@
+.. _mac-for-desktop:
+
+===============
+Mac for Desktop
+===============
+
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index 0e54dd04..c4c467e6 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -33,130 +33,15 @@ Wait for few minutes and run `kubectl get pods -n aibrix-system` to check pod st
 
 Deploy base model
 ^^^^^^^^^^^^^^^^^
 
-Save yaml as `deployment.yaml` and run `kubectl apply -f deployment.yaml`.
-
-.. code-block:: yaml
-
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     labels:
-       # Note: The label value `model.aibrix.ai/name` here must match with the service name.
-       model.aibrix.ai/name: qwen25-7b-Instruct
-       model.aibrix.ai/port: "8000"
-       adapter.model.aibrix.ai/enabled: true
-     name: qwen25-7b-Instruct
-     namespace: default
-   spec:
-     replicas: 1
-     selector:
-       matchLabels:
-         model.aibrix.ai/name: qwen25-7b-Instruct
-     strategy:
-       rollingUpdate:
-         maxSurge: 25%
-         maxUnavailable: 25%
-       type: RollingUpdate
-     template:
-       metadata:
-         labels:
-           model.aibrix.ai/name: qwen25-7b-Instruct
-       spec:
-         containers:
-           - command:
-               - python3
-               - -m
-               - vllm.entrypoints.openai.api_server
-               - --host
-               - "0.0.0.0"
-               - --port
-               - "8000"
-               - --model
-               - Qwen/Qwen2.5-7B-Instruct
-               - --served-model-name
-               # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
-               - qwen25-7b-Instruct
-               - --trust-remote-code
-               - --enable-lora
-             env:
-               - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
-                 value: "true"
-             image: aibrix/vllm-openai:v0.6.1.post2
-             imagePullPolicy: Always
-             livenessProbe:
-               failureThreshold: 3
-               httpGet:
-                 path: /health
-                 port: 8000
-                 scheme: HTTP
-               initialDelaySeconds: 90
-               periodSeconds: 5
-               successThreshold: 1
-               timeoutSeconds: 1
-             name: vllm-openai
-             ports:
-               - containerPort: 8000
-                 protocol: TCP
-             readinessProbe:
-               failureThreshold: 3
-               httpGet:
-                 path: /health
-                 port: 8000
-                 scheme: HTTP
-               initialDelaySeconds: 90
-               periodSeconds: 5
-               successThreshold: 1
-               timeoutSeconds: 1
-             resources:
-               limits:
-                 nvidia.com/gpu: "1"
-               requests:
-                 nvidia.com/gpu: "1"
-             volumeMounts:
-               - name: dshm
-                 mountPath: /dev/shm
-         volumes:
-           - name: dshm
-             emptyDir:
-               medium: Memory
-               sizeLimit: "4Gi"
-
-Save yaml as `service.yaml` and run `kubectl apply -f service.yaml`.
-
-.. code-block:: yaml
-
-   apiVersion: v1
-   kind: Service
-   metadata:
-     labels:
-       # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
-       model.aibrix.ai/name: qwen25-7b-Instruct
-       prometheus-discovery: "true"
-     annotations:
-       prometheus.io/scrape: "true"
-       prometheus.io/port: "8080"
-     name: qwen25-7b-Instruct
-     namespace: default
-   spec:
-     ports:
-       - name: serve
-         port: 8000
-         protocol: TCP
-         targetPort: 8000
-       - name: http
-         port: 8080
-         protocol: TCP
-         targetPort: 8080
-     selector:
-       model.aibrix.ai/name: qwen25-7b-Instruct
-     type: ClusterIP
+Save the YAML below as `model.yaml` and run `kubectl apply -f model.yaml`.
 
-.. note::
+.. literalinclude:: ../../../samples/quickstart/model.yaml
+   :language: yaml
 
-    Ensure that:
+Ensure that:
 
-    1. The `Service` name matches the `model.aibrix.ai/name` label value in the `Deployment`.
-    2. The `--served-model-name` argument value in the `Deployment` command is also consistent with the `Service` name and `model.aibrix.ai/name` label.
+1. The `Service` name matches the `model.aibrix.ai/name` label value in the `Deployment`.
+2. The `--served-model-name` argument value in the `Deployment` command is also consistent with the `Service` name and `model.aibrix.ai/name` label.
 
 
 Invoke the model endpoint using gateway api
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e8fab8e6..eaf0cfb2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -29,7 +29,7 @@ Documentation
 
     designs/architecture.rst
     getting_started/quickstart.rst
-    getting_started/installation.rst
+    getting_started/installation/installation.rst
     getting_started/faq.rst
 
 .. toctree::
diff --git a/samples/adapter/adapter-api-key.yaml b/samples/adapter/adapter-api-key.yaml
new file mode 100644
index 00000000..4a44a798
--- /dev/null
+++ b/samples/adapter/adapter-api-key.yaml
@@ -0,0 +1,17 @@
+apiVersion: model.aibrix.ai/v1alpha1
+kind: ModelAdapter
+metadata:
+  name: qwen-code-lora
+  namespace: default
+  labels:
+    model.aibrix.ai/name: "qwen-code-lora"
+    model.aibrix.ai/port: "8000"
+spec:
+  baseModel: qwen-coder-1.5b-instruct
+  podSelector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
+  additionalConfig:
+    api-key: test-key-1234567890
+  schedulerName: default
diff --git a/samples/adapter/adapter.yaml b/samples/adapter/adapter.yaml
new file mode 100644
index 00000000..4a44a798
--- /dev/null
+++ b/samples/adapter/adapter.yaml
@@ -0,0 +1,15 @@
+apiVersion: model.aibrix.ai/v1alpha1
+kind: ModelAdapter
+metadata:
+  name: qwen-code-lora
+  namespace: default
+  labels:
+    model.aibrix.ai/name: "qwen-code-lora"
+    model.aibrix.ai/port: "8000"
+spec:
+  baseModel: qwen-coder-1.5b-instruct
+  podSelector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
+  schedulerName: default
diff --git a/samples/adapter/base.yaml b/samples/adapter/base.yaml
new file mode 100644
index 00000000..3cd58423
--- /dev/null
+++ b/samples/adapter/base.yaml
@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: qwen-coder-1.5b-instruct # Note: The label value `model.aibrix.ai/name` here must match with the service name.
+    model.aibrix.ai/port: "8000"
+    adapter.model.aibrix.ai/enabled: "true"
+  name: qwen-coder-1.5b-instruct
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: qwen-coder-1.5b-instruct
+    spec:
+      containers:
+        - command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - Qwen/Qwen2.5-Coder-1.5B-Instruct
+            - --served-model-name
+            # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
+            - qwen-coder-1.5b-instruct
+            - --enable-lora
+          image: vllm/vllm-openai:v0.7.1
+          imagePullPolicy: Always
+          name: vllm-openai
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+        - name: aibrix-runtime
+          image: aibrix/runtime:v0.2.0-rc.2
+          command:
+            - aibrix_runtime
+            - --port
+            - "8080"
+          env:
+            - name: INFERENCE_ENGINE
+              value: vllm
+            - name: INFERENCE_ENGINE_ENDPOINT
+              value: http://localhost:8000
+          ports:
+            - containerPort: 8080
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            initialDelaySeconds: 3
+            periodSeconds: 2
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
+
+---
diff --git a/samples/distributed/multi-host.yaml b/samples/distributed/multi-host.yaml
new file mode 100644
index 00000000..7205a010
--- /dev/null
+++ b/samples/distributed/multi-host.yaml
@@ -0,0 +1,54 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: RayClusterFleet
+metadata:
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  name: facebook-opt-13b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: facebook-opt-13b
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: facebook-opt-13b
+      annotations:
+        ray.io/overwrite-container-cmd: "true"
+    spec:
+      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      headGroupSpec:
+        rayStartParams:
+          dashboard-host: '0.0.0.0'
+        template:
+          spec:
+            containers:
+              - name: ray-head
+                image: aibrix/vllm-openai:v0.6.1.post2-distributed
+                ports:
+                  - containerPort: 6379
+                    name: gcs-server
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  - containerPort: 8000
+                    name: service
+                command: ["/bin/bash", "-lc", "--"]
+                # Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
+                # into the Ray container. This variable can be used to retrieve the generated Ray start command.
+                # Note that this environment variable does not include the `ulimit` command.
+                args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
+                resources:
+                  limits:
+                    cpu: "8000m"
+                    nvidia.com/gpu: 2
+                  requests:
+                    cpu: "8000m"
+                    nvidia.com/gpu: 2
diff --git a/samples/kvcache/deployment.yaml b/samples/kvcache/deployment.yaml
new file mode 100644
index 00000000..44b6f2c3
--- /dev/null
+++ b/samples/kvcache/deployment.yaml
@@ -0,0 +1,64 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: aibrix-model-deepseek-coder-33b-instruct
+  labels:
+    model.aibrix.ai/name: deepseek-coder-33b-instruct
+    model.aibrix.ai/port: "8000"
+spec:
+  strategy:
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 1
+    type: RollingUpdate
+  template:
+    spec:
+      containers:
+        - name: vllm-openai
+          image: aibrix/vllm-openai:v0.6.1-edb07092-20250118
+          imagePullPolicy: Always
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --port
+            - "8000"
+            - --model
+            - deepseek-ai/deepseek-coder-33b-instruct
+            - --served-model-name
+            - deepseek-coder-33b-instruct
+            - --distributed-executor-backend
+            - ray
+            - --trust-remote-code
+            - --tensor-parallel-size
+            - "4"
+            - --max-model-len
+            - "17000"
+            - --enable-prefix-caching
+            - --disable-fastapi-docs
+          env:
+            - name: VLLM_USE_VINEYARD_CACHE
+              value: "1"
+            - name: VINEYARD_CACHE_CPU_MEM_LIMIT_GB
+              value: "70"
+            - name: AIBRIX_LLM_KV_CACHE
+              value: "1"
+            - name: AIBRIX_LLM_KV_CACHE_KV_CACHE_NS
+              value: "aibrix"
+            - name: AIBRIX_LLM_KV_CACHE_CHUNK_SIZE
+              value: "16"
+            - name: AIBRIX_LLM_KV_CACHE_SOCKET
+              value: /var/run/vineyard.sock
+            - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT
+              value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600"
+            - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE
+              value: "1"
+            - name: "VINEYARD_CACHE_METRICS_ENABLED"
+              value: "1"
+          volumeMounts:
+            - mountPath: /var/run
+              name: kvcache-socket
+      volumes:
+        - name: kvcache-socket
+          hostPath:
+            path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache
\ No newline at end of file
diff --git a/samples/quickstart/model.yaml b/samples/quickstart/model.yaml
new file mode 100644
index 00000000..781c4d15
--- /dev/null
+++ b/samples/quickstart/model.yaml
@@ -0,0 +1,70 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b # Note: The label value `model.aibrix.ai/name` here must match with the service name.
+    model.aibrix.ai/port: "8000"
+  name: deepseek-r1-distill-llama-8b
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    spec:
+      containers:
+        - command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+            # - Qwen/Qwen2.5-Coder-1.5B-Instruct  # smaller alternative for quick local tests
+            - --served-model-name
+            # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
+            - deepseek-r1-distill-llama-8b
+          image: vllm/vllm-openai:v0.7.1
+          imagePullPolicy: Always
+          name: vllm-openai
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    prometheus-discovery: "true"
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8080"
+  name: deepseek-r1-distill-llama-8b # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
+  namespace: default
+spec:
+  ports:
+    - name: serve
+      port: 8000
+      protocol: TCP
+      targetPort: 8000
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  type: LoadBalancer
\ No newline at end of file
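Once the quickstart manifest above is applied, the deployment can be smoke-tested through the OpenAI-compatible API served by the vLLM container. The sketch below is illustrative only: it port-forwards the sample's own Service rather than the AIBrix gateway, and the prompt is arbitrary; only the Service name, namespace, port, and model name are taken from samples/quickstart/model.yaml.

    # forward the sample Service locally (8000 is the "serve" port defined above)
    kubectl -n default port-forward svc/deepseek-r1-distill-llama-8b 8000:8000

    # in another shell: issue a chat completion against the vLLM OpenAI-compatible endpoint
    curl -s http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-r1-distill-llama-8b", "messages": [{"role": "user", "content": "Say hello"}]}'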