diff --git a/development/tutorials/lora/model_adapter.yaml b/development/tutorials/lora/model_adapter.yaml
index fd18aca9..81df65a3 100644
--- a/development/tutorials/lora/model_adapter.yaml
+++ b/development/tutorials/lora/model_adapter.yaml
@@ -13,22 +13,3 @@ spec:
       model.aibrix.ai/name: llama2-7b
   artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
   schedulerName: default
-# ---
-# # for test-purpose, if need to create HTTPRoute object manually
-# apiVersion: gateway.networking.k8s.io/v1
-# kind: HTTPRoute
-# metadata:
-#   name: lora-1-router
-#   namespace: aibrix-system
-# spec:
-#   parentRefs:
-#     - name: aibrix-eg
-#   rules:
-#     - matches:
-#         - headers:
-#             - type: Exact
-#               name: model
-#               value: lora-1
-#       backendRefs:
-#         - name: lora-1
-#           port: 8000
\ No newline at end of file
diff --git a/docs/source/features/autoscaling.rst b/docs/source/features/autoscaling.rst
index 395e1fd0..31a65a79 100644
--- a/docs/source/features/autoscaling.rst
+++ b/docs/source/features/autoscaling.rst
@@ -15,7 +15,7 @@ In the following sections, we will demonstrate how users can create various type
 
 Supported Autoscaling Mechanism
-------------------------------
+-------------------------------
 
 - HPA: it is same as vanilla K8s HPA. HPA, the native Kubernetes autoscaler, is utilized when users deploy a specification with AIBrix that calls for an HPA. This setup scales the replicas of a demo deployment based on CPU utilization.
 - KPA: it is from Knative. KPA has panic mode which scales up more quickly based on short term history. More rapid scaling is possible. The KPA, inspired by Knative, maintains two time windows: a longer ``stable window`` and a shorter ``panic window``. It rapidly scales up resources in response to sudden spikes in traffic based on the panic window measurements. Unlike other solutions that might rely on Prometheus for gathering deployment metrics, AIBrix fetches and maintains metrics internally, enabling faster response times.
 
 Example of a KPA scaling operation using a mocked vllm-based Llama2-7b deployment
diff --git a/docs/source/features/distributed-kv-cache.rst b/docs/source/features/distributed-kv-cache.rst
index 4dd01210..ee40cb3e 100644
--- a/docs/source/features/distributed-kv-cache.rst
+++ b/docs/source/features/distributed-kv-cache.rst
@@ -58,73 +58,9 @@ After deployment, we can see all the components by using ``kubectl get pods -n a
 
 After all components are running, we can use the following yaml to deploy the inference service:
 
-.. code-block:: yaml
-   :emphasize-lines: 39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64
+.. literalinclude:: ../../../samples/kvcache/deployment.yaml
+   :language: yaml
 
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     name: aibrix-model-deepseek-coder-33b-instruct
-     labels:
-       model.aibrix.ai/name: deepseek-coder-33b-instruct
-       model.aibrix.ai/port: "8000"
-   spec:
-     strategy:
-       rollingUpdate:
-         maxSurge: 1
-         maxUnavailable: 1
-       type: RollingUpdate
-     template:
-       spec:
-         containers:
-           - name: vllm-openai
-             image: aibrix/vllm-openai:v0.6.1-edb07092-20250118
-             imagePullPolicy: Always
-             command:
-               - python3
-               - -m
-               - vllm.entrypoints.openai.api_server
-               - --port
-               - "8000"
-               - --model
-               - deepseek-ai/deepseek-coder-33b-instruct
-               - --served-model-name
-               - deepseek-coder-33b-instruct
-               - --distributed-executor-backend
-               - ray
-               - --trust-remote-code
-               - --tensor-parallel-size
-               - "4"
-               - --max-model-len
-               - "17000"
-               - --enable-prefix-caching
-               - --disable-fastapi-docs
-             env:
-               - name: VLLM_USE_VINEYARD_CACHE
-                 value: "1"
-               - name: VINEYARD_CACHE_CPU_MEM_LIMIT_GB
-                 value: "70"
-               - name: AIBRIX_LLM_KV_CACHE
-                 value: "1"
-               - name: AIBRIX_LLM_KV_CACHE_KV_CACHE_NS
-                 value: "aibrix"
-               - name: AIBRIX_LLM_KV_CACHE_CHUNK_SIZE
-                 value: "16"
-               - name: AIBRIX_LLM_KV_CACHE_SOCKET
-                 value: /var/run/vineyard.sock
-               - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT
-                 value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600"
-               - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE
-                 value: "1"
-               - name: "VINEYARD_CACHE_METRICS_ENABLED"
-                 value: "1"
-             volumeMounts:
-               - mountPath: /var/run
-                 name: kvcache-socket
-         volumes:
-           - name: kvcache-socket
-             hostPath:
-               path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache
 
 .. note::
     * ``metadata.name`` MUST match with ``kvcache.orchestration.aibrix.ai/pod-affinity-workload`` in the kv cache deployment
diff --git a/docs/source/features/heterogeneous-gpu.rst b/docs/source/features/heterogeneous-gpu.rst
index d131a0b7..ed583254 100644
--- a/docs/source/features/heterogeneous-gpu.rst
+++ b/docs/source/features/heterogeneous-gpu.rst
@@ -81,32 +81,8 @@ Now the GPU Optimizer is ready to work. You should observe that the number of wo
 
 A simple example of PodAutoscaler spec for a10 GPU is as follows:
 
-.. code-block:: yaml
-
-   apiVersion: autoscaling.aibrix.ai/v1alpha1
-   kind: PodAutoscaler
-   metadata:
-     name: podautoscaler-deepseek-coder-7b-a10
-     labels:
-       app.kubernetes.io/name: aibrix
-       app.kubernetes.io/managed-by: kustomize
-       kpa.autoscaling.aibrix.ai/scale-down-delay: 0s
-     namespace: default
-   spec:
-     scaleTargetRef:
-       apiVersion: apps/v1
-       kind: Deployment
-       name: deepseek-coder-7b-a10 # replace with corresponding deployment name
-     minReplicas: 0 # Note that minReplicas must be set to be 0, otherwise it will prevent the gpu optimizer to scale down to 0.
-     maxReplicas: 10 # replace with max number of nodes in the cluster
-     metricsSources:
-       - metricSourceType: domain
-         protocolType: http
-         endpoint: aibrix-gpu-optimizer.aibrix-system.svc.cluster.local:8080
-         path: /metrics/default/deepseek-coder-7b-a10 # replace with /metrics/default/[deployment name]
-         targetMetric: "vllm:deployment_replicas"
-         targetValue: "1"
-     scalingStrategy: "KPA"
+.. literalinclude:: ../../../samples/heterogeneous/deepseek-coder-7b-l20-podautoscaler.yaml
+   :language: yaml
 
 
 Miscellaneous
diff --git a/docs/source/features/lora-dynamic-loading.rst b/docs/source/features/lora-dynamic-loading.rst
index 9ce743ab..2a02e106 100644
--- a/docs/source/features/lora-dynamic-loading.rst
+++ b/docs/source/features/lora-dynamic-loading.rst
@@ -56,84 +56,16 @@ Prerequisites
 
 Create base model
 ^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     name: llama2-7b
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "llama2-7b"
-       model.aibrix.ai/port: "8000"
-       adapter.model.aibrix.ai/enabled: "true"
-   spec:
-     replicas: 3
-     selector:
-       matchLabels:
-         adapter.model.aibrix.ai/enabled: "true"
-         model.aibrix.ai/name: "llama2-7b"
-     template:
-       metadata:
-         labels:
-           adapter.model.aibrix.ai/enabled: "true"
-           model.aibrix.ai/name: "llama2-7b"
-       spec:
-         serviceAccountName: mocked-app-sa
-         containers:
-           - name: llm-engine
-             # TODO: update
-             image: aibrix/vllm-mock:nightly
-             ports:
-               - containerPort: 8000
-           - name: aibrix-runtime
-             image: aibrix/runtime:nightly
-             command:
-               - aibrix_runtime
-               - --port
-               - "8080"
-             env:
-               - name: INFERENCE_ENGINE
-                 value: vllm
-               - name: INFERENCE_ENGINE_ENDPOINT
-                 value: http://localhost:8000
-             ports:
-               - containerPort: 8080
-                 protocol: TCP
-             livenessProbe:
-               httpGet:
-                 path: /healthz
-                 port: 8080
-               initialDelaySeconds: 3
-               periodSeconds: 2
-             readinessProbe:
-               httpGet:
-                 path: /ready
-                 port: 8080
-               initialDelaySeconds: 5
-               periodSeconds: 10
+.. literalinclude:: ../../../samples/adapter/base.yaml
+   :language: yaml
+
 
 Create lora model adapter
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: model.aibrix.ai/v1alpha1
-   kind: ModelAdapter
-   metadata:
-     name: llama-2-7b-sql-lora
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "llama-2-7b-sql-lora"
-       model.aibrix.ai/port: "8000"
-   spec:
-     baseModel: llama2-7b
-     podSelector:
-       matchLabels:
-         model.aibrix.ai/name: llama2-7b
-     artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
-     schedulerName: default
+.. literalinclude:: ../../../samples/adapter/adapter.yaml
+   :language: yaml
 
 If you run ```kubectl describe modeladapter llama-2-7b-sql-lora``, you will see the status of the lora adapter.
@@ -187,22 +119,5 @@ User may pass in the argument ``--api-key`` or environment variable ``VLLM_API_K
 In that case, lora model adapter can not query the vLLM server correctly, showing ``{"error":"Unauthorized"}`` error.
 You need to update ``additionalConfig`` field to pass in the API key.
 
-.. code-block:: yaml
-
-   apiVersion: model.aibrix.ai/v1alpha1
-   kind: ModelAdapter
-   metadata:
-     name: text2sql-lora
-     namespace: default
-     labels:
-       model.aibrix.ai/name: "text2sql-lora"
-       model.aibrix.ai/port: "8000"
-   spec:
-     baseModel: llama2-7b
-     podSelector:
-       matchLabels:
-         model.aibrix.ai/name: llama2-7b
-     artifactURL: huggingface://yard1/llama-2-7b-sql-lora-test
-     additionalConfig:
-       api-key: test-key-1234567890
-     schedulerName: default
+.. literalinclude:: ../../../samples/adapter/adapter-api-key.yaml
+   :language: yaml
diff --git a/docs/source/features/multi-node-inference.rst b/docs/source/features/multi-node-inference.rst
index 996fa7ef..304ea49a 100644
--- a/docs/source/features/multi-node-inference.rst
+++ b/docs/source/features/multi-node-inference.rst
@@ -70,59 +70,5 @@ If you are using vLLM earlier version, you have two options.
 
 RayClusterReplicaSet
 ^^^^^^^^^^^^^^^^^^^^
 
-.. code-block:: yaml
-
-   apiVersion: orchestration.aibrix.ai/v1alpha1
-   kind: RayClusterFleet
-   metadata:
-     labels:
-       app.kubernetes.io/name: aibrix
-       app.kubernetes.io/managed-by: kustomize
-     name: facebook-opt-13b
-   spec:
-     replicas: 1
-     selector:
-       matchLabels:
-         model.aibrix.ai/name: facebook-opt-13b
-     strategy:
-       rollingUpdate:
-         maxSurge: 25%
-         maxUnavailable: 25%
-       type: RollingUpdate
-     template:
-       metadata:
-         labels:
-           model.aibrix.ai/name: facebook-opt-13b
-         annotations:
-           ray.io/overwrite-container-cmd: "true"
-       spec:
-         rayVersion: '2.10.0' # should match the Ray version in the image of the containers
-         headGroupSpec:
-           rayStartParams:
-             dashboard-host: '0.0.0.0'
-           template:
-             spec:
-               containers:
-                 - name: ray-head
-                   image: aibrix/vllm-openai:v0.6.1.post2-distributed
-                   ports:
-                     - containerPort: 6379
-                       name: gcs-server
-                     - containerPort: 8265
-                       name: dashboard
-                     - containerPort: 10001
-                       name: client
-                     - containerPort: 8000
-                       name: service
-                   command: ["/bin/bash", "-lc", "--"]
-                   # Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
-                   # into the Ray container. This variable can be used to retrieve the generated Ray start command.
-                   # Note that this environment variable does not include the `ulimit` command.
-                   args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
-                   resources:
-                     limits:
-                       cpu: "8000m"
-                       nvidia.com/gpu: 2
-                     requests:
-                       cpu: "8000m"
-                       nvidia.com/gpu: 2
+.. literalinclude:: ../../../samples/distributed/multi-host.yaml
+   :language: yaml
\ No newline at end of file
diff --git a/docs/source/getting_started/installation.rst b/docs/source/getting_started/installation/installation.rst
similarity index 90%
rename from docs/source/getting_started/installation.rst
rename to docs/source/getting_started/installation/installation.rst
index 93c16430..fedbb660 100644
--- a/docs/source/getting_started/installation.rst
+++ b/docs/source/getting_started/installation/installation.rst
@@ -43,6 +43,17 @@ Nightly Version
 
     kubectl create -k config/default
 
+Install AIBrix in Testing Environments
+--------------------------------------
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Getting Started
+
+   lambda.rst
+   mac-for-desktop.rst
+
+
 Install Individual AIBrix Components
 ------------------------------------
 
diff --git a/docs/source/getting_started/lambda.rst b/docs/source/getting_started/installation/lambda.rst
similarity index 89%
rename from docs/source/getting_started/lambda.rst
rename to docs/source/getting_started/installation/lambda.rst
index 2b6dd8fe..066b7890 100644
--- a/docs/source/getting_started/lambda.rst
+++ b/docs/source/getting_started/installation/lambda.rst
@@ -1,8 +1,8 @@
-.. _lambda_cloud_installation:
+.. _lambda_cloud:
 
-=================================================
-AIBrix Single-Node Deployment on Lambda Instances
-=================================================
+============
+Lambda Cloud
+============
 
 This guide provides a step-by-step tutorial to deploy AIBrix on a single-node Lambda instance for testing purposes. The setup includes installing dependencies, verifying the installation, setting up the cluster, and deploying AIBrix components.
 
@@ -15,14 +15,14 @@ Before you begin, ensure you have the following:
 
 You can follow `lambda cloud docs `_ to launch an instance.
 
-.. figure:: ../assets/images/cloud/lambda-cloud-instance.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-instance.png
    :alt: lambda-cloud-instance
    :width: 70%
    :align: center
 
 After launching the instance, you can get the instance's IP address and ssh into the instance.
 
-.. figure::../assets/images/cloud/lambda-cloud-ssh.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-ssh.png
    :alt: lambda-cloud-ssh
    :width: 70%
    :align: center
@@ -46,7 +46,7 @@ Run the following script to install the necessary dependencies including `nvkind
    - Configures the NVIDIA Container Toolkit
    - Updates Docker settings for GPU compatibility
 
-.. figure::../assets/images/cloud/lambda-cloud-installation.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-installation.png
    :alt: lambda-cloud-installation
    :width: 70%
    :align: center
@@ -72,7 +72,7 @@ Run the following script to ensure that the NVIDIA drivers and Docker integratio
 
 If all checks pass successfully like below, proceed to the next step.
 
-.. figure::../assets/images/cloud/lambda-cloud-verify-installation.png
+.. figure:: ../../assets/images/cloud/lambda-cloud-verify-installation.png
    :alt: lambda-cloud-verify-installation
    :width: 70%
    :align: center
diff --git a/docs/source/getting_started/installation/mac-for-desktop.rst b/docs/source/getting_started/installation/mac-for-desktop.rst
new file mode 100644
index 00000000..01a2b931
--- /dev/null
+++ b/docs/source/getting_started/installation/mac-for-desktop.rst
@@ -0,0 +1,6 @@
+.. _mac-for-desktop:
+
+===============
+Mac for Desktop
+===============
+
diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst
index 0e54dd04..c4c467e6 100644
--- a/docs/source/getting_started/quickstart.rst
+++ b/docs/source/getting_started/quickstart.rst
@@ -33,130 +33,15 @@ Wait for few minutes and run `kubectl get pods -n aibrix-system` to check pod st
 
 Deploy base model
 ^^^^^^^^^^^^^^^^^
 
-Save yaml as `deployment.yaml` and run `kubectl apply -f deployment.yaml`.
-
-.. code-block:: yaml
-
-   apiVersion: apps/v1
-   kind: Deployment
-   metadata:
-     labels:
-       # Note: The label value `model.aibrix.ai/name` here must match with the service name.
-       model.aibrix.ai/name: qwen25-7b-Instruct
-       model.aibrix.ai/port: "8000"
-       adapter.model.aibrix.ai/enabled: true
-     name: qwen25-7b-Instruct
-     namespace: default
-   spec:
-     replicas: 1
-     selector:
-       matchLabels:
-         model.aibrix.ai/name: qwen25-7b-Instruct
-     strategy:
-       rollingUpdate:
-         maxSurge: 25%
-         maxUnavailable: 25%
-       type: RollingUpdate
-     template:
-       metadata:
-         labels:
-           model.aibrix.ai/name: qwen25-7b-Instruct
-       spec:
-         containers:
-           - command:
-               - python3
-               - -m
-               - vllm.entrypoints.openai.api_server
-               - --host
-               - "0.0.0.0"
-               - --port
-               - "8000"
-               - --model
-               - Qwen/Qwen2.5-7B-Instruct
-               - --served-model-name
-               # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
-               - qwen25-7b-Instruct
-               - --trust-remote-code
-               - --enable-lora
-             env:
-               - name: VLLM_ALLOW_RUNTIME_LORA_UPDATING
-                 value: "true"
-             image: aibrix/vllm-openai:v0.6.1.post2
-             imagePullPolicy: Always
-             livenessProbe:
-               failureThreshold: 3
-               httpGet:
-                 path: /health
-                 port: 8000
-                 scheme: HTTP
-               initialDelaySeconds: 90
-               periodSeconds: 5
-               successThreshold: 1
-               timeoutSeconds: 1
-             name: vllm-openai
-             ports:
-               - containerPort: 8000
-                 protocol: TCP
-             readinessProbe:
-               failureThreshold: 3
-               httpGet:
-                 path: /health
-                 port: 8000
-                 scheme: HTTP
-               initialDelaySeconds: 90
-               periodSeconds: 5
-               successThreshold: 1
-               timeoutSeconds: 1
-             resources:
-               limits:
-                 nvidia.com/gpu: "1"
-               requests:
-                 nvidia.com/gpu: "1"
-             volumeMounts:
-               - name: dshm
-                 mountPath: /dev/shm
-         volumes:
-           - name: dshm
-             emptyDir:
-               medium: Memory
-               sizeLimit: "4Gi"
-
-Save yaml as `service.yaml` and run `kubectl apply -f service.yaml`.
-
-.. code-block:: yaml
-
-   apiVersion: v1
-   kind: Service
-   metadata:
-     labels:
-       # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
-       model.aibrix.ai/name: qwen25-7b-Instruct
-       prometheus-discovery: "true"
-     annotations:
-       prometheus.io/scrape: "true"
-       prometheus.io/port: "8080"
-     name: qwen25-7b-Instruct
-     namespace: default
-   spec:
-     ports:
-       - name: serve
-         port: 8000
-         protocol: TCP
-         targetPort: 8000
-       - name: http
-         port: 8080
-         protocol: TCP
-         targetPort: 8080
-     selector:
-       model.aibrix.ai/name: qwen25-7b-Instruct
-     type: ClusterIP
+Save the YAML below as `model.yaml` and run `kubectl apply -f model.yaml`.
 
-.. note::
+.. literalinclude:: ../../../samples/quickstart/model.yaml
+   :language: yaml
 
-    Ensure that:
+Ensure that:
 
-    1. The `Service` name matches the `model.aibrix.ai/name` label value in the `Deployment`.
-    2. The `--served-model-name` argument value in the `Deployment` command is also consistent with the `Service` name and `model.aibrix.ai/name` label.
+1. The `Service` name matches the `model.aibrix.ai/name` label value in the `Deployment`.
+2. The `--served-model-name` argument value in the `Deployment` command is also consistent with the `Service` name and `model.aibrix.ai/name` label.
 
 
 Invoke the model endpoint using gateway api
diff --git a/docs/source/index.rst b/docs/source/index.rst
index e8fab8e6..eaf0cfb2 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -29,7 +29,7 @@ Documentation
 
     designs/architecture.rst
     getting_started/quickstart.rst
-    getting_started/installation.rst
+    getting_started/installation/installation.rst
     getting_started/faq.rst
 
 .. toctree::
diff --git a/samples/adapter/adapter-api-key.yaml b/samples/adapter/adapter-api-key.yaml
new file mode 100644
index 00000000..4a44a798
--- /dev/null
+++ b/samples/adapter/adapter-api-key.yaml
@@ -0,0 +1,17 @@
+apiVersion: model.aibrix.ai/v1alpha1
+kind: ModelAdapter
+metadata:
+  name: qwen-code-lora
+  namespace: default
+  labels:
+    model.aibrix.ai/name: "qwen-code-lora"
+    model.aibrix.ai/port: "8000"
+spec:
+  baseModel: qwen-coder-1.5b-instruct
+  podSelector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
+  additionalConfig:
+    api-key: test-key-1234567890
+  schedulerName: default
diff --git a/samples/adapter/adapter.yaml b/samples/adapter/adapter.yaml
new file mode 100644
index 00000000..4a44a798
--- /dev/null
+++ b/samples/adapter/adapter.yaml
@@ -0,0 +1,15 @@
+apiVersion: model.aibrix.ai/v1alpha1
+kind: ModelAdapter
+metadata:
+  name: qwen-code-lora
+  namespace: default
+  labels:
+    model.aibrix.ai/name: "qwen-code-lora"
+    model.aibrix.ai/port: "8000"
+spec:
+  baseModel: qwen-coder-1.5b-instruct
+  podSelector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  artifactURL: huggingface://ai-blond/Qwen-Qwen2.5-Coder-1.5B-Instruct-lora
+  schedulerName: default
diff --git a/samples/adapter/base.yaml b/samples/adapter/base.yaml
new file mode 100644
index 00000000..3cd58423
--- /dev/null
+++ b/samples/adapter/base.yaml
@@ -0,0 +1,73 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: qwen-coder-1.5b-instruct # Note: The label value `model.aibrix.ai/name` here must match with the service name.
+    model.aibrix.ai/port: "8000"
+    adapter.model.aibrix.ai/enabled: "true"
+  name: qwen-coder-1.5b-instruct
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: qwen-coder-1.5b-instruct
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: qwen-coder-1.5b-instruct
+    spec:
+      containers:
+        - command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - Qwen/Qwen2.5-Coder-1.5B-Instruct
+            - --served-model-name
+            # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
+            - qwen-coder-1.5b-instruct
+            - --enable-lora
+          image: vllm/vllm-openai:v0.7.1
+          imagePullPolicy: Always
+          name: vllm-openai
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+        - name: aibrix-runtime
+          image: aibrix/runtime:v0.2.0-rc.2
+          command:
+            - aibrix_runtime
+            - --port
+            - "8080"
+          env:
+            - name: INFERENCE_ENGINE
+              value: vllm
+            - name: INFERENCE_ENGINE_ENDPOINT
+              value: http://localhost:8000
+          ports:
+            - containerPort: 8080
+              protocol: TCP
+          livenessProbe:
+            httpGet:
+              path: /healthz
+              port: 8080
+            initialDelaySeconds: 3
+            periodSeconds: 2
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 8080
+            initialDelaySeconds: 5
+            periodSeconds: 10
+
+---
diff --git a/samples/distributed/multi-host.yaml b/samples/distributed/multi-host.yaml
new file mode 100644
index 00000000..7205a010
--- /dev/null
+++ b/samples/distributed/multi-host.yaml
@@ -0,0 +1,54 @@
+apiVersion: orchestration.aibrix.ai/v1alpha1
+kind: RayClusterFleet
+metadata:
+  labels:
+    app.kubernetes.io/name: aibrix
+    app.kubernetes.io/managed-by: kustomize
+  name: facebook-opt-13b
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: facebook-opt-13b
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: facebook-opt-13b
+      annotations:
+        ray.io/overwrite-container-cmd: "true"
+    spec:
+      rayVersion: '2.10.0' # should match the Ray version in the image of the containers
+      headGroupSpec:
+        rayStartParams:
+          dashboard-host: '0.0.0.0'
+        template:
+          spec:
+            containers:
+              - name: ray-head
+                image: aibrix/vllm-openai:v0.6.1.post2-distributed
+                ports:
+                  - containerPort: 6379
+                    name: gcs-server
+                  - containerPort: 8265
+                    name: dashboard
+                  - containerPort: 10001
+                    name: client
+                  - containerPort: 8000
+                    name: service
+                command: ["/bin/bash", "-lc", "--"]
+                # Starting from v1.1.0, KubeRay injects the environment variable `KUBERAY_GEN_RAY_START_CMD`
+                # into the Ray container. This variable can be used to retrieve the generated Ray start command.
+                # Note that this environment variable does not include the `ulimit` command.
+                args: ["ulimit -n 65536; echo head; ray start --head --num-cpus=8 --num-gpus=2 --dashboard-host=0.0.0.0 --metrics-export-port=8080 --dashboard-agent-listen-port=52365; vllm serve /models/llama-2-7b-hf/ --served-model-name meta-llama/llama-2-7b-hf --tensor-parallel-size 2 --distributed-executor-backend ray"]
+                resources:
+                  limits:
+                    cpu: "8000m"
+                    nvidia.com/gpu: 2
+                  requests:
+                    cpu: "8000m"
+                    nvidia.com/gpu: 2
diff --git a/samples/kvcache/deployment.yaml b/samples/kvcache/deployment.yaml
new file mode 100644
index 00000000..44b6f2c3
--- /dev/null
+++ b/samples/kvcache/deployment.yaml
@@ -0,0 +1,64 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: aibrix-model-deepseek-coder-33b-instruct
+  labels:
+    model.aibrix.ai/name: deepseek-coder-33b-instruct
+    model.aibrix.ai/port: "8000"
+spec:
+  strategy:
+    rollingUpdate:
+      maxSurge: 1
+      maxUnavailable: 1
+    type: RollingUpdate
+  template:
+    spec:
+      containers:
+        - name: vllm-openai
+          image: aibrix/vllm-openai:v0.6.1-edb07092-20250118
+          imagePullPolicy: Always
+          command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --port
+            - "8000"
+            - --model
+            - deepseek-ai/deepseek-coder-33b-instruct
+            - --served-model-name
+            - deepseek-coder-33b-instruct
+            - --distributed-executor-backend
+            - ray
+            - --trust-remote-code
+            - --tensor-parallel-size
+            - "4"
+            - --max-model-len
+            - "17000"
+            - --enable-prefix-caching
+            - --disable-fastapi-docs
+          env:
+            - name: VLLM_USE_VINEYARD_CACHE
+              value: "1"
+            - name: VINEYARD_CACHE_CPU_MEM_LIMIT_GB
+              value: "70"
+            - name: AIBRIX_LLM_KV_CACHE
+              value: "1"
+            - name: AIBRIX_LLM_KV_CACHE_KV_CACHE_NS
+              value: "aibrix"
+            - name: AIBRIX_LLM_KV_CACHE_CHUNK_SIZE
+              value: "16"
+            - name: AIBRIX_LLM_KV_CACHE_SOCKET
+              value: /var/run/vineyard.sock
+            - name: AIBRIX_LLM_KV_CACHE_RPC_ENDPOINT
+              value: "aibrix-model-deepseek-coder-33b-kvcache-rpc:9600"
+            - name: VINEYARD_CACHE_ENABLE_ASYNC_UPDATE
+              value: "1"
+            - name: "VINEYARD_CACHE_METRICS_ENABLED"
+              value: "1"
+          volumeMounts:
+            - mountPath: /var/run
+              name: kvcache-socket
+      volumes:
+        - name: kvcache-socket
+          hostPath:
+            path: /var/run/vineyard-kubernetes/default/aibrix-model-deepseek-coder-33b-kvcache
\ No newline at end of file
diff --git a/samples/quickstart/model.yaml b/samples/quickstart/model.yaml
new file mode 100644
index 00000000..781c4d15
--- /dev/null
+++ b/samples/quickstart/model.yaml
@@ -0,0 +1,70 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b # Note: The label value `model.aibrix.ai/name` here must match with the service name.
+    model.aibrix.ai/port: "8000"
+  name: deepseek-r1-distill-llama-8b
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  template:
+    metadata:
+      labels:
+        model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    spec:
+      containers:
+        - command:
+            - python3
+            - -m
+            - vllm.entrypoints.openai.api_server
+            - --host
+            - "0.0.0.0"
+            - --port
+            - "8000"
+            - --model
+            - deepseek-ai/DeepSeek-R1-Distill-Llama-8B
+            # - Qwen/Qwen2.5-Coder-1.5B-Instruct  # smaller alternative for quick local tests
+            - --served-model-name
+            # Note: The `--served-model-name` argument value must also match the Service name and the Deployment label `model.aibrix.ai/name`
+            - deepseek-r1-distill-llama-8b
+          image: vllm/vllm-openai:v0.7.1
+          imagePullPolicy: Always
+          name: vllm-openai
+          ports:
+            - containerPort: 8000
+              protocol: TCP
+          resources:
+            limits:
+              nvidia.com/gpu: "1"
+            requests:
+              nvidia.com/gpu: "1"
+---
+
+apiVersion: v1
+kind: Service
+metadata:
+  labels:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+    prometheus-discovery: "true"
+  annotations:
+    prometheus.io/scrape: "true"
+    prometheus.io/port: "8080"
+  name: deepseek-r1-distill-llama-8b # Note: The Service name must match the label value `model.aibrix.ai/name` in the Deployment
+  namespace: default
+spec:
+  ports:
+    - name: serve
+      port: 8000
+      protocol: TCP
+      targetPort: 8000
+    - name: http
+      port: 8080
+      protocol: TCP
+      targetPort: 8080
+  selector:
+    model.aibrix.ai/name: deepseek-r1-distill-llama-8b
+  type: LoadBalancer
\ No newline at end of file
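Once the quickstart manifest above is applied, the deployment can be smoke-tested through the OpenAI-compatible API served by the vLLM container. The sketch below is illustrative only: it port-forwards the sample's own Service rather than the AIBrix gateway, and the prompt is arbitrary; only the Service name, namespace, port, and model name are taken from samples/quickstart/model.yaml.

    # forward the sample Service locally (8000 is the "serve" port defined above)
    kubectl -n default port-forward svc/deepseek-r1-distill-llama-8b 8000:8000

    # in another shell: issue a chat completion against the vLLM OpenAI-compatible endpoint
    curl -s http://localhost:8000/v1/chat/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "deepseek-r1-distill-llama-8b", "messages": [{"role": "user", "content": "Say hello"}]}'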