diff --git a/.github/workflows/automation.yaml b/.github/workflows/automation.yaml new file mode 100644 index 00000000..1de3dbc8 --- /dev/null +++ b/.github/workflows/automation.yaml @@ -0,0 +1,566 @@ +name: Container Release Process Automation + +on: + pull_request: + repository_dispatch: + types: [container-pushed] + workflow_dispatch: + inputs: + models: + description: "Comma-separated list of model names to deploy" + required: false + default: "llama-3-3-70b-instruct,llama-4-scout-17b-16e-instruct" + type: string + +env: + REGISTRY: ord.ocir.io + REGISTRY_NAMESPACE: idqj093njucb + K8S_NAMESPACE: github-actions + CONTAINER_NAME: official-sgl + # TODO: Add all supported models + ISVC_MODELS: llama-3-3-70b-instruct,llama-4-scout-17b-16e-instruct + BUCKET_NAME: ome-benchmark-results + BENCHMARK_CONTAINER: ghcr.io/moirai-internal/genai-bench:v0.0.2 + # TODO: Switch to official sanity check image + SANITY_CHECK_CONTAINER: phx.ocir.io/idqj093njucb/sanity-check:dev + +jobs: + deploy: + runs-on: gh-arc-runner + permissions: + contents: read + packages: write + id-token: write + env: + IMAGE_NAME: ${{ github.event.client_payload.image_name }} + FULL_TAG: ${{ github.event.client_payload.full_tag }} + BRANCH_NAME: ${{ github.event.client_payload.branch_name }} + VERSION: ${{ github.event.client_payload.version }} + COMMIT_HASH: ${{ github.event.client_payload.commit_hash }} + + steps: + - name: Set up kubectl + uses: azure/setup-kubectl@v3 + with: + version: 'v1.30.1' + + - name: Prepare model list and sanitize names + id: models + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + MODELS="${{ github.event.inputs.models }}" + else + # Default models for auto-deployment + MODELS=${ISVC_MODELS} + fi + + # Helper functions for name sanitization + sanitize_model_name() { + echo "$1" | sed 's/[^a-zA-Z0-9]/-/g' | tr '[:upper:]' '[:lower:]' + } + + sanitize_namespace() { + local namespace="$1" + namespace=$(echo "$namespace" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') + echo "$namespace" | cut -c1-63 | sed 's/-$//' + } + + create_runtime_name() { + local model_name="$1" + local runtime_name="srt-$model_name" + runtime_name=$(echo "$runtime_name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') + echo "$runtime_name" | cut -c1-63 | sed 's/-$//' + } + + # Process each model and create comprehensive JSON with all sanitized names + IFS=',' read -ra MODEL_ARRAY <<< "$MODELS" + MODELS_WITH_NAMES="[]" + + for model in "${MODEL_ARRAY[@]}"; do + MODEL_NAME=$(sanitize_model_name "$model") + NAMESPACE=$(sanitize_namespace "${K8S_NAMESPACE}") + RUNTIME_NAME=$(create_runtime_name "$MODEL_NAME") + + # Create JSON object for this model + MODEL_OBJECT=$(jq -n \ + --arg model "$model" \ + --arg model_name "$MODEL_NAME" \ + --arg namespace "$NAMESPACE" \ + --arg runtime_name "$RUNTIME_NAME" \ + '{ + original: $model, + model_name: $model_name, + namespace: $namespace, + runtime_name: $runtime_name + }') + + # Add to array + MODELS_WITH_NAMES=$(echo "$MODELS_WITH_NAMES" | jq --argjson obj "$MODEL_OBJECT" '. += [$obj]') + done + + # Output the comprehensive model metadata + echo "models_metadata=$(echo "$MODELS_WITH_NAMES" | tr -d '\n')" >> $GITHUB_OUTPUT + + echo "Models to deploy: $MODELS" + echo "Models with sanitized names:" + echo "$MODELS_WITH_NAMES" | jq . 
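The sanitization helpers and the jq accumulation in this step are plain bash and can be exercised outside the workflow. A minimal local sketch, illustrative only; the input model names below are just examples:

#!/usr/bin/env bash
# Standalone sketch of the name-sanitization and metadata-building logic above.
set -euo pipefail

sanitize_model_name() {
  echo "$1" | sed 's/[^a-zA-Z0-9]/-/g' | tr '[:upper:]' '[:lower:]'
}

MODELS="Llama-3.3-70B-Instruct,llama-4-scout-17b-16e-instruct"   # example input
METADATA="[]"
IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
for model in "${MODEL_ARRAY[@]}"; do
  name=$(sanitize_model_name "$model")
  METADATA=$(echo "$METADATA" | jq --arg o "$model" --arg n "$name" \
    '. += [{original: $o, model_name: $n, runtime_name: ("srt-" + $n)}]')
done
echo "$METADATA" | jq .
# model_name comes out as llama-3-3-70b-instruct and llama-4-scout-17b-16e-instruct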
+ + - name: Deploy to Kubernetes + id: deployment + # Change benchmark endpoint to use inference service + run: | + MODELS_METADATA='${{ steps.models.outputs.models_metadata }}' + IMAGE="${IMAGE_NAME}:${FULL_TAG}" + + # Initialize status tracking + DEPLOYMENT_STATUS="[]" + + # Deploy each model sequentially using pre-computed names + # Use process substitution to avoid subshell and preserve variables + while read -r model_info; do + # Extract all names from JSON + MODEL=$(echo "$model_info" | jq -r '.original') + MODEL_NAME=$(echo "$model_info" | jq -r '.model_name') + NAMESPACE=$(echo "$model_info" | jq -r '.namespace') + RUNTIME_NAME=$(echo "$model_info" | jq -r '.runtime_name') + + echo "Deploying model: $MODEL" + echo "Namespace: $NAMESPACE" + echo "Runtime: $RUNTIME_NAME" + + # Create InferenceService manifest + cat < isvc-$MODEL_NAME.yaml + --- + apiVersion: ome.io/v1beta1 + kind: InferenceService + metadata: + name: $MODEL_NAME + namespace: $NAMESPACE + labels: + sglang.version: "$VERSION" + sidecar.istio.io/inject : "true" + annotations: + sglang.deployed-by: "github-actions" + sglang.deployment-time: "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + spec: + engine: + minReplicas: 1 + maxReplicas: 1 + runner: + name: ome-container + image: $IMAGE + model: + name: $MODEL_NAME + runtime: + name: $RUNTIME_NAME + EOF + + # create benchmark job manifest + cat < benchmark-$MODEL_NAME.yaml + apiVersion: ome.io/v1beta1 + kind: BenchmarkJob + metadata: + name: benchmark-$MODEL_NAME + namespace: $NAMESPACE + spec: + podOverride: + image: ${BENCHMARK_CONTAINER} + huggingFaceSecretReference: + name: huggingface-secret + endpoint: + inferenceService: + name: $MODEL_NAME + namespace: $NAMESPACE + # endpoint: + # url: http://$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local:8080 + # apiFormat: openai + # modelName: $MODEL_NAME + task: text-to-text + # numConcurrency: + # - 1 + # - 128 + maxTimePerIteration: 10 + maxRequestsPerIteration: 1000 + outputLocation: + storageUri: "oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}" + parameters: + auth: "instance_principal" + region: "eu-frankfurt-1" + EOF + + # create sanity-check job manifest + cat < sanity-check-$MODEL_NAME.yaml + apiVersion: batch/v1 + kind: Job + metadata: + name: sanity-check-$MODEL_NAME + namespace: $NAMESPACE + labels: + sglang.version: "$VERSION" + spec: + template: + spec: + containers: + - name: sanity-check + image: ${SANITY_CHECK_CONTAINER} + imagePullPolicy: Always + env: + - name: BASE_URL + value: "http://$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local:8080" + - name: VLLM_ENDPOINT + value: "$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local" + - name: VLLM_PORT + value: "8080" + restartPolicy: Never + backoffLimit: 3 + EOF + + # Apply the InferenceService + kubectl apply -f isvc-$MODEL_NAME.yaml + + # Wait for InferenceService to be ready (with timeout) + echo "Waiting for InferenceService $MODEL_NAME to be ready in namespace $NAMESPACE..." 
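# Note on the wait that follows: the loop polls the InferenceService's Ready condition
# directly. A more compact form is possible with `kubectl wait`; shown here only as an
# illustrative sketch, assuming the CRD exposes the same Ready condition the loop inspects:
#
#   kubectl wait inferenceservice/$MODEL_NAME -n $NAMESPACE \
#     --for=condition=Ready --timeout=600s || echo "Timed out waiting for $MODEL_NAME"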
+ + # Check if the InferenceService exists and wait for it to be ready + TIMEOUT=600 + ELAPSED=0 + READY=false + + while [ $ELAPSED -lt $TIMEOUT ]; do + # Check if InferenceService exists and get its status + if kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + STATUS=$(kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown") + if [ "$STATUS" = "True" ]; then + READY=true + break + fi + echo "InferenceService status: $STATUS (waiting...)" + else + echo "InferenceService not found yet (waiting...)" + fi + + sleep 60 + ELAPSED=$((ELAPSED + 10)) + done + + if [ "$READY" = "true" ]; then + echo "โœ… Successfully deployed InferenceService for model: $MODEL" + echo " Namespace: $NAMESPACE" + echo " InferenceService: $MODEL_NAME" + echo " Runtime: $RUNTIME_NAME" + + # Track InferenceService success + ISVC_STATUS="SUCCESS" + + # Deploy benchmark job now that InferenceService is ready + echo "๐Ÿƒ Starting benchmark job for model: $MODEL" + kubectl apply -f benchmark-$MODEL_NAME.yaml + + # Wait for benchmark job to complete (with timeout) + echo "Waiting for BenchmarkJob benchmark-$MODEL_NAME to complete in namespace $NAMESPACE..." + + BENCHMARK_TIMEOUT=18000 # 5 hours + BENCHMARK_ELAPSED=0 + BENCHMARK_SUCCESS=false + + while [ $BENCHMARK_ELAPSED -lt $BENCHMARK_TIMEOUT ]; do + # Check if BenchmarkJob exists and get its status + if kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + # Get the overall status/state of the benchmark job + BENCHMARK_STATUS=$(kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.state}' 2>/dev/null || echo "Running") + + if [ "$BENCHMARK_STATUS" = "Succeeded" ] || [ "$BENCHMARK_STATUS" = "Completed" ]; then + BENCHMARK_SUCCESS=true + break + elif [ "$BENCHMARK_STATUS" = "Failed" ]; then + echo "โŒ Benchmark job failed for model: $MODEL" + break + fi + # echo "BenchmarkJob status: $BENCHMARK_STATUS (waiting...)" + else + echo "BenchmarkJob not found yet (waiting...)" + fi + + sleep 30 + BENCHMARK_ELAPSED=$((BENCHMARK_ELAPSED + 30)) + done + + if [ "$BENCHMARK_SUCCESS" = "true" ]; then + echo "โœ… Successfully completed benchmark for model: $MODEL" + echo " Benchmark results should be available at: oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}" + BENCHMARK_STATUS="SUCCESS" + else + echo "โŒ Failed to complete benchmark or timeout for model: $MODEL" + echo "Checking BenchmarkJob status..." + kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE -o yaml || echo "BenchmarkJob not found" + echo "Checking benchmark job events..." + kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | grep -i benchmark | tail -10 + + # Get benchmark job pod logs for debugging + echo "Fetching benchmark job pod logs for debugging..." 
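# The lookup below only inspects the first pod matching the job-name selector. If the
# benchmark pod was retried, an illustrative sketch like this would dump logs from every
# pod the job created (same selector as below, which is assumed to be set by the
# BenchmarkJob controller):
#
#   for pod in $(kubectl get pods -n $NAMESPACE -l job-name=benchmark-$MODEL_NAME \
#       -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
#     echo "--- Logs: $pod ---"
#     kubectl logs "$pod" -n $NAMESPACE --tail=100 || true
#   done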
+ BENCHMARK_POD=$(kubectl get pods -n $NAMESPACE -l job-name=benchmark-$MODEL_NAME -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$BENCHMARK_POD" ]; then + echo "--- Benchmark Pod Logs ---" + kubectl logs $BENCHMARK_POD -n $NAMESPACE --tail=100 || echo "Failed to get pod logs" + echo "--- End of Benchmark Pod Logs ---" + else + echo "No benchmark pod found for job benchmark-$MODEL_NAME" + fi + BENCHMARK_STATUS="FAILED" + fi + + # Deploy sanity-check job after benchmark + echo "๐Ÿ” Starting sanity-check job for model: $MODEL" + kubectl apply -f sanity-check-$MODEL_NAME.yaml + + # Wait for sanity-check job to complete (with timeout) + echo "Waiting for Job sanity-check-$MODEL_NAME to complete in namespace $NAMESPACE..." + + SANITY_TIMEOUT=7200 # 120 minutes + SANITY_ELAPSED=0 + SANITY_SUCCESS=false + + while [ $SANITY_ELAPSED -lt $SANITY_TIMEOUT ]; do + # Check if Job exists and get its status + if kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + # Get the job completion status + SANITY_CHECK_STATUS=$(kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || echo "Running") + + if [ "$SANITY_CHECK_STATUS" = "Complete" ]; then + SANITY_SUCCESS=true + break + elif [ "$SANITY_CHECK_STATUS" = "Failed" ]; then + echo "โŒ Sanity-check job failed for model: $MODEL" + break + fi + # echo "Sanity-check job still running (waiting...)" + else + echo "Sanity-check job not found yet (waiting...)" + fi + + sleep 30 + SANITY_ELAPSED=$((SANITY_ELAPSED + 30)) + done + + if [ "$SANITY_SUCCESS" = "true" ]; then + echo "โœ… Successfully completed sanity-check for model: $MODEL" + SANITY_STATUS="SUCCESS" + else + echo "โŒ Failed to complete sanity-check or timeout for model: $MODEL" + echo "Checking sanity-check job status..." + kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE -o yaml || echo "Sanity-check job not found" + echo "Checking sanity-check job events..." + kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | grep -i sanity-check | tail -10 + + SANITY_STATUS="FAILED" + fi + + # Get sanity-check job logs + echo "Fetching sanity-check job logs for debugging..." + kubectl logs -n $NAMESPACE jobs/sanity-check-$MODEL_NAME --tail=100 || echo "Failed to get sanity check job logs" + + # Clean up benchmark job, sanity-check job and inference service + echo "๐Ÿงน Cleaning up resources for model: $MODEL" + kubectl delete -f benchmark-$MODEL_NAME.yaml || echo "Failed to delete benchmark job" + kubectl delete -f sanity-check-$MODEL_NAME.yaml || echo "Failed to delete sanity-check job" + + else + echo "โŒ Failed to deploy or timeout waiting for InferenceService: $MODEL" + echo "Checking InferenceService status..." + kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE -o yaml || echo "InferenceService not found" + echo "Checking events in namespace..." 
+ kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | tail -10 + # Continue with next model instead of failing the entire workflow + ISVC_STATUS="FAILED" + BENCHMARK_STATUS="SKIPPED" + SANITY_STATUS="SKIPPED" + fi + + # Clean up inference service + kubectl delete -f isvc-$MODEL_NAME.yaml || echo "Failed to delete inference service" + + # Add status to tracking JSON + STATUS_ENTRY=$(jq -n \ + --arg model "$MODEL" \ + --arg model_name "$MODEL_NAME" \ + --arg namespace "$NAMESPACE" \ + --arg runtime_name "$RUNTIME_NAME" \ + --arg isvc_status "${ISVC_STATUS:-UNKNOWN}" \ + --arg benchmark_status "${BENCHMARK_STATUS:-UNKNOWN}" \ + --arg sanity_status "${SANITY_STATUS:-UNKNOWN}" \ + '{ + model: $model, + model_name: $model_name, + namespace: $namespace, + runtime_name: $runtime_name, + isvc_status: $isvc_status, + benchmark_status: $benchmark_status, + sanity_status: $sanity_status + }') + + # Update the deployment status JSON + DEPLOYMENT_STATUS=$(echo "$DEPLOYMENT_STATUS" | jq --argjson entry "$STATUS_ENTRY" '. += [$entry]') + + # Clean up manifest files + rm -f isvc-$MODEL_NAME.yaml + rm -f benchmark-$MODEL_NAME.yaml + rm -f sanity-check-$MODEL_NAME.yaml + + echo "---" + done < <(echo "$MODELS_METADATA" | jq -c '.[]') + + # Output the deployment status for use in summary + echo "deployment_status=$(echo "$DEPLOYMENT_STATUS" | tr -d '\n')" >> $GITHUB_OUTPUT + echo "Deployment status saved:" + echo "$DEPLOYMENT_STATUS" | jq . + + - name: Cleanup on failure or cancellation + if: failure() || cancelled() + run: | + echo "๐Ÿงน Cleaning up resources after workflow failure or cancellation..." + + # Use the same model names that were prepared earlier + MODELS_METADATA='${{ steps.models.outputs.models_metadata }}' + + # Clean up any remaining resources using pre-computed names + echo "$MODELS_METADATA" | jq -c '.[]' | while read -r model_info; do + # Extract all names from JSON + MODEL=$(echo "$model_info" | jq -r '.original') + MODEL_NAME=$(echo "$model_info" | jq -r '.model_name') + NAMESPACE=$(echo "$model_info" | jq -r '.namespace') + + echo "Cleaning up resources for model: $MODEL" + + # Try to delete InferenceService + kubectl delete inferenceservice $MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No InferenceService $MODEL_NAME to cleanup" + + # Try to delete BenchmarkJob + kubectl delete benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No BenchmarkJob benchmark-$MODEL_NAME to cleanup" + + # Try to delete sanity-check Job + kubectl delete job sanity-check-$MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No sanity-check Job sanity-check-$MODEL_NAME to cleanup" + + # Clean up any manifest files + rm -f isvc-$MODEL_NAME.yaml benchmark-$MODEL_NAME.yaml sanity-check-$MODEL_NAME.yaml + done + + echo "Cleanup completed" + + - name: Cleanup kubeconfig + if: always() + run: | + echo "๐Ÿงน Cleaning up kubeconfig credentials..." 
+ rm -f $HOME/.kube/config + echo "Kubeconfig cleanup completed" + + - name: Create deployment summary + run: | + DEPLOYMENT_STATUS='${{ steps.deployment.outputs.deployment_status }}' + + echo "## ๐Ÿš€ SGLang Deployment & Benchmark Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Branch:** \`${BRANCH_NAME}\`" >> $GITHUB_STEP_SUMMARY + echo "**Version:** \`${VERSION}\`" >> $GITHUB_STEP_SUMMARY + echo "**Commit:** \`${COMMIT_HASH}\`" >> $GITHUB_STEP_SUMMARY + echo "**Docker Image:** \`${IMAGE_NAME}:${FULL_TAG}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + echo "### ๐Ÿ“‹ Deployment Overview" >> $GITHUB_STEP_SUMMARY + echo "This workflow deployed and tested each model sequentially:" >> $GITHUB_STEP_SUMMARY + echo "1. **InferenceService Deployment** - Deploy SGLang runtime with the model" >> $GITHUB_STEP_SUMMARY + echo "2. **Benchmark Execution** - Run performance benchmarks against the deployed service" >> $GITHUB_STEP_SUMMARY + echo "3. **Sanity Check** - Run sanity check validation against the deployed service" >> $GITHUB_STEP_SUMMARY + echo "4. **Resource Cleanup** - Clean up InferenceService, BenchmarkJob, and sanity-check Job after completion" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Read deployment status from step output + if [ -n "$DEPLOYMENT_STATUS" ] && [ "$DEPLOYMENT_STATUS" != "[]" ]; then + echo "### ๐Ÿ“Š Model Deployment Results" >> $GITHUB_STEP_SUMMARY + + # Count overall statistics + TOTAL_MODELS=$(echo "$DEPLOYMENT_STATUS" | jq 'length') + SUCCESSFUL_ISVC=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.isvc_status == "SUCCESS")] | length') + SUCCESSFUL_BENCHMARKS=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.benchmark_status == "SUCCESS")] | length') + SUCCESSFUL_SANITY=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.sanity_status == "SUCCESS")] | length') + FAILED_ISVC=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.isvc_status == "FAILED")] | length') + FAILED_BENCHMARKS=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.benchmark_status == "FAILED")] | length') + FAILED_SANITY=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.sanity_status == "FAILED")] | length') + + echo "**Overall Status:**" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿ“ˆ Total Models: $TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- โœ… InferenceServices Deployed: $SUCCESSFUL_ISVC/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿƒ Benchmarks Completed: $SUCCESSFUL_BENCHMARKS/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿ” Sanity Checks Completed: $SUCCESSFUL_SANITY/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + if [ "$FAILED_ISVC" -gt 0 ]; then + echo "- โŒ InferenceService Failures: $FAILED_ISVC" >> $GITHUB_STEP_SUMMARY + fi + if [ "$FAILED_BENCHMARKS" -gt 0 ]; then + echo "- โŒ Benchmark Failures: $FAILED_BENCHMARKS" >> $GITHUB_STEP_SUMMARY + fi + if [ "$FAILED_SANITY" -gt 0 ]; then + echo "- โŒ Sanity Check Failures: $FAILED_SANITY" >> $GITHUB_STEP_SUMMARY + fi + echo "" >> $GITHUB_STEP_SUMMARY + + # Individual model status + echo "$DEPLOYMENT_STATUS" | jq -c '.[]' | while read -r status_info; do + MODEL=$(echo "$status_info" | jq -r '.model') + MODEL_NAME=$(echo "$status_info" | jq -r '.model_name') + NAMESPACE=$(echo "$status_info" | jq -r '.namespace') + RUNTIME_NAME=$(echo "$status_info" | jq -r '.runtime_name') + ISVC_STATUS=$(echo "$status_info" | jq -r '.isvc_status') + BENCHMARK_STATUS=$(echo "$status_info" | jq -r '.benchmark_status') + SANITY_STATUS=$(echo "$status_info" | jq -r '.sanity_status') + + # Status emojis + 
case "$ISVC_STATUS" in + "SUCCESS") ISVC_EMOJI="โœ…" ;; + "FAILED") ISVC_EMOJI="โŒ" ;; + *) ISVC_EMOJI="โ“" ;; + esac + + case "$BENCHMARK_STATUS" in + "SUCCESS") BENCHMARK_EMOJI="โœ…" ;; + "FAILED") BENCHMARK_EMOJI="โŒ" ;; + "SKIPPED") BENCHMARK_EMOJI="โญ๏ธ" ;; + *) BENCHMARK_EMOJI="โ“" ;; + esac + + case "$SANITY_STATUS" in + "SUCCESS") SANITY_EMOJI="โœ…" ;; + "FAILED") SANITY_EMOJI="โŒ" ;; + "SKIPPED") SANITY_EMOJI="โญ๏ธ" ;; + *) SANITY_EMOJI="โ“" ;; + esac + + echo "#### ๐Ÿค– $MODEL" >> $GITHUB_STEP_SUMMARY + echo "- **Namespace:** \`$NAMESPACE\`" >> $GITHUB_STEP_SUMMARY + echo "- **InferenceService:** \`$MODEL_NAME\` $ISVC_EMOJI \`$ISVC_STATUS\`" >> $GITHUB_STEP_SUMMARY + echo "- **Runtime:** \`$RUNTIME_NAME\`" >> $GITHUB_STEP_SUMMARY + echo "- **Benchmark:** $BENCHMARK_EMOJI \`$BENCHMARK_STATUS\`" >> $GITHUB_STEP_SUMMARY + echo "- **Sanity Check:** $SANITY_EMOJI \`$SANITY_STATUS\`" >> $GITHUB_STEP_SUMMARY + + if [ "$BENCHMARK_STATUS" = "SUCCESS" ]; then + echo "- **Results:** \`oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}\`" >> $GITHUB_STEP_SUMMARY + fi + echo "" >> $GITHUB_STEP_SUMMARY + done + else + echo "### โ— No deployment status available" >> $GITHUB_STEP_SUMMARY + echo "The deployment status was not found. This may indicate the workflow was interrupted before completion." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + + echo "### โš™๏ธ Benchmark Configuration" >> $GITHUB_STEP_SUMMARY + echo "- **Engine:** SGLang" >> $GITHUB_STEP_SUMMARY + echo "- **GPU Type:** H100" >> $GITHUB_STEP_SUMMARY + echo "- **Task:** text-to-text" >> $GITHUB_STEP_SUMMARY + echo "- **Benchmark Image:** \`${BENCHMARK_CONTAINER}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + echo "### ๐Ÿ“ˆ Results Location" >> $GITHUB_STEP_SUMMARY + echo "All benchmark results are stored in OCI Object Storage:" >> $GITHUB_STEP_SUMMARY + echo "- **Bucket:** \`${BUCKET_NAME}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Path Pattern:** \`${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}/\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "_Note: All resources (InferenceServices and BenchmarkJobs) were cleaned up after completion._" >> $GITHUB_STEP_SUMMARY diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml index c529952e..82ba8dc7 100644 --- a/config/models/kustomization.yaml +++ b/config/models/kustomization.yaml @@ -2,10 +2,19 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - meta/Llama-3.3-70B-instruct.yaml - - meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml - - meta/Llama-4-Scout-17B-16E-Instruct.yaml - intfloat/e5-mistral-7b-instruct.yaml - microsoft/Phi-3-vision-128k-instruct.yaml - deepseek-ai/DeepSeek-V3.yaml - - deepseek-ai/DeepSeek-R1.yaml \ No newline at end of file + - deepseek-ai/DeepSeek-R1.yaml + - meta/Llama-3.1-405B-Instruct-FP8.yaml + - meta/Llama-3.1-8B-Instruct.yaml + - meta/Llama-3.1-70B-Instruct.yaml + - meta/Llama-3.2-11B-Vision-Instruct.yaml + - meta/Llama-3.2-90B-Vision-Instruct.yaml + - meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml + - meta/Llama-3.3-70B-Instruct.yaml + - meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml + - meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml + - meta/Llama-4-Scout-17B-16E-Instruct.yaml + - openai/gpt-oss-20b.yaml + - openai/gpt-oss-120b.yaml diff --git a/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml b/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml index 982e757f..cec4375d 100644 --- 
a/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml +++ b/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml @@ -7,6 +7,6 @@ spec: disabled: false version: "1.0.0" storage: - storageUri: hf://meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 - path: /raid/models/meta/Llama-3.1-405B-Instruct-FP8 + storageUri: hf://meta-llama/Llama-3.1-405B-Instruct-FP8 + path: /raid/models/meta/llama-3-1-405b-instruct-fp8 key: "hf-token" diff --git a/config/models/meta/Llama-3.1-8B-Instruct.yaml b/config/models/meta/Llama-3.1-8B-Instruct.yaml new file mode 100644 index 00000000..9b47d01a --- /dev/null +++ b/config/models/meta/Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-3-1-8b-instruct +spec: + vendor: meta + disabled: false + version: "1.0.0" + displayName: meta.llama-3.1-8b-instruct + storage: + storageUri: hf://meta-llama/Llama-3.1-8B-Instruct + path: /raid/models/meta/llama-3-1-8b-instruct + key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml b/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml index fc2ee760..3166df39 100644 --- a/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml +++ b/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-11B-Vision-Instruct - path: /raid/models/meta/Llama-3.2-11B-Vision-Instruct + path: /raid/models/meta/llama-3-2-11b-vision-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-3B-Instruct.yaml b/config/models/meta/Llama-3.2-3B-Instruct.yaml index 219fa8f8..4d63f093 100644 --- a/config/models/meta/Llama-3.2-3B-Instruct.yaml +++ b/config/models/meta/Llama-3.2-3B-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-3B-Instruct - path: /raid/models/meta/Llama-3.2-3B-Instruct + path: /raid/models/meta/llama-3-2-3b-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml b/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml new file mode 100644 index 00000000..8d68fda8 --- /dev/null +++ b/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-3-2-90b-vision-instruct-fp8 +spec: + displayName: meta.llama-3.2-90b-vision-instruct-fp8 + vendor: meta + disabled: false + version: "1.0.0" + storage: + storageUri: hf://RedHatAI/Llama-3.2-90B-Vision-Instruct-FP8-dynamic + path: /raid/models/meta/llama-3-2-90b-vision-instruct-fp8-dynamic + key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml b/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml index 5ee704cd..bd8c5d6d 100644 --- a/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml +++ b/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-90B-Vision-Instruct - path: /raid/models/meta/Llama-3.2-90B-Vision-Instruct + path: /raid/models/meta/llama-3-2-90b-vision-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml b/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml new file mode 100644 index 00000000..65ebbed8 --- /dev/null +++ b/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 
+kind: ClusterBaseModel +metadata: + name: llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + displayName: meta.llama-3.3-70b-instruct-fp8-dynamic + storage: + storageUri: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic + path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic + key: "hf-token" + vendor: meta + version: "1.0.0" \ No newline at end of file diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 111d1161..f5da4e48 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -4,10 +4,18 @@ kind: Kustomization resources: - srt/deepseek-rdma-pd-rt.yaml - srt/deepseek-rdma-rt.yaml +- srt/e5-mistral-7b-instruct-rt.yaml +- srt/llama-3-1-8b-instruct-rt.yaml +- vllm/llama-3-1-8b-instruct-rt.yaml +- srt/llama-3-1-70b-instruct-rt.yaml +- vllm/llama-3-1-405b-instruct-fp8-rt.yaml +- vllm/llama-3-2-11b-vision-instruct-rt.yaml +- vllm/llama-3-2-90b-vision-instruct-rt.yaml +- vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml +- srt/llama-3-3-70b-instruct-rt.yaml +- srt/llama-3-3-70b-instruct-pd-rt.yaml +- srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml - srt/llama-4-scout-17b-16e-instruct-rt.yaml - srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml -- srt/e5-mistral-7b-instruct-rt.yaml -- srt/llama-3-3-70b-instruct-rt.yaml -- srt/llama-3-3-70b-instruct-pd-rt.yaml diff --git a/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml new file mode 100644 index 00000000..4960a4bd --- /dev/null +++ b/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml @@ -0,0 +1,129 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-1-8b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 7B + max: 9B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=1 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: 
ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..678730ef --- /dev/null +++ b/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,128 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=2 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml 
b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 8e78c01c..d3ff8386 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -14,6 +14,7 @@ spec: modelArchitecture: Llama4ForConditionalGeneration autoSelect: true priority: 2 + quantization: fp8 protocolVersions: - openAI modelSizeRange: @@ -51,12 +52,17 @@ spec: --host=0.0.0.0 \ --port=8080 \ --enable-metrics \ + --log-requests \ --model-path="$MODEL_PATH" \ - --tp-size 8 \ - --context-length=430000 \ - --chat-template llama-4 \ - --attention-backend fa3 \ - --log-requests + --tp=8 \ + --mem-frac=0.82 \ + --context-length=524288 \ + --enable-multimodal \ + --tool-call-parser=pythonic \ + --chat-template=/sgl-workspace/sglang/examples/chat_template/tool_chat_template_llama4_pythonic.jinja \ + --attention-backend=fa3 \ + --mm-attention-backend=fa3 \ + --disable-fast-image-processor volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 2609ed15..20b38b24 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -51,13 +51,17 @@ spec: --host=0.0.0.0 \ --port=8080 \ --enable-metrics \ + --log-requests \ --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.95 \ - --context-length=128000 \ - --chat-template llama-4 \ - --attention-backend fa3 \ - --log-requests + --tp=4 \ + --mem-frac=0.85 \ + --context-length=196608 \ + --enable-multimodal \ + --tool-call-parser=pythonic \ + --chat-template=/sgl-workspace/sglang/examples/chat_template/tool_chat_template_llama4_pythonic.jinja \ + --attention-backend=fa3 \ + --mm-attention-backend=fa3 \ + --disable-fast-image-processor volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml index 296b386e..40ef78cf 100644 --- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml @@ -15,6 +15,7 @@ spec: autoSelect: true priority: 1 version: "1.0.0" + quantization: fp8 protocolVersions: - openAI modelSizeRange: @@ -56,13 +57,12 @@ spec: --served-model-name=vllm-model \ --tensor-parallel-size=8 \ --max-model-len=131072 \ - --gpu-memory-utilization=0.95 \ + --gpu-memory-utilization=0.9 \ --enable-chunked-prefill \ - --preemption-mode=swap \ - --swap-space=10 \ --enable-auto-tool-choice \ --tool-call-parser=llama3_json \ - --chat-template=examples/tool_chat_template_llama3.1_json.jinja + --chat-template=examples/tool_chat_template_llama3.1_json.jinja \ + --cuda-graph-sizes=256 volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml new file mode 100644 index 00000000..1c25cb99 --- /dev/null +++ b/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml @@ -0,0 +1,107 @@ +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-1-8b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 7B + max: 9B 
+ engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name="$SERVED_MODEL_NAME" \ + --tensor-parallel-size=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-chunked-prefill \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.1_json.jinja + env: + - name: SERVED_MODEL_NAME + value: "vllm-model" + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml index 09a00a44..4714c4ae 100644 --- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml @@ -72,10 +72,9 @@ spec: --tensor-parallel-size=1 \ --max-num-seqs=32 \ --enforce-eager \ - --preemption-mode=swap \ --limit-mm-per-prompt=image=1 \ --max-model-len=131072 \ - --gpu-memory-utilization=0.99 \ + --gpu-memory-utilization=0.9 \ --enable-auto-tool-choice \ --tool-call-parser=llama3_json \ --chat-template=./examples/tool_chat_template_llama3.2_json.jinja diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..769c7e56 --- /dev/null +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,120 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-2-90b-vision-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + version: "1.0.0" + - modelFramework: + name: transformers + version: "4.50.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: true + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: 
"nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + env: + - name: VLLM_RPC_TIMEOUT + value: '30000' + - name: VLLM_ENGINE_ITERATION_TIMEOUT_S + value: '120' + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=4 \ + --max-num-seqs=32 \ + --enforce-eager \ + --limit-mm-per-prompt=image=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.2_json.jinja + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml new file mode 100644 index 00000000..e43d9e17 --- /dev/null +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml @@ -0,0 +1,110 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-2-90b-vision-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + env: + - name: VLLM_RPC_TIMEOUT + value: '30000' + - name: VLLM_ENGINE_ITERATION_TIMEOUT_S + value: '120' + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=8 \ + --max-num-seqs=128 \ + --enforce-eager \ + --limit-mm-per-prompt=image=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.2_json.jinja + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi 
+ nvidia.com/gpu: 8 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 8 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..e4fd92f9 --- /dev/null +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,108 @@ +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.1" + # quantization: compressed-tensors + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: true + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.7.3.78f0810ef + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name="$SERVED_MODEL_NAME" \ + --tensor-parallel-size=2 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-chunked-prefill \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.1_json.jinja + env: + - name: SERVED_MODEL_NAME + value: "vllm-model" + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 2 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/samples/isvc/meta/llama3-1-8b-instruct.yaml b/config/samples/isvc/meta/llama3-1-8b-instruct.yaml new file mode 100644 index 00000000..0ca55f1c --- /dev/null +++ b/config/samples/isvc/meta/llama3-1-8b-instruct.yaml @@ -0,0 +1,23 @@ +# --- +# apiVersion: v1 +# kind: Namespace +# metadata: +# name: llama-3-1-8b-instruct +--- + +apiVersion: ome.io/v1beta1 +kind: InferenceService +metadata: + name: llama-3-1-8b-instruct + namespace: llama-3-1-8b-instruct +spec: + model: + 
    name: llama-3-1-8b-instruct
+  engine:
+    minReplicas: 8
+    maxReplicas: 8
+  runtime:
+    name: srt-llama-3-1-8b-instruct
+  router:
+    minReplicas: 1
+    maxReplicas: 1
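
The sample above pairs with the srt-llama-3-1-8b-instruct runtime added in this change; once applied, the engine serves an OpenAI-compatible API on port 8080. A minimal in-cluster smoke test is sketched below, under the assumption that the engine service follows the "<name>-engine.<namespace>" naming used by the workflow's sanity-check job; the namespace and the model field value are illustrative, not prescribed by this change.

# Illustrative only; the service DNS name and the "model" field are assumptions.
kubectl create namespace llama-3-1-8b-instruct 2>/dev/null || true   # Namespace block in the sample is commented out
kubectl apply -f config/samples/isvc/meta/llama3-1-8b-instruct.yaml

kubectl run smoke-test -n llama-3-1-8b-instruct --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -s http://llama-3-1-8b-instruct-engine.llama-3-1-8b-instruct.svc.cluster.local:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "llama-3-1-8b-instruct", "messages": [{"role": "user", "content": "Say hello"}], "max_tokens": 16}'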