diff --git a/.github/workflows/automation.yaml b/.github/workflows/automation.yaml new file mode 100644 index 00000000..1de3dbc8 --- /dev/null +++ b/.github/workflows/automation.yaml @@ -0,0 +1,566 @@ +name: Container Release Process Automation + +on: + pull_request: + repository_dispatch: + types: [container-pushed] + workflow_dispatch: + inputs: + models: + description: "Comma-separated list of model names to deploy" + required: false + default: "llama-3-3-70b-instruct,llama-4-scout-17b-16e-instruct" + type: string + +env: + REGISTRY: ord.ocir.io + REGISTRY_NAMESPACE: idqj093njucb + K8S_NAMESPACE: github-actions + CONTAINER_NAME: official-sgl + # TODO: Add all supported models + ISVC_MODELS: llama-3-3-70b-instruct,llama-4-scout-17b-16e-instruct + BUCKET_NAME: ome-benchmark-results + BENCHMARK_CONTAINER: ghcr.io/moirai-internal/genai-bench:v0.0.2 + # TODO: Switch to official sanity check image + SANITY_CHECK_CONTAINER: phx.ocir.io/idqj093njucb/sanity-check:dev + +jobs: + deploy: + runs-on: gh-arc-runner + permissions: + contents: read + packages: write + id-token: write + env: + IMAGE_NAME: ${{ github.event.client_payload.image_name }} + FULL_TAG: ${{ github.event.client_payload.full_tag }} + BRANCH_NAME: ${{ github.event.client_payload.branch_name }} + VERSION: ${{ github.event.client_payload.version }} + COMMIT_HASH: ${{ github.event.client_payload.commit_hash }} + + steps: + - name: Set up kubectl + uses: azure/setup-kubectl@v3 + with: + version: 'v1.30.1' + + - name: Prepare model list and sanitize names + id: models + run: | + if [[ "${{ github.event_name }}" == "workflow_dispatch" ]]; then + MODELS="${{ github.event.inputs.models }}" + else + # Default models for auto-deployment + MODELS=${ISVC_MODELS} + fi + + # Helper functions for name sanitization + sanitize_model_name() { + echo "$1" | sed 's/[^a-zA-Z0-9]/-/g' | tr '[:upper:]' '[:lower:]' + } + + sanitize_namespace() { + local namespace="$1" + namespace=$(echo "$namespace" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') + echo "$namespace" | cut -c1-63 | sed 's/-$//' + } + + create_runtime_name() { + local model_name="$1" + local runtime_name="srt-$model_name" + runtime_name=$(echo "$runtime_name" | tr '[:upper:]' '[:lower:]' | sed 's/[^a-zA-Z0-9-]/-/g' | sed 's/--*/-/g' | sed 's/^-\|-$//g') + echo "$runtime_name" | cut -c1-63 | sed 's/-$//' + } + + # Process each model and create comprehensive JSON with all sanitized names + IFS=',' read -ra MODEL_ARRAY <<< "$MODELS" + MODELS_WITH_NAMES="[]" + + for model in "${MODEL_ARRAY[@]}"; do + MODEL_NAME=$(sanitize_model_name "$model") + NAMESPACE=$(sanitize_namespace "${K8S_NAMESPACE}") + RUNTIME_NAME=$(create_runtime_name "$MODEL_NAME") + + # Create JSON object for this model + MODEL_OBJECT=$(jq -n \ + --arg model "$model" \ + --arg model_name "$MODEL_NAME" \ + --arg namespace "$NAMESPACE" \ + --arg runtime_name "$RUNTIME_NAME" \ + '{ + original: $model, + model_name: $model_name, + namespace: $namespace, + runtime_name: $runtime_name + }') + + # Add to array + MODELS_WITH_NAMES=$(echo "$MODELS_WITH_NAMES" | jq --argjson obj "$MODEL_OBJECT" '. += [$obj]') + done + + # Output the comprehensive model metadata + echo "models_metadata=$(echo "$MODELS_WITH_NAMES" | tr -d '\n')" >> $GITHUB_OUTPUT + + echo "Models to deploy: $MODELS" + echo "Models with sanitized names:" + echo "$MODELS_WITH_NAMES" | jq . 
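The sanitization helpers and the jq accumulation in this step are plain bash and can be exercised outside the workflow. A minimal local sketch, illustrative only; the input model names below are just examples:

#!/usr/bin/env bash
# Standalone sketch of the name-sanitization and metadata-building logic above.
set -euo pipefail

sanitize_model_name() {
  echo "$1" | sed 's/[^a-zA-Z0-9]/-/g' | tr '[:upper:]' '[:lower:]'
}

MODELS="Llama-3.3-70B-Instruct,llama-4-scout-17b-16e-instruct"   # example input
METADATA="[]"
IFS=',' read -ra MODEL_ARRAY <<< "$MODELS"
for model in "${MODEL_ARRAY[@]}"; do
  name=$(sanitize_model_name "$model")
  METADATA=$(echo "$METADATA" | jq --arg o "$model" --arg n "$name" \
    '. += [{original: $o, model_name: $n, runtime_name: ("srt-" + $n)}]')
done
echo "$METADATA" | jq .
# model_name comes out as llama-3-3-70b-instruct and llama-4-scout-17b-16e-instruct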
+ + - name: Deploy to Kubernetes + id: deployment + # Change benchmark endpoint to use inference service + run: | + MODELS_METADATA='${{ steps.models.outputs.models_metadata }}' + IMAGE="${IMAGE_NAME}:${FULL_TAG}" + + # Initialize status tracking + DEPLOYMENT_STATUS="[]" + + # Deploy each model sequentially using pre-computed names + # Use process substitution to avoid subshell and preserve variables + while read -r model_info; do + # Extract all names from JSON + MODEL=$(echo "$model_info" | jq -r '.original') + MODEL_NAME=$(echo "$model_info" | jq -r '.model_name') + NAMESPACE=$(echo "$model_info" | jq -r '.namespace') + RUNTIME_NAME=$(echo "$model_info" | jq -r '.runtime_name') + + echo "Deploying model: $MODEL" + echo "Namespace: $NAMESPACE" + echo "Runtime: $RUNTIME_NAME" + + # Create InferenceService manifest + cat < isvc-$MODEL_NAME.yaml + --- + apiVersion: ome.io/v1beta1 + kind: InferenceService + metadata: + name: $MODEL_NAME + namespace: $NAMESPACE + labels: + sglang.version: "$VERSION" + sidecar.istio.io/inject : "true" + annotations: + sglang.deployed-by: "github-actions" + sglang.deployment-time: "$(date -u +%Y-%m-%dT%H:%M:%SZ)" + spec: + engine: + minReplicas: 1 + maxReplicas: 1 + runner: + name: ome-container + image: $IMAGE + model: + name: $MODEL_NAME + runtime: + name: $RUNTIME_NAME + EOF + + # create benchmark job manifest + cat < benchmark-$MODEL_NAME.yaml + apiVersion: ome.io/v1beta1 + kind: BenchmarkJob + metadata: + name: benchmark-$MODEL_NAME + namespace: $NAMESPACE + spec: + podOverride: + image: ${BENCHMARK_CONTAINER} + huggingFaceSecretReference: + name: huggingface-secret + endpoint: + inferenceService: + name: $MODEL_NAME + namespace: $NAMESPACE + # endpoint: + # url: http://$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local:8080 + # apiFormat: openai + # modelName: $MODEL_NAME + task: text-to-text + # numConcurrency: + # - 1 + # - 128 + maxTimePerIteration: 10 + maxRequestsPerIteration: 1000 + outputLocation: + storageUri: "oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}" + parameters: + auth: "instance_principal" + region: "eu-frankfurt-1" + EOF + + # create sanity-check job manifest + cat < sanity-check-$MODEL_NAME.yaml + apiVersion: batch/v1 + kind: Job + metadata: + name: sanity-check-$MODEL_NAME + namespace: $NAMESPACE + labels: + sglang.version: "$VERSION" + spec: + template: + spec: + containers: + - name: sanity-check + image: ${SANITY_CHECK_CONTAINER} + imagePullPolicy: Always + env: + - name: BASE_URL + value: "http://$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local:8080" + - name: VLLM_ENDPOINT + value: "$MODEL_NAME-engine.$NAMESPACE.svc.cluster.local" + - name: VLLM_PORT + value: "8080" + restartPolicy: Never + backoffLimit: 3 + EOF + + # Apply the InferenceService + kubectl apply -f isvc-$MODEL_NAME.yaml + + # Wait for InferenceService to be ready (with timeout) + echo "Waiting for InferenceService $MODEL_NAME to be ready in namespace $NAMESPACE..." 
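# Note on the wait that follows: the loop polls the InferenceService's Ready condition
# directly. A more compact form is possible with `kubectl wait`; shown here only as an
# illustrative sketch, assuming the CRD exposes the same Ready condition the loop inspects:
#
#   kubectl wait inferenceservice/$MODEL_NAME -n $NAMESPACE \
#     --for=condition=Ready --timeout=600s || echo "Timed out waiting for $MODEL_NAME"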
+ + # Check if the InferenceService exists and wait for it to be ready + TIMEOUT=600 + ELAPSED=0 + READY=false + + while [ $ELAPSED -lt $TIMEOUT ]; do + # Check if InferenceService exists and get its status + if kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + STATUS=$(kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.conditions[?(@.type=="Ready")].status}' 2>/dev/null || echo "Unknown") + if [ "$STATUS" = "True" ]; then + READY=true + break + fi + echo "InferenceService status: $STATUS (waiting...)" + else + echo "InferenceService not found yet (waiting...)" + fi + + sleep 60 + ELAPSED=$((ELAPSED + 10)) + done + + if [ "$READY" = "true" ]; then + echo "โœ… Successfully deployed InferenceService for model: $MODEL" + echo " Namespace: $NAMESPACE" + echo " InferenceService: $MODEL_NAME" + echo " Runtime: $RUNTIME_NAME" + + # Track InferenceService success + ISVC_STATUS="SUCCESS" + + # Deploy benchmark job now that InferenceService is ready + echo "๐Ÿƒ Starting benchmark job for model: $MODEL" + kubectl apply -f benchmark-$MODEL_NAME.yaml + + # Wait for benchmark job to complete (with timeout) + echo "Waiting for BenchmarkJob benchmark-$MODEL_NAME to complete in namespace $NAMESPACE..." + + BENCHMARK_TIMEOUT=18000 # 5 hours + BENCHMARK_ELAPSED=0 + BENCHMARK_SUCCESS=false + + while [ $BENCHMARK_ELAPSED -lt $BENCHMARK_TIMEOUT ]; do + # Check if BenchmarkJob exists and get its status + if kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + # Get the overall status/state of the benchmark job + BENCHMARK_STATUS=$(kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.state}' 2>/dev/null || echo "Running") + + if [ "$BENCHMARK_STATUS" = "Succeeded" ] || [ "$BENCHMARK_STATUS" = "Completed" ]; then + BENCHMARK_SUCCESS=true + break + elif [ "$BENCHMARK_STATUS" = "Failed" ]; then + echo "โŒ Benchmark job failed for model: $MODEL" + break + fi + # echo "BenchmarkJob status: $BENCHMARK_STATUS (waiting...)" + else + echo "BenchmarkJob not found yet (waiting...)" + fi + + sleep 30 + BENCHMARK_ELAPSED=$((BENCHMARK_ELAPSED + 30)) + done + + if [ "$BENCHMARK_SUCCESS" = "true" ]; then + echo "โœ… Successfully completed benchmark for model: $MODEL" + echo " Benchmark results should be available at: oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}" + BENCHMARK_STATUS="SUCCESS" + else + echo "โŒ Failed to complete benchmark or timeout for model: $MODEL" + echo "Checking BenchmarkJob status..." + kubectl get benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE -o yaml || echo "BenchmarkJob not found" + echo "Checking benchmark job events..." + kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | grep -i benchmark | tail -10 + + # Get benchmark job pod logs for debugging + echo "Fetching benchmark job pod logs for debugging..." 
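# The lookup below only inspects the first pod matching the job-name selector. If the
# benchmark pod was retried, an illustrative sketch like this would dump logs from every
# pod the job created (same selector as below, which is assumed to be set by the
# BenchmarkJob controller):
#
#   for pod in $(kubectl get pods -n $NAMESPACE -l job-name=benchmark-$MODEL_NAME \
#       -o jsonpath='{.items[*].metadata.name}' 2>/dev/null); do
#     echo "--- Logs: $pod ---"
#     kubectl logs "$pod" -n $NAMESPACE --tail=100 || true
#   done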
+ BENCHMARK_POD=$(kubectl get pods -n $NAMESPACE -l job-name=benchmark-$MODEL_NAME -o jsonpath='{.items[0].metadata.name}' 2>/dev/null || echo "") + if [ -n "$BENCHMARK_POD" ]; then + echo "--- Benchmark Pod Logs ---" + kubectl logs $BENCHMARK_POD -n $NAMESPACE --tail=100 || echo "Failed to get pod logs" + echo "--- End of Benchmark Pod Logs ---" + else + echo "No benchmark pod found for job benchmark-$MODEL_NAME" + fi + BENCHMARK_STATUS="FAILED" + fi + + # Deploy sanity-check job after benchmark + echo "๐Ÿ” Starting sanity-check job for model: $MODEL" + kubectl apply -f sanity-check-$MODEL_NAME.yaml + + # Wait for sanity-check job to complete (with timeout) + echo "Waiting for Job sanity-check-$MODEL_NAME to complete in namespace $NAMESPACE..." + + SANITY_TIMEOUT=7200 # 120 minutes + SANITY_ELAPSED=0 + SANITY_SUCCESS=false + + while [ $SANITY_ELAPSED -lt $SANITY_TIMEOUT ]; do + # Check if Job exists and get its status + if kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE >/dev/null 2>&1; then + # Get the job completion status + SANITY_CHECK_STATUS=$(kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE -o jsonpath='{.status.conditions[0].type}' 2>/dev/null || echo "Running") + + if [ "$SANITY_CHECK_STATUS" = "Complete" ]; then + SANITY_SUCCESS=true + break + elif [ "$SANITY_CHECK_STATUS" = "Failed" ]; then + echo "โŒ Sanity-check job failed for model: $MODEL" + break + fi + # echo "Sanity-check job still running (waiting...)" + else + echo "Sanity-check job not found yet (waiting...)" + fi + + sleep 30 + SANITY_ELAPSED=$((SANITY_ELAPSED + 30)) + done + + if [ "$SANITY_SUCCESS" = "true" ]; then + echo "โœ… Successfully completed sanity-check for model: $MODEL" + SANITY_STATUS="SUCCESS" + else + echo "โŒ Failed to complete sanity-check or timeout for model: $MODEL" + echo "Checking sanity-check job status..." + kubectl get job sanity-check-$MODEL_NAME -n $NAMESPACE -o yaml || echo "Sanity-check job not found" + echo "Checking sanity-check job events..." + kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | grep -i sanity-check | tail -10 + + SANITY_STATUS="FAILED" + fi + + # Get sanity-check job logs + echo "Fetching sanity-check job logs for debugging..." + kubectl logs -n $NAMESPACE jobs/sanity-check-$MODEL_NAME --tail=100 || echo "Failed to get sanity check job logs" + + # Clean up benchmark job, sanity-check job and inference service + echo "๐Ÿงน Cleaning up resources for model: $MODEL" + kubectl delete -f benchmark-$MODEL_NAME.yaml || echo "Failed to delete benchmark job" + kubectl delete -f sanity-check-$MODEL_NAME.yaml || echo "Failed to delete sanity-check job" + + else + echo "โŒ Failed to deploy or timeout waiting for InferenceService: $MODEL" + echo "Checking InferenceService status..." + kubectl get inferenceservice $MODEL_NAME -n $NAMESPACE -o yaml || echo "InferenceService not found" + echo "Checking events in namespace..." 
+ kubectl get events -n $NAMESPACE --sort-by=.metadata.creationTimestamp | tail -10 + # Continue with next model instead of failing the entire workflow + ISVC_STATUS="FAILED" + BENCHMARK_STATUS="SKIPPED" + SANITY_STATUS="SKIPPED" + fi + + # Clean up inference service + kubectl delete -f isvc-$MODEL_NAME.yaml || echo "Failed to delete inference service" + + # Add status to tracking JSON + STATUS_ENTRY=$(jq -n \ + --arg model "$MODEL" \ + --arg model_name "$MODEL_NAME" \ + --arg namespace "$NAMESPACE" \ + --arg runtime_name "$RUNTIME_NAME" \ + --arg isvc_status "${ISVC_STATUS:-UNKNOWN}" \ + --arg benchmark_status "${BENCHMARK_STATUS:-UNKNOWN}" \ + --arg sanity_status "${SANITY_STATUS:-UNKNOWN}" \ + '{ + model: $model, + model_name: $model_name, + namespace: $namespace, + runtime_name: $runtime_name, + isvc_status: $isvc_status, + benchmark_status: $benchmark_status, + sanity_status: $sanity_status + }') + + # Update the deployment status JSON + DEPLOYMENT_STATUS=$(echo "$DEPLOYMENT_STATUS" | jq --argjson entry "$STATUS_ENTRY" '. += [$entry]') + + # Clean up manifest files + rm -f isvc-$MODEL_NAME.yaml + rm -f benchmark-$MODEL_NAME.yaml + rm -f sanity-check-$MODEL_NAME.yaml + + echo "---" + done < <(echo "$MODELS_METADATA" | jq -c '.[]') + + # Output the deployment status for use in summary + echo "deployment_status=$(echo "$DEPLOYMENT_STATUS" | tr -d '\n')" >> $GITHUB_OUTPUT + echo "Deployment status saved:" + echo "$DEPLOYMENT_STATUS" | jq . + + - name: Cleanup on failure or cancellation + if: failure() || cancelled() + run: | + echo "๐Ÿงน Cleaning up resources after workflow failure or cancellation..." + + # Use the same model names that were prepared earlier + MODELS_METADATA='${{ steps.models.outputs.models_metadata }}' + + # Clean up any remaining resources using pre-computed names + echo "$MODELS_METADATA" | jq -c '.[]' | while read -r model_info; do + # Extract all names from JSON + MODEL=$(echo "$model_info" | jq -r '.original') + MODEL_NAME=$(echo "$model_info" | jq -r '.model_name') + NAMESPACE=$(echo "$model_info" | jq -r '.namespace') + + echo "Cleaning up resources for model: $MODEL" + + # Try to delete InferenceService + kubectl delete inferenceservice $MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No InferenceService $MODEL_NAME to cleanup" + + # Try to delete BenchmarkJob + kubectl delete benchmarkjob benchmark-$MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No BenchmarkJob benchmark-$MODEL_NAME to cleanup" + + # Try to delete sanity-check Job + kubectl delete job sanity-check-$MODEL_NAME -n $NAMESPACE 2>/dev/null || echo "No sanity-check Job sanity-check-$MODEL_NAME to cleanup" + + # Clean up any manifest files + rm -f isvc-$MODEL_NAME.yaml benchmark-$MODEL_NAME.yaml sanity-check-$MODEL_NAME.yaml + done + + echo "Cleanup completed" + + - name: Cleanup kubeconfig + if: always() + run: | + echo "๐Ÿงน Cleaning up kubeconfig credentials..." 
+ rm -f $HOME/.kube/config + echo "Kubeconfig cleanup completed" + + - name: Create deployment summary + run: | + DEPLOYMENT_STATUS='${{ steps.deployment.outputs.deployment_status }}' + + echo "## ๐Ÿš€ SGLang Deployment & Benchmark Summary" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "**Branch:** \`${BRANCH_NAME}\`" >> $GITHUB_STEP_SUMMARY + echo "**Version:** \`${VERSION}\`" >> $GITHUB_STEP_SUMMARY + echo "**Commit:** \`${COMMIT_HASH}\`" >> $GITHUB_STEP_SUMMARY + echo "**Docker Image:** \`${IMAGE_NAME}:${FULL_TAG}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + echo "### ๐Ÿ“‹ Deployment Overview" >> $GITHUB_STEP_SUMMARY + echo "This workflow deployed and tested each model sequentially:" >> $GITHUB_STEP_SUMMARY + echo "1. **InferenceService Deployment** - Deploy SGLang runtime with the model" >> $GITHUB_STEP_SUMMARY + echo "2. **Benchmark Execution** - Run performance benchmarks against the deployed service" >> $GITHUB_STEP_SUMMARY + echo "3. **Sanity Check** - Run sanity check validation against the deployed service" >> $GITHUB_STEP_SUMMARY + echo "4. **Resource Cleanup** - Clean up InferenceService, BenchmarkJob, and sanity-check Job after completion" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + # Read deployment status from step output + if [ -n "$DEPLOYMENT_STATUS" ] && [ "$DEPLOYMENT_STATUS" != "[]" ]; then + echo "### ๐Ÿ“Š Model Deployment Results" >> $GITHUB_STEP_SUMMARY + + # Count overall statistics + TOTAL_MODELS=$(echo "$DEPLOYMENT_STATUS" | jq 'length') + SUCCESSFUL_ISVC=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.isvc_status == "SUCCESS")] | length') + SUCCESSFUL_BENCHMARKS=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.benchmark_status == "SUCCESS")] | length') + SUCCESSFUL_SANITY=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.sanity_status == "SUCCESS")] | length') + FAILED_ISVC=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.isvc_status == "FAILED")] | length') + FAILED_BENCHMARKS=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.benchmark_status == "FAILED")] | length') + FAILED_SANITY=$(echo "$DEPLOYMENT_STATUS" | jq '[.[] | select(.sanity_status == "FAILED")] | length') + + echo "**Overall Status:**" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿ“ˆ Total Models: $TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- โœ… InferenceServices Deployed: $SUCCESSFUL_ISVC/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿƒ Benchmarks Completed: $SUCCESSFUL_BENCHMARKS/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + echo "- ๐Ÿ” Sanity Checks Completed: $SUCCESSFUL_SANITY/$TOTAL_MODELS" >> $GITHUB_STEP_SUMMARY + if [ "$FAILED_ISVC" -gt 0 ]; then + echo "- โŒ InferenceService Failures: $FAILED_ISVC" >> $GITHUB_STEP_SUMMARY + fi + if [ "$FAILED_BENCHMARKS" -gt 0 ]; then + echo "- โŒ Benchmark Failures: $FAILED_BENCHMARKS" >> $GITHUB_STEP_SUMMARY + fi + if [ "$FAILED_SANITY" -gt 0 ]; then + echo "- โŒ Sanity Check Failures: $FAILED_SANITY" >> $GITHUB_STEP_SUMMARY + fi + echo "" >> $GITHUB_STEP_SUMMARY + + # Individual model status + echo "$DEPLOYMENT_STATUS" | jq -c '.[]' | while read -r status_info; do + MODEL=$(echo "$status_info" | jq -r '.model') + MODEL_NAME=$(echo "$status_info" | jq -r '.model_name') + NAMESPACE=$(echo "$status_info" | jq -r '.namespace') + RUNTIME_NAME=$(echo "$status_info" | jq -r '.runtime_name') + ISVC_STATUS=$(echo "$status_info" | jq -r '.isvc_status') + BENCHMARK_STATUS=$(echo "$status_info" | jq -r '.benchmark_status') + SANITY_STATUS=$(echo "$status_info" | jq -r '.sanity_status') + + # Status emojis + 
case "$ISVC_STATUS" in + "SUCCESS") ISVC_EMOJI="โœ…" ;; + "FAILED") ISVC_EMOJI="โŒ" ;; + *) ISVC_EMOJI="โ“" ;; + esac + + case "$BENCHMARK_STATUS" in + "SUCCESS") BENCHMARK_EMOJI="โœ…" ;; + "FAILED") BENCHMARK_EMOJI="โŒ" ;; + "SKIPPED") BENCHMARK_EMOJI="โญ๏ธ" ;; + *) BENCHMARK_EMOJI="โ“" ;; + esac + + case "$SANITY_STATUS" in + "SUCCESS") SANITY_EMOJI="โœ…" ;; + "FAILED") SANITY_EMOJI="โŒ" ;; + "SKIPPED") SANITY_EMOJI="โญ๏ธ" ;; + *) SANITY_EMOJI="โ“" ;; + esac + + echo "#### ๐Ÿค– $MODEL" >> $GITHUB_STEP_SUMMARY + echo "- **Namespace:** \`$NAMESPACE\`" >> $GITHUB_STEP_SUMMARY + echo "- **InferenceService:** \`$MODEL_NAME\` $ISVC_EMOJI \`$ISVC_STATUS\`" >> $GITHUB_STEP_SUMMARY + echo "- **Runtime:** \`$RUNTIME_NAME\`" >> $GITHUB_STEP_SUMMARY + echo "- **Benchmark:** $BENCHMARK_EMOJI \`$BENCHMARK_STATUS\`" >> $GITHUB_STEP_SUMMARY + echo "- **Sanity Check:** $SANITY_EMOJI \`$SANITY_STATUS\`" >> $GITHUB_STEP_SUMMARY + + if [ "$BENCHMARK_STATUS" = "SUCCESS" ]; then + echo "- **Results:** \`oci://n/${REGISTRY_NAMESPACE}/b/${BUCKET_NAME}/o/${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}\`" >> $GITHUB_STEP_SUMMARY + fi + echo "" >> $GITHUB_STEP_SUMMARY + done + else + echo "### โ— No deployment status available" >> $GITHUB_STEP_SUMMARY + echo "The deployment status was not found. This may indicate the workflow was interrupted before completion." >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + fi + + echo "### โš™๏ธ Benchmark Configuration" >> $GITHUB_STEP_SUMMARY + echo "- **Engine:** SGLang" >> $GITHUB_STEP_SUMMARY + echo "- **GPU Type:** H100" >> $GITHUB_STEP_SUMMARY + echo "- **Task:** text-to-text" >> $GITHUB_STEP_SUMMARY + echo "- **Benchmark Image:** \`${BENCHMARK_CONTAINER}\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + + echo "### ๐Ÿ“ˆ Results Location" >> $GITHUB_STEP_SUMMARY + echo "All benchmark results are stored in OCI Object Storage:" >> $GITHUB_STEP_SUMMARY + echo "- **Bucket:** \`${BUCKET_NAME}\`" >> $GITHUB_STEP_SUMMARY + echo "- **Path Pattern:** \`${CONTAINER_NAME}/${FULL_TAG}/${MODEL_NAME}/\`" >> $GITHUB_STEP_SUMMARY + echo "" >> $GITHUB_STEP_SUMMARY + echo "_Note: All resources (InferenceServices and BenchmarkJobs) were cleaned up after completion._" >> $GITHUB_STEP_SUMMARY diff --git a/config/models/kustomization.yaml b/config/models/kustomization.yaml index c529952e..82ba8dc7 100644 --- a/config/models/kustomization.yaml +++ b/config/models/kustomization.yaml @@ -2,10 +2,19 @@ apiVersion: kustomize.config.k8s.io/v1beta1 kind: Kustomization resources: - - meta/Llama-3.3-70B-instruct.yaml - - meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml - - meta/Llama-4-Scout-17B-16E-Instruct.yaml - intfloat/e5-mistral-7b-instruct.yaml - microsoft/Phi-3-vision-128k-instruct.yaml - deepseek-ai/DeepSeek-V3.yaml - - deepseek-ai/DeepSeek-R1.yaml \ No newline at end of file + - deepseek-ai/DeepSeek-R1.yaml + - meta/Llama-3.1-405B-Instruct-FP8.yaml + - meta/Llama-3.1-8B-Instruct.yaml + - meta/Llama-3.1-70B-Instruct.yaml + - meta/Llama-3.2-11B-Vision-Instruct.yaml + - meta/Llama-3.2-90B-Vision-Instruct.yaml + - meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml + - meta/Llama-3.3-70B-Instruct.yaml + - meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml + - meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml + - meta/Llama-4-Scout-17B-16E-Instruct.yaml + - openai/gpt-oss-20b.yaml + - openai/gpt-oss-120b.yaml diff --git a/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml b/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml index 982e757f..cec4375d 100644 --- 
a/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml +++ b/config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml @@ -7,6 +7,6 @@ spec: disabled: false version: "1.0.0" storage: - storageUri: hf://meta-llama/Meta-Llama-3.1-405B-Instruct-FP8 - path: /raid/models/meta/Llama-3.1-405B-Instruct-FP8 + storageUri: hf://meta-llama/Llama-3.1-405B-Instruct-FP8 + path: /raid/models/meta/llama-3-1-405b-instruct-fp8 key: "hf-token" diff --git a/config/models/meta/Llama-3.1-8B-Instruct.yaml b/config/models/meta/Llama-3.1-8B-Instruct.yaml new file mode 100644 index 00000000..9b47d01a --- /dev/null +++ b/config/models/meta/Llama-3.1-8B-Instruct.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-3-1-8b-instruct +spec: + vendor: meta + disabled: false + version: "1.0.0" + displayName: meta.llama-3.1-8b-instruct + storage: + storageUri: hf://meta-llama/Llama-3.1-8B-Instruct + path: /raid/models/meta/llama-3-1-8b-instruct + key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml b/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml index fc2ee760..3166df39 100644 --- a/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml +++ b/config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-11B-Vision-Instruct - path: /raid/models/meta/Llama-3.2-11B-Vision-Instruct + path: /raid/models/meta/llama-3-2-11b-vision-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-3B-Instruct.yaml b/config/models/meta/Llama-3.2-3B-Instruct.yaml index 219fa8f8..4d63f093 100644 --- a/config/models/meta/Llama-3.2-3B-Instruct.yaml +++ b/config/models/meta/Llama-3.2-3B-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-3B-Instruct - path: /raid/models/meta/Llama-3.2-3B-Instruct + path: /raid/models/meta/llama-3-2-3b-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml b/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml new file mode 100644 index 00000000..8d68fda8 --- /dev/null +++ b/config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterBaseModel +metadata: + name: llama-3-2-90b-vision-instruct-fp8 +spec: + displayName: meta.llama-3.2-90b-vision-instruct-fp8 + vendor: meta + disabled: false + version: "1.0.0" + storage: + storageUri: hf://RedHatAI/Llama-3.2-90B-Vision-Instruct-FP8-dynamic + path: /raid/models/meta/llama-3-2-90b-vision-instruct-fp8-dynamic + key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml b/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml index 5ee704cd..bd8c5d6d 100644 --- a/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml +++ b/config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml @@ -9,5 +9,5 @@ spec: version: "1.0.0" storage: storageUri: hf://meta-llama/Llama-3.2-90B-Vision-Instruct - path: /raid/models/meta/Llama-3.2-90B-Vision-Instruct + path: /raid/models/meta/llama-3-2-90b-vision-instruct key: "hf-token" \ No newline at end of file diff --git a/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml b/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml new file mode 100644 index 00000000..65ebbed8 --- /dev/null +++ b/config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml @@ -0,0 +1,13 @@ +apiVersion: ome.io/v1beta1 
+kind: ClusterBaseModel +metadata: + name: llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + displayName: meta.llama-3.3-70b-instruct-fp8-dynamic + storage: + storageUri: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic + path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic + key: "hf-token" + vendor: meta + version: "1.0.0" \ No newline at end of file diff --git a/config/runtimes/kustomization.yaml b/config/runtimes/kustomization.yaml index 111d1161..f5da4e48 100644 --- a/config/runtimes/kustomization.yaml +++ b/config/runtimes/kustomization.yaml @@ -4,10 +4,18 @@ kind: Kustomization resources: - srt/deepseek-rdma-pd-rt.yaml - srt/deepseek-rdma-rt.yaml +- srt/e5-mistral-7b-instruct-rt.yaml +- srt/llama-3-1-8b-instruct-rt.yaml +- vllm/llama-3-1-8b-instruct-rt.yaml +- srt/llama-3-1-70b-instruct-rt.yaml +- vllm/llama-3-1-405b-instruct-fp8-rt.yaml +- vllm/llama-3-2-11b-vision-instruct-rt.yaml +- vllm/llama-3-2-90b-vision-instruct-rt.yaml +- vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml +- srt/llama-3-3-70b-instruct-rt.yaml +- srt/llama-3-3-70b-instruct-pd-rt.yaml +- srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml - srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml - srt/llama-4-scout-17b-16e-instruct-rt.yaml - srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml -- srt/e5-mistral-7b-instruct-rt.yaml -- srt/llama-3-3-70b-instruct-rt.yaml -- srt/llama-3-3-70b-instruct-pd-rt.yaml diff --git a/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml b/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml new file mode 100644 index 00000000..4960a4bd --- /dev/null +++ b/config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml @@ -0,0 +1,129 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-1-8b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 7B + max: 9B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=1 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: 
ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..678730ef --- /dev/null +++ b/config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,128 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: srt-llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m sglang.launch_server \ + --host=0.0.0.0 \ + --port=8080 \ + --enable-metrics \ + --log-requests \ + --model-path="$MODEL_PATH" \ + --tp-size=2 \ + --mem-frac=0.9 + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 2 + limits: + cpu: 10 + memory: 160Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health_generate + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 + routerConfig: + runner: + name: router + image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44 + resources: + limits: + cpu: "1" + memory: "2Gi" + ports: + - containerPort: 8080 + name: http + command: + - sh + - -c + - > + python3 -m sglang_router.launch_router + --host "0.0.0.0" + --port "8080" + --service-discovery + --service-discovery-namespace "${NAMESPACE}" + --service-discovery-port 8080 + --selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME} + env: + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + - name: INFERENCESERVICE_NAME + valueFrom: + fieldRef: + fieldPath: metadata.labels['ome.io/inferenceservice'] \ No newline at end of file diff --git a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml 
b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml index 8e78c01c..d3ff8386 100644 --- a/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml +++ b/config/runtimes/srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml @@ -14,6 +14,7 @@ spec: modelArchitecture: Llama4ForConditionalGeneration autoSelect: true priority: 2 + quantization: fp8 protocolVersions: - openAI modelSizeRange: @@ -51,12 +52,17 @@ spec: --host=0.0.0.0 \ --port=8080 \ --enable-metrics \ + --log-requests \ --model-path="$MODEL_PATH" \ - --tp-size 8 \ - --context-length=430000 \ - --chat-template llama-4 \ - --attention-backend fa3 \ - --log-requests + --tp=8 \ + --mem-frac=0.82 \ + --context-length=524288 \ + --enable-multimodal \ + --tool-call-parser=pythonic \ + --chat-template=/sgl-workspace/sglang/examples/chat_template/tool_chat_template_llama4_pythonic.jinja \ + --attention-backend=fa3 \ + --mm-attention-backend=fa3 \ + --disable-fast-image-processor volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml index 2609ed15..20b38b24 100644 --- a/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml +++ b/config/runtimes/srt/llama-4-scout-17b-16e-instruct-rt.yaml @@ -51,13 +51,17 @@ spec: --host=0.0.0.0 \ --port=8080 \ --enable-metrics \ + --log-requests \ --model-path="$MODEL_PATH" \ - --tp-size 4 \ - --mem-frac=0.95 \ - --context-length=128000 \ - --chat-template llama-4 \ - --attention-backend fa3 \ - --log-requests + --tp=4 \ + --mem-frac=0.85 \ + --context-length=196608 \ + --enable-multimodal \ + --tool-call-parser=pythonic \ + --chat-template=/sgl-workspace/sglang/examples/chat_template/tool_chat_template_llama4_pythonic.jinja \ + --attention-backend=fa3 \ + --mm-attention-backend=fa3 \ + --disable-fast-image-processor volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml index 296b386e..40ef78cf 100644 --- a/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml +++ b/config/runtimes/vllm/llama-3-1-405b-instruct-fp8-rt.yaml @@ -15,6 +15,7 @@ spec: autoSelect: true priority: 1 version: "1.0.0" + quantization: fp8 protocolVersions: - openAI modelSizeRange: @@ -56,13 +57,12 @@ spec: --served-model-name=vllm-model \ --tensor-parallel-size=8 \ --max-model-len=131072 \ - --gpu-memory-utilization=0.95 \ + --gpu-memory-utilization=0.9 \ --enable-chunked-prefill \ - --preemption-mode=swap \ - --swap-space=10 \ --enable-auto-tool-choice \ --tool-call-parser=llama3_json \ - --chat-template=examples/tool_chat_template_llama3.1_json.jinja + --chat-template=examples/tool_chat_template_llama3.1_json.jinja \ + --cuda-graph-sizes=256 volumeMounts: - mountPath: /dev/shm name: dshm diff --git a/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml b/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml new file mode 100644 index 00000000..1c25cb99 --- /dev/null +++ b/config/runtimes/vllm/llama-3-1-8b-instruct-rt.yaml @@ -0,0 +1,107 @@ +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-1-8b-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.42.3" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 7B + max: 9B 
+ engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name="$SERVED_MODEL_NAME" \ + --tensor-parallel-size=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-chunked-prefill \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.1_json.jinja + env: + - name: SERVED_MODEL_NAME + value: "vllm-model" + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + limits: + cpu: 10 + memory: 30Gi + nvidia.com/gpu: 1 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml index 09a00a44..4714c4ae 100644 --- a/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml +++ b/config/runtimes/vllm/llama-3-2-11b-vision-instruct-rt.yaml @@ -72,10 +72,9 @@ spec: --tensor-parallel-size=1 \ --max-num-seqs=32 \ --enforce-eager \ - --preemption-mode=swap \ --limit-mm-per-prompt=image=1 \ --max-model-len=131072 \ - --gpu-memory-utilization=0.99 \ + --gpu-memory-utilization=0.9 \ --enable-auto-tool-choice \ --tool-call-parser=llama3_json \ --chat-template=./examples/tool_chat_template_llama3.2_json.jinja diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..769c7e56 --- /dev/null +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,120 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-2-90b-vision-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + version: "1.0.0" + - modelFramework: + name: transformers + version: "4.50.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: true + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: 
"nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + env: + - name: VLLM_RPC_TIMEOUT + value: '30000' + - name: VLLM_ENGINE_ITERATION_TIMEOUT_S + value: '120' + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=4 \ + --max-num-seqs=32 \ + --enforce-eager \ + --limit-mm-per-prompt=image=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.2_json.jinja + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 4 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml new file mode 100644 index 00000000..e43d9e17 --- /dev/null +++ b/config/runtimes/vllm/llama-3-2-90b-vision-instruct-rt.yaml @@ -0,0 +1,110 @@ +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-2-90b-vision-instruct +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.46.0.dev0" + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: MllamaForConditionalGeneration + autoSelect: false + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 85B + max: 95B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: docker.io/vllm/vllm-openai:v0.9.0.1 + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + env: + - name: VLLM_RPC_TIMEOUT + value: '30000' + - name: VLLM_ENGINE_ITERATION_TIMEOUT_S + value: '120' + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name=vllm-model \ + --tensor-parallel-size=8 \ + --max-num-seqs=128 \ + --enforce-eager \ + --limit-mm-per-prompt=image=1 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.2_json.jinja + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi 
+ nvidia.com/gpu: 8 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 8 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml new file mode 100644 index 00000000..e4fd92f9 --- /dev/null +++ b/config/runtimes/vllm/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml @@ -0,0 +1,108 @@ +--- +apiVersion: ome.io/v1beta1 +kind: ClusterServingRuntime +metadata: + name: vllm-llama-3-3-70b-instruct-fp8-dynamic +spec: + disabled: false + supportedModelFormats: + - modelFramework: + name: transformers + version: "4.47.1" + # quantization: compressed-tensors + modelFormat: + name: safetensors + version: "1.0.0" + modelArchitecture: LlamaForCausalLM + autoSelect: true + priority: 1 + version: "1.0.0" + protocolVersions: + - openAI + modelSizeRange: + min: 60B + max: 75B + engineConfig: + annotations: + prometheus.io/scrape: "true" + prometheus.io/port: "8080" + prometheus.io/path: "/metrics" + labels: + logging-forward: enabled + tolerations: + - key: "nvidia.com/gpu" + operator: "Exists" + effect: "NoSchedule" + volumes: + - name: dshm + emptyDir: + medium: Memory + runner: + name: ome-container + image: fra.ocir.io/idqj093njucb/official-vllm-openai:v0.7.3.78f0810ef + ports: + - containerPort: 8080 + name: http1 + protocol: TCP + command: + - /bin/bash + - '-lc' + - -- + args: + - | + python3 -m vllm.entrypoints.openai.api_server \ + --port=8080 \ + --model="$MODEL_PATH" \ + --middleware=vllm.entrypoints.openai.middleware.log_opc_header \ + --max-log-len=0 \ + --served-model-name="$SERVED_MODEL_NAME" \ + --tensor-parallel-size=2 \ + --max-model-len=131072 \ + --gpu-memory-utilization=0.9 \ + --enable-chunked-prefill \ + --enable-auto-tool-choice \ + --tool-call-parser=llama3_json \ + --chat-template=./examples/tool_chat_template_llama3.1_json.jinja + env: + - name: SERVED_MODEL_NAME + value: "vllm-model" + volumeMounts: + - mountPath: /dev/shm + name: dshm + resources: + requests: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 2 + limits: + cpu: 30 + memory: 100Gi + nvidia.com/gpu: 2 + + readinessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 3 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 200 + + livenessProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 5 + successThreshold: 1 + periodSeconds: 60 + timeoutSeconds: 60 + + startupProbe: + httpGet: + path: /health + port: 8080 + failureThreshold: 150 + successThreshold: 1 + periodSeconds: 6 + initialDelaySeconds: 60 + timeoutSeconds: 30 \ No newline at end of file diff --git a/config/samples/isvc/meta/llama3-1-8b-instruct.yaml b/config/samples/isvc/meta/llama3-1-8b-instruct.yaml new file mode 100644 index 00000000..0ca55f1c --- /dev/null +++ b/config/samples/isvc/meta/llama3-1-8b-instruct.yaml @@ -0,0 +1,23 @@ +# --- +# apiVersion: v1 +# kind: Namespace +# metadata: +# name: llama-3-1-8b-instruct +--- + +apiVersion: ome.io/v1beta1 +kind: InferenceService +metadata: + name: llama-3-1-8b-instruct + namespace: llama-3-1-8b-instruct +spec: + model: + 
    name: llama-3-1-8b-instruct
+  engine:
+    minReplicas: 8
+    maxReplicas: 8
+  runtime:
+    name: srt-llama-3-1-8b-instruct
+  router:
+    minReplicas: 1
+    maxReplicas: 1
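
The sample above pairs with the srt-llama-3-1-8b-instruct runtime added in this change; once applied, the engine serves an OpenAI-compatible API on port 8080. A minimal in-cluster smoke test is sketched below, under the assumption that the engine service follows the "<name>-engine.<namespace>" naming used by the workflow's sanity-check job; the namespace and the model field value are illustrative, not prescribed by this change.

# Illustrative only; the service DNS name and the "model" field are assumptions.
kubectl create namespace llama-3-1-8b-instruct 2>/dev/null || true   # Namespace block in the sample is commented out
kubectl apply -f config/samples/isvc/meta/llama3-1-8b-instruct.yaml

kubectl run smoke-test -n llama-3-1-8b-instruct --rm -i --restart=Never \
  --image=curlimages/curl -- \
  curl -s http://llama-3-1-8b-instruct-engine.llama-3-1-8b-instruct.svc.cluster.local:8080/v1/chat/completions \
    -H 'Content-Type: application/json' \
    -d '{"model": "llama-3-1-8b-instruct", "messages": [{"role": "user", "content": "Say hello"}], "max_tokens": 16}'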