blocks:
  - name: aws-nodes
    content:
      nodes:
        - name: gpu-enabled
          $cndi.comment(aws_vantage): https://instances.vantage.sh/aws/ec2/g6.xlarge?region=us-east-1
          instance_type: g6.xlarge
          count: 2
          disk_size: 200
  - name: azure-nodes
    content:
      nodes:
        - name: gpugroup
          $cndi.comment(azure_vantage): https://instances.vantage.sh/azure/vm/nc4ast4-v3
          instance_type: Standard_NC4as_T4_v3
          count: 2
          disk_size: 200
  - name: gcp-nodes
    content:
      nodes:
        - name: gpu-enabled
          instance_type: g2-standard-4
          count: 2
          disk_size: 200
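  # On GKE, pods using the system-node-critical or system-cluster-critical
  # PriorityClass outside kube-system are rejected unless the namespace has a
  # ResourceQuota scoped to those classes; this block provides one for the
  # gpu-operator namespace (applied below only when the provider is gcp).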
  - name: gcp-resource-quota
    content:
      apiVersion: v1
      kind: ResourceQuota
      metadata:
        name: gpu-operator-quota
        namespace: gpu-operator
      spec:
        hard:
          pods: 100
        scopeSelector:
          matchExpressions:
            - operator: In
              scopeName: PriorityClass
              values:
                - system-node-critical
                - system-cluster-critical
prompts:
  - $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/cluster/core-prompts.yaml):
      {}
  - $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/{{ $cndi.get_prompt_response(deployment_target_provider) }}/core-prompts.yaml):
      {}
  - name: hf_api_token
    message: >-
      What is your Hugging Face API token?
    type: Secret
  - name: hf_model
    message: >-
      What Hugging Face model would you like to deploy?
    type: Input
    default: meta-llama/Llama-3.2-1B-Instruct
outputs:
  cndi_config:
    cndi_version: v2
    project_name: "{{ $cndi.get_prompt_response(project_name) }}"
    provider: "{{ $cndi.get_prompt_response(deployment_target_provider) }}"
    distribution: "{{ $cndi.get_prompt_response(deployment_target_distribution) }}"
    infrastructure:
      cndi:
        $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/cluster/observability-config.yaml):
          condition:
            - "{{ $cndi.get_prompt_response(deploy_grafana_ingress) }}"
            - ==
            - true
        $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/cluster/argocd-config.yaml):
          condition:
            - "{{ $cndi.get_prompt_response(deploy_argocd_ingress) }}"
            - ==
            - true
        cert_manager:
          email: "{{ $cndi.get_prompt_response(cert_manager_email) }}"
        external_dns:
          $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/external-dns/config/{{ $cndi.get_prompt_response(dns_provider) }}.yaml):
            condition:
              - "{{ $cndi.get_prompt_response(enable_external_dns) }}"
              - ==
              - true
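        # Pull in the node pool that matches the provider chosen above:
        # aws-nodes, azure-nodes, or gcp-nodes, defined in "blocks" at the top.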
        $cndi.get_block({{ $cndi.get_prompt_response(deployment_target_provider) }}-nodes):
          {}
    cluster_manifests:
      gpu-operator-ns:
        apiVersion: v1
        kind: Namespace
        metadata:
          name: gpu-operator
      vllm-ns:
        apiVersion: v1
        kind: Namespace
        metadata:
          name: vllm
      external-dns-secret:
        $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/external-dns/secret/{{ $cndi.get_prompt_response(dns_provider) }}.yaml):
          condition:
            - "{{ $cndi.get_prompt_response(enable_external_dns) }}"
            - ==
            - true
      gpu-operator-quota:
        $cndi.get_block(gcp-resource-quota):
          condition:
            - "{{ $cndi.get_prompt_response(deployment_target_provider) }}"
            - ==
            - gcp
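      # At "cndi ow" time, seal_secret_from_env_var reads HF_API_TOKEN from your
      # local .env and seals it, so the manifest committed to git never contains
      # the plaintext token.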
      hf-secret:
        apiVersion: v1
        kind: Secret
        metadata:
          name: hf-secret
          namespace: vllm
        stringData:
          hf_api_token: $cndi_on_ow.seal_secret_from_env_var(HF_API_TOKEN)
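      # Caches downloaded model weights; the Deployment below mounts this at
      # /root/.cache/huggingface so pod restarts don't re-download the model.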
      model-pvc:
        apiVersion: v1
        kind: PersistentVolumeClaim
        metadata:
          name: model-pvc
          namespace: vllm
        spec:
          accessModes:
            - ReadWriteOnce
          resources:
            requests:
              storage: 50Gi
          storageClassName: rwo
          volumeMode: Filesystem
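      # Runs the vLLM OpenAI-compatible server with one NVIDIA GPU. The
      # in-memory emptyDir mounted at /dev/shm gives PyTorch the shared memory
      # it expects; /health backs both probes once the model has loaded.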
      model-deployment:
        apiVersion: apps/v1
        kind: Deployment
        metadata:
          name: model-deployment
          namespace: vllm
          labels:
            app: model
        spec:
          replicas: 1
          selector:
            matchLabels:
              app: model
          template:
            metadata:
              labels:
                app: model
            spec:
              volumes:
                - name: cache-volume
                  persistentVolumeClaim:
                    claimName: model-pvc
                - name: shm
                  emptyDir:
                    medium: Memory
                    sizeLimit: "2Gi"
              containers:
                - name: model
                  image: vllm/vllm-openai:latest
                  command: ["/bin/sh", "-c"]
                  args:
                    - "vllm serve {{ $cndi.get_prompt_response(hf_model) }} --trust-remote-code --enable-chunked-prefill"
                  env:
                    - name: HUGGING_FACE_HUB_TOKEN
                      valueFrom:
                        secretKeyRef:
                          name: hf-secret
                          key: hf_api_token
                  ports:
                    - containerPort: 8000
                  resources:
                    limits:
                      cpu: "3"
                      memory: 10G
                      nvidia.com/gpu: "1"
                    requests:
                      cpu: "2"
                      memory: 6G
                      nvidia.com/gpu: "1"
                  volumeMounts:
                    - mountPath: /root/.cache/huggingface
                      name: cache-volume
                    - name: shm
                      mountPath: /dev/shm
                  livenessProbe:
                    httpGet:
                      path: /health
                      port: 8000
                    initialDelaySeconds: 60
                    periodSeconds: 10
                  readinessProbe:
                    httpGet:
                      path: /health
                      port: 8000
                    initialDelaySeconds: 60
                    periodSeconds: 5
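      # Exposes the model server inside the cluster: port 80 -> container port 8000.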
      vllm-svc:
        apiVersion: v1
        kind: Service
        metadata:
          name: model-service
          namespace: vllm
        spec:
          ports:
            - name: model
              port: 80
              protocol: TCP
              targetPort: 8000
          selector:
            app: model
          sessionAffinity: None
          type: ClusterIP
    applications:
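      # The NVIDIA GPU Operator installs the driver, container toolkit, and
      # device plugin so nodes advertise nvidia.com/gpu capacity to the scheduler.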
      gpu-operator:
        repoURL: https://helm.ngc.nvidia.com/nvidia
        chart: gpu-operator
        targetRevision: v24.9.0
        destinationNamespace: gpu-operator
        values: {} # https://github.com/NVIDIA/gpu-operator/blob/master/deployments/gpu-operator/values.yaml
  env:
    $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/git-credentials-{{ $cndi.get_prompt_response(git_credentials_mode) }}-env.yaml):
      {}
    $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/{{ $cndi.get_prompt_response(deployment_target_provider) }}/env.yaml):
      {}
    $cndi.get_block(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/cluster/env.yaml):
      {}
    $cndi.comment(hf): Hugging Face API token
    HF_API_TOKEN: "{{ $cndi.get_prompt_response(hf_api_token) }}"
  readme:
    project_name: "# {{ $cndi.get_prompt_response(project_name) }}"
    $cndi.get_string(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/common/cluster/core-readme.md):
      {}
    $cndi.get_string(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/{{ $cndi.get_prompt_response(deployment_target_provider) }}/core.md):
      {}
    $cndi.get_string(https://raw.githubusercontent.com/polyseam/cndi/main/blocks/{{ $cndi.get_prompt_response(deployment_target_provider) }}/{{ $cndi.get_prompt_response(deployment_target_distribution) }}.md):
      {}
    vllm: |
      ## vllm

      The [gpu-operator](https://github.com/nvidia/gpu-operator) is a Kubernetes operator that automatically adds capacity information to your Kubernetes nodes, denoting that a GPU is present.

      Once the GPU operator is installed, vLLM will consume your Hugging Face API token and deploy the model you specified in your prompts.

      Note: you will need a valid Hugging Face API token, and that token must be granted access to the model you want to deploy.

      To learn more about the vLLM deployment, take a look at the manifests defined in your [cndi_config.yaml](./cndi_config.yaml).
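      The Service is ClusterIP-only, so one quick way to try the deployment (assuming you have kubectl access to the cluster) is to port-forward to the OpenAI-compatible API; substitute the model name you chose at deploy time:

      ```bash
      kubectl port-forward -n vllm svc/model-service 8000:80

      # in another shell: list the served models, then request a completion
      curl http://localhost:8000/v1/models
      curl http://localhost:8000/v1/completions \
        -H "Content-Type: application/json" \
        -d '{"model": "meta-llama/Llama-3.2-1B-Instruct", "prompt": "Hello, my name is", "max_tokens": 32}'
      ```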