566 changes: 566 additions & 0 deletions .github/workflows/automation.yaml

Large diffs are not rendered by default.

17 changes: 13 additions & 4 deletions config/models/kustomization.yaml
@@ -2,10 +2,19 @@ apiVersion: kustomize.config.k8s.io/v1beta1
 kind: Kustomization

 resources:
-- meta/Llama-3.3-70B-instruct.yaml
-- meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
-- meta/Llama-4-Scout-17B-16E-Instruct.yaml
 - intfloat/e5-mistral-7b-instruct.yaml
 - microsoft/Phi-3-vision-128k-instruct.yaml
 - deepseek-ai/DeepSeek-V3.yaml
-- deepseek-ai/DeepSeek-R1.yaml
+- deepseek-ai/DeepSeek-R1.yaml
+- meta/Llama-3.1-405B-Instruct-FP8.yaml
+- meta/Llama-3.1-8B-Instruct.yaml
+- meta/Llama-3.1-70B-Instruct.yaml
+- meta/Llama-3.2-11B-Vision-Instruct.yaml
+- meta/Llama-3.2-90B-Vision-Instruct.yaml
+- meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml
+- meta/Llama-3.3-70B-Instruct.yaml
+- meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml
+- meta/Llama-4-Maverick-17B-128E-Instruct-FP8.yaml
+- meta/Llama-4-Scout-17B-16E-Instruct.yaml
+- openai/gpt-oss-20b.yaml
+- openai/gpt-oss-120b.yaml
4 changes: 2 additions & 2 deletions config/models/meta/Llama-3.1-405B-Instruct-FP8.yaml
@@ -7,6 +7,6 @@ spec:
 disabled: false
 version: "1.0.0"
 storage:
-storageUri: hf://meta-llama/Meta-Llama-3.1-405B-Instruct-FP8
-path: /raid/models/meta/Llama-3.1-405B-Instruct-FP8
+storageUri: hf://meta-llama/Llama-3.1-405B-Instruct-FP8
+path: /raid/models/meta/llama-3-1-405b-instruct-fp8
 key: "hf-token"
13 changes: 13 additions & 0 deletions config/models/meta/Llama-3.1-8B-Instruct.yaml
@@ -0,0 +1,13 @@
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
name: llama-3-1-8b-instruct
spec:
vendor: meta
disabled: false
version: "1.0.0"
displayName: meta.llama-3.1-8b-instruct
storage:
storageUri: hf://meta-llama/Llama-3.1-8B-Instruct
path: /raid/models/meta/llama-3-1-8b-instruct
key: "hf-token"
2 changes: 1 addition & 1 deletion config/models/meta/Llama-3.2-11B-Vision-Instruct.yaml
@@ -9,5 +9,5 @@ spec:
 version: "1.0.0"
 storage:
 storageUri: hf://meta-llama/Llama-3.2-11B-Vision-Instruct
-path: /raid/models/meta/Llama-3.2-11B-Vision-Instruct
+path: /raid/models/meta/llama-3-2-11b-vision-instruct
Collaborator
Why change this?

Collaborator Author
This is just to be consistent with what is currently in the cluster. All the existing models follow this all-lowercase pattern.

key: "hf-token"
2 changes: 1 addition & 1 deletion config/models/meta/Llama-3.2-3B-Instruct.yaml
@@ -9,5 +9,5 @@ spec:
 version: "1.0.0"
 storage:
 storageUri: hf://meta-llama/Llama-3.2-3B-Instruct
-path: /raid/models/meta/Llama-3.2-3B-Instruct
+path: /raid/models/meta/llama-3-2-3b-instruct
Collaborator
Same here. These are just paths mimicking the model ID. We don't have to change them; please revert.

Collaborator Author
All lowercase or mimicking the model ID: we just need to agree on a pattern, then I will clean up the rest.

key: "hf-token"
13 changes: 13 additions & 0 deletions config/models/meta/Llama-3.2-90B-Vision-Instruct-FP8.yaml
@@ -0,0 +1,13 @@
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
name: llama-3-2-90b-vision-instruct-fp8
spec:
displayName: meta.llama-3.2-90b-vision-instruct-fp8
vendor: meta
disabled: false
version: "1.0.0"
storage:
storageUri: hf://RedHatAI/Llama-3.2-90B-Vision-Instruct-FP8-dynamic
path: /raid/models/meta/llama-3-2-90b-vision-instruct-fp8-dynamic
key: "hf-token"
2 changes: 1 addition & 1 deletion config/models/meta/Llama-3.2-90B-Vision-Instruct.yaml
@@ -9,5 +9,5 @@ spec:
 version: "1.0.0"
 storage:
 storageUri: hf://meta-llama/Llama-3.2-90B-Vision-Instruct
-path: /raid/models/meta/Llama-3.2-90B-Vision-Instruct
+path: /raid/models/meta/llama-3-2-90b-vision-instruct
key: "hf-token"
13 changes: 13 additions & 0 deletions config/models/meta/Llama-3.3-70B-Instruct-FP8-dynamic.yaml
@@ -0,0 +1,13 @@
apiVersion: ome.io/v1beta1
kind: ClusterBaseModel
metadata:
name: llama-3-3-70b-instruct-fp8-dynamic
spec:
disabled: false
displayName: meta.llama-3.3-70b-instruct-fp8-dynamic
storage:
storageUri: hf://RedHatAI/Llama-3.3-70B-Instruct-FP8-dynamic
path: /raid/models/meta/llama-3-3-70b-instruct-fp8-dynamic
key: "hf-token"
vendor: meta
version: "1.0.0"
14 changes: 11 additions & 3 deletions config/runtimes/kustomization.yaml
@@ -4,10 +4,18 @@ kind: Kustomization
 resources:
 - srt/deepseek-rdma-pd-rt.yaml
 - srt/deepseek-rdma-rt.yaml
+- srt/e5-mistral-7b-instruct-rt.yaml
+- srt/llama-3-1-8b-instruct-rt.yaml
+- vllm/llama-3-1-8b-instruct-rt.yaml
+- srt/llama-3-1-70b-instruct-rt.yaml
+- vllm/llama-3-1-405b-instruct-fp8-rt.yaml
+- vllm/llama-3-2-11b-vision-instruct-rt.yaml
+- vllm/llama-3-2-90b-vision-instruct-rt.yaml
+- vllm/llama-3-2-90b-vision-instruct-fp8-dynamic-rt.yaml
+- srt/llama-3-3-70b-instruct-rt.yaml
+- srt/llama-3-3-70b-instruct-pd-rt.yaml
+- srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
 - srt/llama-4-maverick-17b-128e-instruct-fp8-rt.yaml
 - srt/llama-4-maverick-17b-128e-instruct-fp8-pd-rt.yaml
 - srt/llama-4-scout-17b-16e-instruct-rt.yaml
 - srt/llama-4-scout-17b-16e-instruct-pd-rt.yaml
-- srt/e5-mistral-7b-instruct-rt.yaml
-- srt/llama-3-3-70b-instruct-rt.yaml
-- srt/llama-3-3-70b-instruct-pd-rt.yaml
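
These two kustomizations (config/models and config/runtimes) would typically be aggregated by a parent overlay; a hypothetical sketch, assuming they are referenced from a repository-root kustomization (the paths are assumptions, not shown in this PR):

apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
resources:
  - config/models    # the ClusterBaseModel list above
  - config/runtimes  # the ClusterServingRuntime list above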
129 changes: 129 additions & 0 deletions config/runtimes/srt/llama-3-1-8b-instruct-rt.yaml
@@ -0,0 +1,129 @@
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
name: srt-llama-3-1-8b-instruct
spec:
disabled: false
supportedModelFormats:
- modelFramework:
name: transformers
version: "4.42.3"
modelFormat:
name: safetensors
version: "1.0.0"
modelArchitecture: LlamaForCausalLM
autoSelect: false
priority: 1
version: "1.0.0"
protocolVersions:
- openAI
modelSizeRange:
min: 7B
max: 9B
engineConfig:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
labels:
logging-forward: enabled
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
volumes:
- name: dshm
emptyDir:
medium: Memory
runner:
name: ome-container
image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
ports:
- containerPort: 8080
name: http1
protocol: TCP
command:
- /bin/bash
- '-lc'
- --
args:
- |
python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port=8080 \
--enable-metrics \
--log-requests \
--model-path="$MODEL_PATH" \
--tp-size=1 \
--mem-frac=0.9
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 10
memory: 30Gi
nvidia.com/gpu: 1
limits:
cpu: 10
memory: 30Gi
nvidia.com/gpu: 1

readinessProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 3
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 200

livenessProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 5
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 60

startupProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 150
successThreshold: 1
periodSeconds: 6
initialDelaySeconds: 60
timeoutSeconds: 30
routerConfig:
runner:
name: router
image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
resources:
limits:
cpu: "1"
memory: "2Gi"
ports:
- containerPort: 8080
name: http
command:
- sh
- -c
- >
python3 -m sglang_router.launch_router
--host "0.0.0.0"
--port "8080"
--service-discovery
--service-discovery-namespace "${NAMESPACE}"
--service-discovery-port 8080
--selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: INFERENCESERVICE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['ome.io/inferenceservice']
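
A runtime like this is selected by an InferenceService that names the model and runtime explicitly (autoSelect is false above). A minimal sketch, assuming the ome.io/v1beta1 InferenceService schema with model and runtime references by name; the metadata values are illustrative, not part of this PR:

apiVersion: ome.io/v1beta1
kind: InferenceService
metadata:
  name: llama-3-1-8b-instruct   # illustrative
  namespace: demo               # illustrative
spec:
  model:
    name: llama-3-1-8b-instruct       # ClusterBaseModel added in this PR
  runtime:
    name: srt-llama-3-1-8b-instruct   # this ClusterServingRuntime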
128 changes: 128 additions & 0 deletions config/runtimes/srt/llama-3-3-70b-instruct-fp8-dynamic-rt.yaml
@@ -0,0 +1,128 @@
apiVersion: ome.io/v1beta1
kind: ClusterServingRuntime
metadata:
name: srt-llama-3-3-70b-instruct-fp8-dynamic
spec:
disabled: false
supportedModelFormats:
- modelFramework:
name: transformers
version: "4.47.0.dev0"
modelFormat:
name: safetensors
version: "1.0.0"
modelArchitecture: LlamaForCausalLM
autoSelect: false
priority: 1
protocolVersions:
- openAI
modelSizeRange:
min: 60B
max: 75B
engineConfig:
annotations:
prometheus.io/scrape: "true"
prometheus.io/port: "8080"
prometheus.io/path: "/metrics"
labels:
logging-forward: enabled
tolerations:
- key: "nvidia.com/gpu"
operator: "Exists"
effect: "NoSchedule"
volumes:
- name: dshm
emptyDir:
medium: Memory
runner:
name: ome-container
image: docker.io/lmsysorg/sglang:v0.4.8.post1-cu126
ports:
- containerPort: 8080
name: http1
protocol: TCP
command:
- /bin/bash
- '-lc'
- --
args:
- |
python3 -m sglang.launch_server \
--host=0.0.0.0 \
--port=8080 \
--enable-metrics \
--log-requests \
--model-path="$MODEL_PATH" \
--tp-size=2 \
--mem-frac=0.9
volumeMounts:
- mountPath: /dev/shm
name: dshm
resources:
requests:
cpu: 10
memory: 160Gi
nvidia.com/gpu: 2
limits:
cpu: 10
memory: 160Gi
nvidia.com/gpu: 2

readinessProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 3
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 200

livenessProbe:
httpGet:
path: /health
port: 8080
failureThreshold: 5
successThreshold: 1
periodSeconds: 60
timeoutSeconds: 60

startupProbe:
httpGet:
path: /health_generate
port: 8080
failureThreshold: 150
successThreshold: 1
periodSeconds: 6
initialDelaySeconds: 60
timeoutSeconds: 30
routerConfig:
runner:
name: router
image: ghcr.io/moirai-internal/sgl-router:0.1.4.30f2a44
resources:
limits:
cpu: "1"
memory: "2Gi"
ports:
- containerPort: 8080
name: http
command:
- sh
- -c
- >
python3 -m sglang_router.launch_router
--host "0.0.0.0"
--port "8080"
--service-discovery
--service-discovery-namespace "${NAMESPACE}"
--service-discovery-port 8080
--selector component=engine ome.io/inferenceservice=${INFERENCESERVICE_NAME}
env:
- name: NAMESPACE
valueFrom:
fieldRef:
fieldPath: metadata.namespace
- name: INFERENCESERVICE_NAME
valueFrom:
fieldRef:
fieldPath: metadata.labels['ome.io/inferenceservice']
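
Note on the routerConfig shared by both runtimes: the sgl-router discovers engine pods through the Kubernetes API rather than a static backend list, so the engine pods must carry the labels named in --selector. A sketch of the labels that selector implies (the mechanism is inferred from the flags above; the inferenceservice value is illustrative):

metadata:
  labels:
    component: engine                                             # matched by --selector
    ome.io/inferenceservice: llama-3-3-70b-instruct-fp8-dynamic   # illustrative; set per service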