Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -82,3 +82,6 @@ coverage-*.html
*.a

hack/python-sdk/openapi-generator-cli-*.jar

# Go build cache directories
.cache/
26 changes: 25 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,18 @@ GO_FMT ?= gofmt
# Use go.mod go version as a single source of truth for the Go version
GO_VERSION := $(shell awk '/^go /{print $$2}' go.mod|head -n1)

# Go build cache configuration
# A project-local cache is used only when CUSTOM_GO_CACHE is set, so the
# default Go cache locations (and any CI caching built on them) are untouched.
ifdef CUSTOM_GO_CACHE
# $(CURDIR) is make's built-in absolute working directory. Unlike $(shell pwd)
# in a recursive (?=) assignment, it does not fork a shell each time the
# variable is expanded (exported variables are expanded for every recipe).
GOCACHE ?= $(CURDIR)/.cache/go-build
GOMODCACHE ?= $(CURDIR)/.cache/go-mod
export GOCACHE
export GOMODCACHE

# Ensure cache directories exist (quoted so paths containing spaces still work)
$(shell mkdir -p "$(GOCACHE)" "$(GOMODCACHE)")
endif

# Determine Docker build command (use nerdctl if available)
DOCKER_BUILD_CMD ?= docker

Expand All @@ -36,6 +48,9 @@ else
DOCKER_BUILD_CMD = nerdctl
endif

# Enable Docker BuildKit for cache mounts
export DOCKER_BUILDKIT=1

# CRD Options
CRD_OPTIONS ?= "crd:maxDescLen=0"

Expand Down Expand Up @@ -192,7 +207,7 @@ fmt: install-goimports ## 🧹 Run go fmt and goimports against code
@echo "🧹 Formatting Go code..."
@$(GO_CMD) fmt ./...
@echo "🧹 Organizing imports in Go files..."
@find . -name '*.go' -not -path '*/vendor/*' -not -exec grep -q '// Code generated' {} \; -exec $(GOIMPORTS) -w {} +
@find . -name '*.go' -not -path '*/vendor/*' -not -path '*/.cache/*' -not -exec grep -q '// Code generated' {} \; -exec $(GOIMPORTS) -w {} +
@echo "✅ Formatting complete"

.PHONY: vet
Expand All @@ -207,6 +222,15 @@ tidy: ## 📦 Run go mod tidy
@$(GO_CMD) mod tidy
@echo "✅ Dependencies cleaned up"

# clean-cache: remove the Go build cache and module cache.
# `go clean -cache -modcache` runs first because Go writes module cache
# directories read-only; a plain `rm -rf` on a populated module cache fails
# with "Permission denied" for non-root users and would abort the recipe
# before `go clean` ever ran. When CUSTOM_GO_CACHE is set, the now-empty
# project-local cache directories are removed afterwards.
.PHONY: clean-cache
clean-cache: ## 🧹 Clean Go build cache
	@echo "🧹 Cleaning Go build cache..."
	@$(GO_CMD) clean -cache -modcache
ifdef CUSTOM_GO_CACHE
	@rm -rf .cache/go-build .cache/go-mod
endif
	@echo "✅ Cache cleaned"

.PHONY: ci-lint
ci-lint: golangci-lint ## 🔎 Run golangci-lint against code.
@echo "🔎 Running golangci-lint..."
Expand Down
2 changes: 1 addition & 1 deletion Makefile-deps.mk
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ golangci-lint: fix-tools-gomod ## 🔍 Download golangci-lint locally if necessa
@echo "✅ Installation complete"


GOIMPORTS = $(PROJECT_DIR)/bin/staticcheck
STATICCHECK = $(PROJECT_DIR)/bin/staticcheck
.PHONY: staticcheck
staticcheck: fix-tools-gomod ## 🔎 Download staticcheck locally if necessary
@echo "🔎 Installing staticcheck..."
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,15 @@ spec:
serviceAccountName: {{ .Values.modelAgent.serviceAccountName }}
affinity:
{{- toYaml .Values.modelAgent.affinity | nindent 8 }}
{{- if or .Values.modelAgent.gpuNodesOnly .Values.modelAgent.nodeSelector }}
nodeSelector:
{{- if .Values.modelAgent.gpuNodesOnly }}
{{ .Values.modelAgent.gpuNodeLabel.key }}: {{ .Values.modelAgent.gpuNodeLabel.value | quote }}
{{- end }}
{{- if .Values.modelAgent.nodeSelector }}
{{- toYaml .Values.modelAgent.nodeSelector | nindent 8 }}
{{- end }}
{{- end }}
{{- $imagePullSecrets := .Values.modelAgent.imagePullSecrets | default .Values.global.imagePullSecrets }}
{{- if $imagePullSecrets }}
imagePullSecrets:
Expand All @@ -40,7 +47,7 @@ spec:
{{- end }}
containers:
- name: model-agent
image: {{ include "ome.imageWithHub" (dict "values" .Values "repository" .Values.modelAgent.image.repository "tag" .Values.modelAgent.image.tag) }}
image: {{ include "ome.imageWithHub" (dict "values" (merge (dict "global" (dict "hub" (default .Values.global.hub .Values.modelAgent.image.hub))) .Values) "repository" .Values.modelAgent.image.repository "tag" .Values.modelAgent.image.tag) }}
imagePullPolicy: {{ .Values.modelAgent.image.pullPolicy }}
ports:
- name: metrics
Expand All @@ -56,6 +63,10 @@ spec:
valueFrom:
fieldRef:
fieldPath: spec.nodeName
{{- range $key, $value := .Values.modelAgent.env }}
- name: {{ $key }}
value: {{ $value | quote }}
{{- end }}
volumeMounts:
- name: host-models
readOnly: false
Expand Down
25 changes: 24 additions & 1 deletion charts/ome-resources/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,26 @@ modelAgent:
priorityClassName: system-node-critical
serviceAccountName: ome-model-agent
image:
# Docker registry hub. If set, overrides global.hub for model-agent.
# If repository contains '/', hub is ignored.
# Leave empty to use global.hub
hub: ""
repository: model-agent
pullPolicy: Always
tag: *defaultVersion

# When enabled, the model agent will only run on nodes with GPU
gpuNodesOnly: false

# GPU node label selector (used when gpuNodesOnly is true)
# Examples:
# For NVIDIA GPU operator: nvidia.com/gpu.present: "true"
# For Nebius: nebius.com/gpu: "true"
# For AWS: node.kubernetes.io/instance-type: g4dn.xlarge
gpuNodeLabel:
key: nvidia.com/gpu.present
value: "true"

nodeSelector: {}

# Additional volumes to mount into the model-agent DaemonSet pods
Expand Down Expand Up @@ -125,7 +141,14 @@ modelAgent:

health:
port: 8080


# Additional environment variables for the model-agent container
# Examples:
# env:
# LOG_LEVEL: "debug"
# DOWNLOAD_RETRY: "5"
env: {}

resources:
limits:
cpu: '10'
Expand Down
2 changes: 2 additions & 0 deletions cmd/model-agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ import (
"fmt"
"net/http"
"os"
"strings"
"time"

"github.com/prometheus/client_golang/prometheus"
Expand Down Expand Up @@ -74,6 +75,7 @@ func init() {
rootCmd.PersistentFlags().StringVar(&cfg.logLevel, "log-level", "info", "Log level (debug, info, warn, error)")

_ = v.BindPFlags(rootCmd.PersistentFlags())
v.SetEnvKeyReplacer(strings.NewReplacer("-", "_"))
v.AutomaticEnv()
}

Expand Down
11 changes: 7 additions & 4 deletions dockerfiles/manager.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@ WORKDIR /workspace
COPY go.mod go.mod
COPY go.sum go.sum

# Download dependencies
RUN go mod download
# Download dependencies with Go module cache
RUN --mount=type=cache,target=/go/pkg/mod \
go mod download

# Copy source code
COPY cmd/ cmd/
Expand All @@ -38,8 +39,10 @@ ARG VERSION
ARG GIT_TAG
ARG GIT_COMMIT

# Build the manager binary (CGO must be enabled for XET library)
RUN CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
# Build the manager binary with Go build cache (CGO must be enabled for XET library)
# Two BuildKit cache mounts are attached for this step: /root/.cache/go-build
# and /go/pkg/mod. GIT_TAG and GIT_COMMIT are stamped into pkg/version via -ldflags.
# NOTE(review): `go build -a` forces rebuilding of packages that are already
# up to date, so the go-build cache mount may provide little benefit here;
# confirm whether -a is still required.
RUN --mount=type=cache,target=/root/.cache/go-build \
--mount=type=cache,target=/go/pkg/mod \
CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
go build -a \
-ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \
-o manager ./cmd/manager
Expand Down
57 changes: 42 additions & 15 deletions dockerfiles/model-agent.Dockerfile
Original file line number Diff line number Diff line change
@@ -1,3 +1,9 @@
# Configurable base image - must be declared before any FROM statement
# Defaults to Oracle Linux 10 for OCI SDK compatibility
# Can be overridden with --build-arg BASE_IMAGE=ubuntu:24.04
# Note: Ubuntu 22.04 has glibc 2.35, but golang:1.24 requires glibc 2.38+
ARG BASE_IMAGE=oraclelinux:10-slim

# Build the model-agent binary
FROM golang:1.24 AS builder

Expand All @@ -23,8 +29,9 @@ WORKDIR /workspace
COPY go.mod go.mod
COPY go.sum go.sum

# Download dependencies
RUN go mod download
# Download dependencies with Go module cache
RUN --mount=type=cache,target=/go/pkg/mod \
go mod download

# Copy source code
COPY cmd/ cmd/
Expand All @@ -33,28 +40,48 @@ COPY pkg/ pkg/
# Build the XET library first
RUN cd pkg/xet && make build

# Verify static library exists and remove dynamic library to force static linking
RUN ls -lh /workspace/pkg/xet/target/release/libxet.* && \
rm -f /workspace/pkg/xet/target/release/libxet.so

# Build arguments for version info
ARG VERSION
ARG GIT_TAG
ARG GIT_COMMIT

# Build the model-agent binary (CGO must be enabled for XET library)
RUN CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
# Build the model-agent binary with Go build cache (CGO must be enabled for XET library)
# Two BuildKit cache mounts are attached for this step: /root/.cache/go-build
# and /go/pkg/mod. PKG_CONFIG_ALL_STATIC=1 is set for the build; presumably it
# makes pkg-config emit static link flags for the XET library (TODO confirm).
# GIT_TAG and GIT_COMMIT are stamped into pkg/version via -ldflags.
# NOTE(review): `go build -a` forces rebuilding of packages that are already
# up to date, so the go-build cache mount may provide little benefit here.
# It may be deliberate because the XET library is built outside of Go;
# confirm before removing.
RUN --mount=type=cache,target=/root/.cache/go-build \
--mount=type=cache,target=/go/pkg/mod \
PKG_CONFIG_ALL_STATIC=1 \
CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
go build -a \
-ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \
-o model-agent ./cmd/model-agent

# Use Oracle Linux 9 as base image for OCI SDK compatibility
FROM oraclelinux:10-slim
RUN microdnf update -y && microdnf clean all

# Install runtime dependencies for the XET library
RUN microdnf install -y \
glibc \
libgcc \
libstdc++ \
openssl-libs \
&& microdnf clean all
# Use the base image specified at the top of the file
ARG BASE_IMAGE
FROM ${BASE_IMAGE}

# Install/update packages and runtime dependencies based on the base image
# The package manager is detected by probing for its binary:
#   - /usr/bin/microdnf present: update, then install glibc, libgcc,
#     libstdc++ and openssl-libs.
#   - /usr/bin/apt-get present: install ca-certificates, libc6, libgcc-s1,
#     libstdc++6 and libssl3, then upgrade and remove the apt lists.
# NOTE(review): there is no else branch, so a BASE_IMAGE with neither package
# manager passes this step without installing anything; confirm that is intended.
RUN if [ -f /usr/bin/microdnf ]; then \
microdnf update -y && \
microdnf install -y \
glibc \
libgcc \
libstdc++ \
openssl-libs && \
microdnf clean all; \
elif [ -f /usr/bin/apt-get ]; then \
apt-get update && \
apt-get install -y \
ca-certificates \
libc6 \
libgcc-s1 \
libstdc++6 \
libssl3 && \
apt-get upgrade -y && \
apt-get clean && rm -rf /var/lib/apt/lists/*; \
fi

COPY --from=builder /workspace/model-agent /
ENTRYPOINT ["/model-agent"]
11 changes: 7 additions & 4 deletions dockerfiles/multinode-prober.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,9 @@ WORKDIR /workspace
COPY go.mod go.mod
COPY go.sum go.sum

# Download dependencies
RUN go mod download
# Download dependencies with Go module cache
RUN --mount=type=cache,target=/go/pkg/mod \
go mod download

# Copy source code
COPY cmd/ cmd/
Expand All @@ -24,8 +25,10 @@ ARG VERSION
ARG GIT_TAG
ARG GIT_COMMIT

# Build the multinode-prober binary
RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
# Build the multinode-prober binary with Go build cache
# Two BuildKit cache mounts are attached for this step: /root/.cache/go-build
# and /go/pkg/mod. CGO is disabled for this binary. GIT_TAG and GIT_COMMIT are
# stamped into pkg/version via -ldflags.
# NOTE(review): `go build -a` forces rebuilding of packages that are already
# up to date, so the go-build cache mount may provide little benefit here;
# confirm whether -a (and -installsuffix cgo) are still required.
RUN --mount=type=cache,target=/root/.cache/go-build \
--mount=type=cache,target=/go/pkg/mod \
CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \
go build -a -installsuffix cgo \
-ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \
-o multinode-prober ./cmd/multinode-prober
Expand Down
Loading
Loading