diff --git a/.gitignore b/.gitignore index 75ee0304..137bcdc8 100644 --- a/.gitignore +++ b/.gitignore @@ -82,3 +82,6 @@ coverage-*.html *.a hack/python-sdk/openapi-generator-cli-*.jar + +# Local Go build and module caches created by the Makefile (GOCACHE / GOMODCACHE) +.cache/ diff --git a/Makefile b/Makefile index dbfca1ad..9ccbfe73 100644 --- a/Makefile +++ b/Makefile @@ -26,6 +26,15 @@ GO_FMT ?= gofmt # Use go.mod go version as a single source of truth for the Go version GO_VERSION := $(shell awk '/^go /{print $$2}' go.mod|head -n1) +# Go build cache configuration: keep caches under the repo unless the caller already set GOCACHE/GOMODCACHE +GOCACHE ?= $(CURDIR)/.cache/go-build +GOMODCACHE ?= $(CURDIR)/.cache/go-mod +export GOCACHE +export GOMODCACHE + +# Ensure cache directories exist (runs at parse time, on every make invocation) +$(shell mkdir -p "$(GOCACHE)" "$(GOMODCACHE)") + # Determine Docker build command (use nerdctl if available) DOCKER_BUILD_CMD ?= docker @@ -36,6 +45,9 @@ else DOCKER_BUILD_CMD = nerdctl endif +# Enable Docker BuildKit, required by the RUN --mount=type=cache steps in the Dockerfiles +export DOCKER_BUILDKIT=1 + # CRD Options CRD_OPTIONS ?= "crd:maxDescLen=0" @@ -207,6 +219,14 @@ tidy: ## 📦 Run go mod tidy @$(GO_CMD) mod tidy @echo "✅ Dependencies cleaned up" +# 'go clean' must run before rm: the module cache is written read-only, so a plain rm -rf fails on it +.PHONY: clean-cache +clean-cache: ## 🧹 Clean Go build cache + @echo "🧹 Cleaning Go build cache..." + @$(GO_CMD) clean -cache -modcache + @rm -rf "$(GOCACHE)" "$(GOMODCACHE)" + @echo "✅ Cache cleaned" + .PHONY: ci-lint ci-lint: golangci-lint ## 🔎 Run golangci-lint against code. @echo "🔎 Running golangci-lint..." 
diff --git a/charts/ome-resources/templates/model-agent-daemonset/daemonset.yaml b/charts/ome-resources/templates/model-agent-daemonset/daemonset.yaml index 5f8954b7..f90b4e13 100644 --- a/charts/ome-resources/templates/model-agent-daemonset/daemonset.yaml +++ b/charts/ome-resources/templates/model-agent-daemonset/daemonset.yaml @@ -23,8 +23,15 @@ spec: serviceAccountName: {{ .Values.modelAgent.serviceAccountName }} affinity: {{- toYaml .Values.modelAgent.affinity | nindent 8 }} + {{- if or .Values.modelAgent.gpuNodesOnly .Values.modelAgent.nodeSelector }} nodeSelector: + {{- if .Values.modelAgent.gpuNodesOnly }} + {{ .Values.modelAgent.gpuNodeLabel.key }}: {{ .Values.modelAgent.gpuNodeLabel.value | quote }} + {{- end }} + {{- if .Values.modelAgent.nodeSelector }} {{- toYaml .Values.modelAgent.nodeSelector | nindent 8 }} + {{- end }} + {{- end }} {{- $imagePullSecrets := .Values.modelAgent.imagePullSecrets | default .Values.global.imagePullSecrets }} {{- if $imagePullSecrets }} imagePullSecrets: @@ -40,7 +47,7 @@ spec: {{- end }} containers: - name: model-agent - image: {{ include "ome.imageWithHub" (dict "values" .Values "repository" .Values.modelAgent.image.repository "tag" .Values.modelAgent.image.tag) }} + image: {{ include "ome.imageWithHub" (dict "values" (merge (dict "global" (dict "hub" (default .Values.global.hub .Values.modelAgent.image.hub))) .Values) "repository" .Values.modelAgent.image.repository "tag" .Values.modelAgent.image.tag) }} imagePullPolicy: {{ .Values.modelAgent.image.pullPolicy }} ports: - name: metrics diff --git a/charts/ome-resources/values.yaml b/charts/ome-resources/values.yaml index 6fbbb6c2..b09e9e04 100644 --- a/charts/ome-resources/values.yaml +++ b/charts/ome-resources/values.yaml @@ -92,10 +92,26 @@ modelAgent: priorityClassName: system-node-critical serviceAccountName: ome-model-agent image: + # Registry hub for the model-agent image. When non-empty it takes precedence over global.hub; the DaemonSet template falls back to global.hub otherwise. + # If repository contains '/', hub is ignored. 
+ # Leave empty to use global.hub + hub: "" repository: model-agent pullPolicy: Always tag: *defaultVersion + # When enabled, the model agent is scheduled only on nodes carrying the gpuNodeLabel label below + gpuNodesOnly: false + + # GPU node label selector (used when gpuNodesOnly is true) + # Examples: + # For NVIDIA GPU operator: nvidia.com/gpu.present: "true" + # For Nebius: nebius.com/gpu: "true" + # For AWS: node.kubernetes.io/instance-type: g4dn.xlarge + gpuNodeLabel: + key: nvidia.com/gpu.present + value: "true" + nodeSelector: {} # Additional volumes to mount into the model-agent DaemonSet pods diff --git a/dockerfiles/manager.Dockerfile b/dockerfiles/manager.Dockerfile index ba50b3c9..acfabf21 100644 --- a/dockerfiles/manager.Dockerfile +++ b/dockerfiles/manager.Dockerfile @@ -12,8 +12,9 @@ WORKDIR /workspace COPY go.mod go.mod COPY go.sum go.sum -# Download dependencies -RUN go mod download +# Download dependencies with Go module cache +RUN --mount=type=cache,target=/go/pkg/mod \ + go mod download # Copy source code COPY cmd/ cmd/ @@ -24,8 +25,10 @@ ARG VERSION ARG GIT_TAG ARG GIT_COMMIT -# Build the manager binary -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ +# Build the manager binary with Go build cache (no -a: forcing a full rebuild would bypass the cache mount) +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg/mod \ + CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ - go build -a -installsuffix cgo \ + go build -installsuffix cgo \ -ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \ -o manager ./cmd/manager diff --git a/dockerfiles/model-agent.Dockerfile b/dockerfiles/model-agent.Dockerfile index f02d9fca..8fd525d0 100644 --- a/dockerfiles/model-agent.Dockerfile +++ b/dockerfiles/model-agent.Dockerfile @@ -1,3 +1,8 @@ +# Configurable base image - must be declared before any FROM statement +# Defaults to Oracle Linux 10 for OCI SDK compatibility +# Can be overridden with --build-arg BASE_IMAGE=ubuntu:22.04 +ARG 
BASE_IMAGE=oraclelinux:10-slim + # Build the model-agent binary FROM golang:1.24 AS builder @@ -12,8 +17,9 @@ WORKDIR /workspace COPY go.mod go.mod COPY go.sum go.sum -# Download dependencies -RUN go mod download +# Download dependencies with Go module cache +RUN --mount=type=cache,target=/go/pkg/mod \ + go mod download # Copy source code COPY cmd/ cmd/ @@ -24,15 +30,27 @@ ARG VERSION ARG GIT_TAG ARG GIT_COMMIT -# Build the model-agent binary -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ +# Build the model-agent binary with Go build cache (no -a: forcing a full rebuild would bypass the cache mount) +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg/mod \ + CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ - go build -a -installsuffix cgo \ + go build -installsuffix cgo \ -ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \ -o model-agent ./cmd/model-agent -# Use Oracle Linux 9 as base image for OCI SDK compatibility -FROM oraclelinux:10-slim -RUN microdnf update -y && microdnf clean all +# Use the base image specified at the top of the file +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Install/update packages based on the base image (microdnf or apt-get, whichever exists; other bases are left untouched) +RUN if [ -f /usr/bin/microdnf ]; then \ + microdnf update -y && microdnf clean all; \ + elif [ -f /usr/bin/apt-get ]; then \ + apt-get update && \ + apt-get install -y --no-install-recommends ca-certificates && \ + apt-get upgrade -y && \ + apt-get clean && rm -rf /var/lib/apt/lists/*; \ + fi COPY --from=builder /workspace/model-agent / ENTRYPOINT ["/model-agent"] diff --git a/dockerfiles/multinode-prober.Dockerfile b/dockerfiles/multinode-prober.Dockerfile index fe193749..d3e04428 100644 --- a/dockerfiles/multinode-prober.Dockerfile +++ b/dockerfiles/multinode-prober.Dockerfile @@ -12,8 +12,9 @@ WORKDIR /workspace COPY go.mod go.mod COPY go.sum go.sum -# Download dependencies -RUN go mod download +# Download dependencies with Go module cache +RUN --mount=type=cache,target=/go/pkg/mod \ + go mod 
download # Copy source code COPY cmd/ cmd/ @@ -24,8 +25,10 @@ ARG VERSION ARG GIT_TAG ARG GIT_COMMIT -# Build the multinode-prober binary -RUN CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ +# Build the multinode-prober binary with Go build cache (no -a: forcing a full rebuild would bypass the cache mount) +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg/mod \ + CGO_ENABLED=0 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ - go build -a -installsuffix cgo \ + go build -installsuffix cgo \ -ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \ -o multinode-prober ./cmd/multinode-prober diff --git a/dockerfiles/ome-agent.Dockerfile b/dockerfiles/ome-agent.Dockerfile index 87a60dab..69578ddf 100644 --- a/dockerfiles/ome-agent.Dockerfile +++ b/dockerfiles/ome-agent.Dockerfile @@ -1,3 +1,8 @@ +# Configurable base image - must be declared before any FROM statement +# Defaults to Oracle Linux 10 for OCI SDK compatibility +# Can be overridden with --build-arg BASE_IMAGE=ubuntu:22.04 +ARG BASE_IMAGE=oraclelinux:10-slim + # Build the ome-agent binary FROM golang:1.24 AS builder @@ -23,39 +28,61 @@ WORKDIR /workspace COPY go.mod go.mod COPY go.sum go.sum -# Download dependencies -RUN go mod download +# Download dependencies with Go module cache +RUN --mount=type=cache,target=/go/pkg/mod \ + go mod download + +# Copy XET dependencies from other pkg subdirectories +COPY pkg/configutils/ pkg/configutils/ +COPY pkg/logging/ pkg/logging/ + +# Copy XET package for building with better caching +COPY pkg/xet/ pkg/xet/ + +# Download Rust dependencies with cargo cache +RUN --mount=type=cache,target=/root/.cargo/registry \ + --mount=type=cache,target=/root/.cargo/git \ + cd pkg/xet && cargo fetch -# Copy source code +# Build the XET library (only the cargo registry and git directories are cache-mounted; build output stays in this layer) +RUN --mount=type=cache,target=/root/.cargo/registry \ + --mount=type=cache,target=/root/.cargo/git \ + cd pkg/xet && \ + cargo build --release + +# Copy remaining source code 
COPY cmd/ cmd/ COPY pkg/ pkg/ COPY internal/ internal/ -# Build the XET library first -RUN cd pkg/xet && make build - # Build arguments for version info ARG VERSION ARG GIT_TAG ARG GIT_COMMIT # Build the ome-agent binary (CGO must be enabled for XET library) -RUN CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ +RUN --mount=type=cache,target=/root/.cache/go-build \ + --mount=type=cache,target=/go/pkg/mod \ + CGO_ENABLED=1 GOOS=${TARGETOS:-linux} GOARCH=${TARGETARCH} \ go build -a \ -ldflags "-X github.com/sgl-project/ome/pkg/version.GitVersion=${GIT_TAG} -X github.com/sgl-project/ome/pkg/version.GitCommit=${GIT_COMMIT}" \ -o ome-agent ./cmd/ome-agent -# Use Oracle Linux 9 as base image for OCI SDK compatibility -FROM oraclelinux:10-slim -RUN microdnf update -y && microdnf clean all - -# Install runtime dependencies for the XET library -RUN microdnf install -y \ - glibc \ - libgcc \ - libstdc++ \ - openssl-libs \ - && microdnf clean all +# Use the base image specified at the top of the file +ARG BASE_IMAGE +FROM ${BASE_IMAGE} + +# Install/update packages and XET runtime dependencies based on the base image (the apt-get branch also installs ca-certificates, matching model-agent.Dockerfile) +RUN if [ -f /usr/bin/microdnf ]; then \ + microdnf update -y && \ + microdnf install -y glibc libgcc libstdc++ openssl-libs && \ + microdnf clean all; \ + elif [ -f /usr/bin/apt-get ]; then \ + apt-get update && \ + apt-get upgrade -y && \ + apt-get install -y --no-install-recommends ca-certificates libc6 libgcc-s1 libstdc++6 libssl3 && \ + apt-get clean && rm -rf /var/lib/apt/lists/*; \ + fi COPY --from=builder /workspace/ome-agent / COPY config/ome-agent/ome-agent.yaml / diff --git a/pkg/hfutil/hub/repo.go b/pkg/hfutil/hub/repo.go index 15b008ef..880a9668 100644 --- a/pkg/hfutil/hub/repo.go +++ b/pkg/hfutil/hub/repo.go @@ -288,9 +288,11 @@ func SnapshotDownload(ctx context.Context, config *DownloadConfig) (string, erro if totalErrors > 0 { // Collect error details var errorFiles []string + var errorMessages []string for _, result := range results { if result.err != nil { errorFiles = 
append(errorFiles, filesToDownload[result.index].Path) + errorMessages = append(errorMessages, fmt.Sprintf("%s: %v", filesToDownload[result.index].Path, result.err)) } }