Skip to content

Commit

Permalink
DCGM-Exporter 4.0.0 (#437)
Browse files Browse the repository at this point in the history
- Update to DCGM 4.0.0
- Major refactor to enable clean mock testing
- Refactor metric collection to align with prometheus best practices
- Many more bug fixes and improvements
  • Loading branch information
glowkey authored Jan 7, 2025
1 parent 900d465 commit 5f9250c
Show file tree
Hide file tree
Showing 163 changed files with 20,574 additions and 6,517 deletions.
70 changes: 39 additions & 31 deletions .devcontainer/Dockerfile
Original file line number Diff line number Diff line change
@@ -1,31 +1,39 @@
FROM nvcr.io/nvidia/cuda:12.3.1-base-ubuntu22.04
ARG GOLANG_VERSION=1.21.5
FROM nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
ARG GOLANG_VERSION=1.22.5
ARG USERNAME=developer
ARG USER_UID=1000
ARG USER_GID=1000
ARG DCGM_VERSION=3.3.3
# Create the group $USERNAME (GID=$USER_GID) and the user $USERNAME
# (UID=$USER_UID) with that group as its primary group, then add the user to
# the 'sudo' group. The user id must come from USER_UID, not USER_GID, so that
# overriding either build argument has the intended effect.
RUN groupadd -g $USER_GID $USERNAME && \
    useradd -m -u $USER_UID -g $USERNAME -s /bin/bash $USERNAME && \
    usermod -aG sudo $USERNAME
# Allow 'developer' to use sudo without a password
RUN echo "$USERNAME ALL=(ALL) NOPASSWD:ALL" >> /etc/sudoers

RUN --mount=type=cache,target=/var/cache/apt \
set -eux; \
apt-get update; \
apt-get install -y --no-install-recommends \
git \
ca-certificates \
g++ \
gcc \
libc6-dev \
make \
pkg-config \
wget \
datacenter-gpu-manager=1:${DCGM_VERSION} \
libcap2-bin \
&& apt-get autoremove -y \
git \
ca-certificates \
g++ \
gcc \
libc6-dev \
make \
pkg-config \
wget \
datacenter-gpu-manager-4-core \
libcap2-bin \
&& install -m 0755 -d /etc/apt/keyrings \
&& wget -O /etc/apt/keyrings/docker.asc https://download.docker.com/linux/ubuntu/gpg \
&& chmod a+r /etc/apt/keyrings/docker.asc \
&& echo \
"deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu \
$(. /etc/os-release && echo "$VERSION_CODENAME") stable" | \
tee /etc/apt/sources.list.d/docker.list > /dev/null \
&& apt-get update \
&& apt-get install -y --no-install-recommends docker-ce docker-ce-cli containerd.io docker-buildx-plugin \
&& apt-get autoremove -y \
&& rm -rfd /usr/local/dcgm/bindings /usr/local/dcgm/sdk_samples /usr/share/nvidia-validation-suite \
# DCGM exporter doesn't use libdcgm_cublas_proxy*.so.
&& rm -rf /usr/lib/x86_64-linux-gnu/libdcgm_cublas_proxy*.so \
Expand All @@ -36,25 +44,25 @@ RUN set -eux; \
url=; \
echo "$arch"; \
case "$arch" in \
'amd64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
;; \
'arm64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
;; \
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
'amd64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-amd64.tar.gz"; \
;; \
'arm64') \
url="https://dl.google.com/go/go${GOLANG_VERSION}.linux-arm64.tar.gz"; \
;; \
*) echo >&2 "error: unsupported architecture '$arch' (likely packaging update needed)"; exit 1 ;; \
esac; \
build=; \
if [ -z "$url" ]; then \
# https://github.com/golang/go/issues/38536#issuecomment-616897960
build=1; \
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
echo >&2; \
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
echo >&2; \
# https://github.com/golang/go/issues/38536#issuecomment-616897960
build=1; \
url="https://dl.google.com/go/go${GOLANG_VERSION}.src.tar.gz"; \
echo >&2; \
echo >&2 "warning: current architecture ($arch) does not have a compatible Go binary release; will be building from source"; \
echo >&2; \
fi; \
wget -O go.tgz "$url" --progress=dot:giga; \
tar -C /usr/local -xzf go.tgz; \
wget -O go.tgz "$url" --progress=dot:giga; \
tar -C /usr/local -xzf go.tgz; \
rm go.tgz
ENV GOTOOLCHAIN=local
ENV GOPATH /go
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/go.yml
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ jobs:
- name: Set up Go
uses: actions/setup-go@v2
with:
go-version: 1.21
go-version: 1.22

- name: Build
run: make binary
Expand Down
6 changes: 4 additions & 2 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,9 @@ tests.cov
test_results.json
.scannerwork
dist/
.run/
.run
dist/

###############################################################################
# JetBrains
# https://github.com/github/gitignore/blob/master/Global/JetBrains.gitignore
Expand Down Expand Up @@ -236,4 +238,4 @@ $RECYCLE.BIN/
*.msp

# Windows shortcuts
*.lnk
*.lnk
22 changes: 14 additions & 8 deletions .vscode/launch.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,18 @@
"request": "launch",
"mode": "test",
"program": "${workspaceFolder}/tests/e2e",
"args": ["-test.v",
"--ginkgo.v",
"-kubeconfig","~/.kube/config",
"-chart","./../../deployment/",
"-image-repository","nvidia/dcgm-exporter",
"-arguments","{-f=/etc/dcgm-exporter/default-counters.csv,--enable-dcgm-log=true,--dcgm-log-level=ERROR}"],
"args": [
"-test.v",
"--ginkgo.v",
"-kubeconfig",
"~/.kube/config",
"-chart",
"./../../deployment/",
"-image-repository",
"nvidia/dcgm-exporter",
"-arguments",
"{-f=/etc/dcgm-exporter/default-counters.csv}"
],
"env": {},
"buildFlags": "-tags=e2e"
},
Expand All @@ -30,8 +36,8 @@
"-f",
"./etc/default-counters.csv",
"--debug",
"--enable-dcgm-log",
"--dcgm-log-level=INFO"
"-r",
"localhost:5555"
]
}
]
Expand Down
64 changes: 0 additions & 64 deletions Jenkinsfile

This file was deleted.

76 changes: 62 additions & 14 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,30 +18,29 @@ REGISTRY ?= nvidia
GO ?= go
MKDIR ?= mkdir
GOLANGCILINT_TIMEOUT ?= 10m
IMAGE_TAG ?= ""

DCGM_VERSION := $(NEW_DCGM_VERSION)
GOLANG_VERSION := 1.22.5
GOLANG_VERSION := 1.22.9
VERSION := $(NEW_EXPORTER_VERSION)
FULL_VERSION := $(DCGM_VERSION)-$(VERSION)
OUTPUT := type=oci,dest=/dev/null
PLATFORMS := linux/amd64,linux/arm64
DOCKERCMD := docker buildx build
DOCKERCMD := docker --debug buildx build
MODULE := github.com/NVIDIA/dcgm-exporter


.PHONY: all binary install check-format local
all: update-version ubuntu22.04 ubi9

binary: generate update-version
binary: update-version
cd cmd/dcgm-exporter; $(GO) build -ldflags "-X main.BuildVersion=${DCGM_VERSION}-${VERSION}"

test-main:
test-main: generate
$(GO) test ./... -short

install: binary
install -m 755 cmd/dcgm-exporter/dcgm-exporter /usr/bin/dcgm-exporter
install -m 644 -D ./etc/default-counters.csv /etc/dcgm-exporter/default-counters.csv
install -m 644 -D ./etc/dcp-metrics-included.csv /etc/dcgm-exporter/dcp-metrics-included.csv

check-format:
test $$(gofmt -l pkg | tee /dev/stderr | wc -l) -eq 0
Expand All @@ -58,23 +57,71 @@ else
$(MAKE) PLATFORMS=linux/amd64 OUTPUT=type=docker DOCKERCMD='docker build'
endif

TARGETS = ubuntu22.04 ubi9
ubi%: DOCKERFILE = docker/Dockerfile.ubi
ubi%: --docker-build-%
@
ubi9: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubi9
ubi9: IMAGE_TAG = ubi9

ubuntu%: DOCKERFILE = docker/Dockerfile.ubuntu
ubuntu%: --docker-build-%
@
ubuntu22.04: BASE_IMAGE = nvcr.io/nvidia/cuda:12.6.3-base-ubuntu22.04
ubuntu22.04: IMAGE_TAG = ubuntu22.04

DOCKERFILE.ubuntu22.04 = docker/Dockerfile.ubuntu22.04
DOCKERFILE.ubi9 = docker/Dockerfile.ubi9

$(TARGETS):
--docker-build-%:
@echo "Building for $@"
DOCKER_BUILDKIT=1 \
$(DOCKERCMD) --pull \
--output $(OUTPUT) \
--progress=plain \
--platform $(PLATFORMS) \
--build-arg BASEIMAGE="$(BASE_IMAGE)" \
--build-arg "GOLANG_VERSION=$(GOLANG_VERSION)" \
--build-arg "DCGM_VERSION=$(DCGM_VERSION)" \
--build-arg "VERSION=$(VERSION)" \
--tag "$(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$@" \
--file $(DOCKERFILE.$@) .
--tag $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)$(if $(IMAGE_TAG),-$(IMAGE_TAG)) \
--file $(DOCKERFILE) .

# Packaging targets: build the ubuntu22.04 image for one platform, copy the
# exporter binary and its configuration out of the image, and archive them
# together with the LICENSE and systemd unit as dist/<DIST_NAME>.tar.gz.
.PHONY: packages package-arm64 package-amd64 package-build

# Build the distributable tarballs for every supported architecture.
packages: package-amd64 package-arm64

package-arm64:
	$(MAKE) package-build PLATFORMS=linux/arm64

package-amd64:
	$(MAKE) package-build PLATFORMS=linux/amd64

# package-build expects PLATFORMS to name exactly one platform (os/arch); the
# architecture is taken from the second '/'-separated field and mapped to the
# name used in the archive: amd64 -> x86-64, arm64 -> sbsa.
# The whole recipe is a single shell invocation so that ARCH, DIST_NAME,
# COMPONENT_NAME and the temporary container id ($$I) stay in scope throughout.
package-build: IMAGE_TAG = ubuntu22.04
package-build:
	ARCH=`echo $(PLATFORMS) | cut -d'/' -f2`; \
	if [ "$$ARCH" = "amd64" ]; then \
		ARCH="x86-64"; \
	fi; \
	if [ "$$ARCH" = "arm64" ]; then \
		ARCH="sbsa"; \
	fi; \
	export DIST_NAME="dcgm_exporter-linux-$$ARCH-$(VERSION)"; \
	export COMPONENT_NAME="dcgm_exporter"; \
	$(MAKE) ubuntu22.04 OUTPUT=type=docker PLATFORMS=$(PLATFORMS) && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/dcgm-exporter && \
	I=`docker create $(REGISTRY)/dcgm-exporter:$(FULL_VERSION)-$(IMAGE_TAG)` && \
	docker cp $$I:/usr/bin/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/usr/bin/ && \
	docker cp $$I:/etc/dcgm-exporter /tmp/$$DIST_NAME/$$COMPONENT_NAME/etc/ && \
	cp ./LICENSE /tmp/$$DIST_NAME/$$COMPONENT_NAME && \
	$(MKDIR) -p /tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/ && \
	cp ./packaging/config-files/systemd/nvidia-dcgm-exporter.service \
		/tmp/$$DIST_NAME/$$COMPONENT_NAME/lib/systemd/system/nvidia-dcgm-exporter.service && \
	docker rm -f $$I && \
	$(MKDIR) -p $(CURDIR)/dist && \
	cd "/tmp/$$DIST_NAME" && tar -czf $(CURDIR)/dist/$$DIST_NAME.tar.gz `ls -A` && \
	rm -rf "/tmp/$$DIST_NAME";

# Declare the integration-test rule phony under the name it is actually
# defined with (test-integration); 'integration' is kept for compatibility.
.PHONY: integration test-integration
test-integration:
test-integration: generate
go test -race -count=1 -timeout 5m -v $(TEST_ARGS) ./tests/integration/

test-coverage:
Expand All @@ -83,7 +130,7 @@ test-coverage:

.PHONY: lint
lint:
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1 --fix
golangci-lint run ./... --timeout $(GOLANGCILINT_TIMEOUT) --new-from-rev=HEAD~1

.PHONY: validate-modules
validate-modules:
Expand All @@ -99,6 +146,7 @@ tools: ## Install required tools and utilities
go install github.com/axw/gocov/gocov@latest
go install golang.org/x/tools/cmd/goimports@latest
go install mvdan.cc/gofumpt@latest
go install github.com/wadey/gocovmerge@latest

fmt:
find . -name '*.go' | xargs gofumpt -l -w
Expand Down
5 changes: 3 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ Official documentation for DCGM-Exporter can be found on [docs.nvidia.com](https
To gather metrics on a GPU node, simply start the `dcgm-exporter` container:

```shell
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:3.3.9-3.6.1-ubuntu22.04
docker run -d --gpus all --cap-add SYS_ADMIN --rm -p 9400:9400 nvcr.io/nvidia/k8s/dcgm-exporter:4.0.0-4.0.0-ubuntu22.04
curl localhost:9400/metrics
# HELP DCGM_FI_DEV_SM_CLOCK SM clock frequency (in MHz).
# TYPE DCGM_FI_DEV_SM_CLOCK gauge
Expand Down Expand Up @@ -111,8 +111,9 @@ To enable GPU-to-job mapping on the DCGM-exporter side, users must run the DCGM-

In order to build dcgm-exporter ensure you have the following:

* [Golang >= 1.21 installed](https://golang.org/)
* [Golang >= 1.22 installed](https://golang.org/)
* [DCGM installed](https://developer.nvidia.com/dcgm)
* A Linux machine with a DCGM-compatible GPU.

```shell
git clone https://github.com/NVIDIA/dcgm-exporter.git
Expand Down
Loading

0 comments on commit 5f9250c

Please sign in to comment.