Merge pull request #876 from ArangoGutierrez/reg_test02

Add remote-test option for E2E
NVIDIA · Feb 4, 2025 · 78d6cdc · 78d6cdc
2 parents df4c87b + 6164059
commit 78d6cdc
Show file tree

Hide file tree

Showing 111 changed files with 17,890 additions and 43 deletions.
diff --git a/tests/e2e/Makefile b/tests/e2e/Makefile
@@ -18,9 +18,28 @@ include $(CURDIR)/versions.mk
 
 E2E_RUNTIME ?= docker
 
+# Set to true to have the suite install the NVIDIA Container Toolkit before
+# the docker tests run (forwarded to the test binary as -install-ctk).
+E2E_INSTALL_CTK ?= false
+
+# Toolkit image forwarded as -toolkit-image. DIST falls back to ubuntu20.04
+# when it has not been set.
+ifeq ($(DIST),)
+DIST ?= ubuntu20.04
+endif
+IMAGE_TAG ?= $(VERSION)-$(DIST)
+IMAGE = $(IMAGE_NAME):$(IMAGE_TAG)
+
+# SSH settings forwarded to the test binary as -ssh-key, -ssh-user,
+# -remote-host and -ssh-port.
+E2E_SSH_KEY ?=
+E2E_SSH_USER ?=
+E2E_SSH_HOST ?=
+E2E_SSH_PORT ?= 22
+
 .PHONY: test
 test:
 	cd $(CURDIR)/tests/e2e && $(GO_CMD) test -v . -args \
 		-ginkgo.focus="$(E2E_RUNTIME)" \
 		-test.timeout=1h \
-		-ginkgo.v
+		-ginkgo.v \
+		-install-ctk=$(E2E_INSTALL_CTK) \
+		-toolkit-image=$(IMAGE) \
+		-ssh-key=$(E2E_SSH_KEY) \
+		-ssh-user=$(E2E_SSH_USER) \
+		-remote-host=$(E2E_SSH_HOST) \
+		-ssh-port=$(E2E_SSH_PORT)
diff --git a/tests/e2e/e2e_test.go b/tests/e2e/e2e_test.go
@@ -17,10 +17,8 @@
 package e2e
 
 import (
-	"bytes"
 	"context"
-	"fmt"
-	"os/exec"
+	"flag"
 	"testing"
 
 	. "github.com/onsi/ginkgo/v2"
@@ -30,8 +28,26 @@ import (
 // Test context
 var (
 	ctx context.Context
+
+	installCTK bool // -install-ctk: when true, the docker suite runs the toolkit installer in its BeforeAll
+
+	image string // -toolkit-image: image handed to the toolkit installer
+
+	sshKey  string // -ssh-key: passed to the runner via WithSshKey
+	sshUser string // -ssh-user: passed to the runner via WithSshUser
+	host    string // -remote-host: passed to the runner via WithHost
+	sshPort string // -ssh-port (default "22"): passed to the runner via WithPort
 )
 
+// init registers the suite's command-line options on the default flag set so
+// that they can be supplied after `go test ... -args`.
+func init() {
+	flag.BoolVar(&installCTK, "install-ctk", false, "Install the NVIDIA Container Toolkit")
+	flag.StringVar(&image, "toolkit-image", "", "Repository of the image to test")
+	flag.StringVar(&sshKey, "ssh-key", "", "SSH key to use for remote login")
+	flag.StringVar(&sshUser, "ssh-user", "", "SSH user to use for remote login")
+	flag.StringVar(&host, "remote-host", "", "Hostname of the remote machine")
+	flag.StringVar(&sshPort, "ssh-port", "22", "SSH port to use for remote login")
+	// remote-port is an alias of ssh-port that pairs with remote-host. An
+	// unregistered spelling makes flag parsing abort the whole test binary
+	// with "flag provided but not defined", so both names are accepted.
+	flag.StringVar(&sshPort, "remote-port", "22", "SSH port to use for remote login (alias of -ssh-port)")
+}
+
 func TestMain(t *testing.T) {
 	suiteName := "NVIDIA Container Toolkit E2E"
 
@@ -45,25 +61,3 @@ func TestMain(t *testing.T) {
 var _ = BeforeSuite(func() {
 	ctx = context.Background() // store a fresh root context in the package-level ctx
 })
-
-func runScript(script string) (string, error) {
-	// Create a command to run the script using bash
-	cmd := exec.Command("bash", "-c", script)
-
-	// Buffer to capture standard output
-	var stdout bytes.Buffer
-	cmd.Stdout = &stdout
-
-	// Buffer to capture standard error
-	var stderr bytes.Buffer
-	cmd.Stderr = &stderr
-
-	// Run the command
-	err := cmd.Run()
-	if err != nil {
-		return "", fmt.Errorf("script execution failed: %v\nSTDOUT: %s\nSTDERR: %s", err, stdout.String(), stderr.String())
-	}
-
-	// Return the captured stdout and nil error
-	return stdout.String(), nil
-}
diff --git a/tests/e2e/installer.go b/tests/e2e/installer.go
@@ -0,0 +1,118 @@
+/*
+* Copyright (c) 2025, NVIDIA CORPORATION.  All rights reserved.
+*
+* Licensed under the Apache License, Version 2.0 (the "License");
+* you may not use this file except in compliance with the License.
+* You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+ */
+
+package e2e
+
+import (
+	"bytes"
+	"fmt"
+	"text/template"
+)
+
+// dockerInstallTemplate is the text/template source of a bash script that symlinks /usr/bin/nvidia-container-runtime-hook into a
+// fresh /tmp/ctk_e2e.* directory and then runs the {{.Image}} container (privileged, --runtime=docker) with --root set to that directory.
+var dockerInstallTemplate = `
+#! /usr/bin/env bash
+set -xe
+
+: ${IMAGE:={{.Image}}}
+
+# Create a temporary directory
+TEMP_DIR="/tmp/ctk_e2e.$(date +%s)_$RANDOM"
+mkdir -p "$TEMP_DIR"
+
+# Given that docker has an init function that checks for the existence of the
+# nvidia-container-toolkit, we need to create a symlink to the nvidia-container-runtime-hook
+# in the /usr/bin directory.
+# See https://github.com/moby/moby/blob/20a05dabf44934447d1a66cdd616cc803b81d4e2/daemon/nvidia_linux.go#L32-L46
+sudo rm -f /usr/bin/nvidia-container-runtime-hook
+sudo ln -s "$TEMP_DIR/toolkit/nvidia-container-runtime-hook" /usr/bin/nvidia-container-runtime-hook
+
+docker run --pid=host --rm -i --privileged	\
+	-v /:/host	\
+	-v /var/run/docker.sock:/var/run/docker.sock	\
+	-v "$TEMP_DIR:$TEMP_DIR"	\
+	-v /etc/docker:/config-root	\
+	${IMAGE}	\
+	--root "$TEMP_DIR"	\
+	--runtime=docker	\
+	--config=/config-root/daemon.json	\
+	--driver-root=/	\
+	--no-daemon	\
+	--restart-mode=systemd
+`
+
+// ToolkitInstaller renders an install script from a template and hands the
+// rendered script to a Runner.
+type ToolkitInstaller struct {
+	// runner receives the rendered script.
+	runner Runner
+	// template is the text/template source of the script.
+	template string
+
+	// Image is exported so the template can read it as {{.Image}}.
+	Image string
+}
+
+// installerOption adjusts a ToolkitInstaller while it is being constructed.
+type installerOption func(*ToolkitInstaller)
+
+// WithRunner sets the Runner that the installer hands its script to.
+func WithRunner(r Runner) installerOption {
+	return func(inst *ToolkitInstaller) { inst.runner = r }
+}
+
+// WithImage sets the image that the template sees as {{.Image}}.
+func WithImage(image string) installerOption {
+	return func(inst *ToolkitInstaller) { inst.Image = image }
+}
+
+// WithTemplate replaces the script template used by the installer.
+func WithTemplate(template string) installerOption {
+	return func(inst *ToolkitInstaller) { inst.template = template }
+}
+
+// NewToolkitInstaller returns a ToolkitInstaller that starts out with
+// localRunner and dockerInstallTemplate and then has opts applied in order.
+// An error is returned when no image has been set once all options have run.
+func NewToolkitInstaller(opts ...installerOption) (*ToolkitInstaller, error) {
+	installer := new(ToolkitInstaller)
+	installer.runner = localRunner{}
+	installer.template = dockerInstallTemplate
+
+	for idx := range opts {
+		opts[idx](installer)
+	}
+
+	if len(installer.Image) == 0 {
+		return nil, fmt.Errorf("image is required")
+	}
+	return installer, nil
+}
+
+// Install renders the installer's template with the installer itself as the
+// template data (so {{.Image}} resolves to i.Image) and passes the rendered
+// script to the configured runner.
+//
+// It returns an error when the template cannot be parsed or executed, or when
+// the runner reports a failure. Every error is wrapped with the failing step.
+func (i *ToolkitInstaller) Install() error {
+	tmpl, err := template.New("installScript").Parse(i.template)
+	if err != nil {
+		return fmt.Errorf("error parsing template: %w", err)
+	}
+
+	var renderedScript bytes.Buffer
+	if err := tmpl.Execute(&renderedScript, i); err != nil {
+		return fmt.Errorf("error executing template: %w", err)
+	}
+
+	// Only the error is inspected; the runner's other two return values are
+	// intentionally discarded.
+	if _, _, err := i.runner.Run(renderedScript.String()); err != nil {
+		return fmt.Errorf("error running install script: %w", err)
+	}
+	return nil
+}
diff --git a/tests/e2e/nvidia-container-toolkit_test.go b/tests/e2e/nvidia-container-toolkit_test.go
@@ -24,7 +24,29 @@ import (
 )
 
 // Integration tests for Docker runtime
-var _ = Describe("docker", func() {
+var _ = Describe("docker", Ordered, func() {
+	var r Runner
+
+	// Install the NVIDIA Container Toolkit
+	BeforeAll(func(ctx context.Context) {
+		r = NewRunner(
+			WithHost(host),
+			WithPort(sshPort),
+			WithSshKey(sshKey),
+			WithSshUser(sshUser),
+		)
+		if installCTK {
+			installer, err := NewToolkitInstaller(
+				WithRunner(r),
+				WithImage(image),
+				WithTemplate(dockerInstallTemplate),
+			)
+			Expect(err).ToNot(HaveOccurred())
+			err = installer.Install()
+			Expect(err).ToNot(HaveOccurred())
+		}
+	})
+
 	// GPUs are accessible in a container: Running nvidia-smi -L inside the
 	// container shows the same output inside the container as outside the
 	// container. This means that the following commands must all produce
@@ -33,33 +55,33 @@ var _ = Describe("docker", func() {
 		var hostOutput string
 
 		BeforeAll(func(ctx context.Context) {
-			_, err := runScript("docker pull ubuntu")
+			_, _, err := r.Run("docker pull ubuntu")
 			Expect(err).ToNot(HaveOccurred())
 
-			hostOutput, err = runScript("nvidia-smi -L")
+			hostOutput, _, err = r.Run("nvidia-smi -L")
 			Expect(err).ToNot(HaveOccurred())
 		})
 
 		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
-			containerOutput, err := runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all ubuntu nvidia-smi -L")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(containerOutput).To(Equal(hostOutput))
 		})
 
 		It("should support automatic CDI spec generation", func(ctx context.Context) {
-			containerOutput, err := runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all ubuntu nvidia-smi -L")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(containerOutput).To(Equal(hostOutput))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
-			containerOutput, err := runScript("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
+			containerOutput, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all ubuntu nvidia-smi -L")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(containerOutput).To(Equal(hostOutput))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
-			containerOutput, err := runScript("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
+			containerOutput, _, err := r.Run("docker run --rm -i --gpus all ubuntu nvidia-smi -L")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(containerOutput).To(Equal(hostOutput))
 		})
@@ -69,34 +91,34 @@ var _ = Describe("docker", func() {
 	// The following should all produce the same result.
 	When("Running the cuda-vectorAdd sample", Ordered, func() {
 		BeforeAll(func(ctx context.Context) {
-			_, err := runScript("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
+			_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 		})
 
 		var referenceOutput string
 
 		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
 			var err error
-			referenceOutput, err = runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
+			referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 
 			Expect(referenceOutput).To(ContainSubstring("Test PASSED"))
 		})
 
 		It("should support automatic CDI spec generation", func(ctx context.Context) {
-			out2, err := runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
+			out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out2))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
-			out3, err := runScript("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
+			out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out3))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
-			out4, err := runScript("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
+			out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out4))
 		})
@@ -106,34 +128,34 @@ var _ = Describe("docker", func() {
 	// The following should all produce the same result.
 	When("Running the cuda-deviceQuery sample", Ordered, func() {
 		BeforeAll(func(ctx context.Context) {
-			_, err := runScript("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
+			_, _, err := r.Run("docker pull nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 		})
 
 		var referenceOutput string
 
 		It("should support NVIDIA_VISIBLE_DEVICES", func(ctx context.Context) {
 			var err error
-			referenceOutput, err = runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
+			referenceOutput, _, err = r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 
 			Expect(referenceOutput).To(ContainSubstring("Result = PASS"))
 		})
 
 		It("should support automatic CDI spec generation", func(ctx context.Context) {
-			out2, err := runScript("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
+			out2, _, err := r.Run("docker run --rm -i --runtime=nvidia -e NVIDIA_VISIBLE_DEVICES=runtime.nvidia.com/gpu=all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out2))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime", func(ctx context.Context) {
-			out3, err := runScript("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
+			out3, _, err := r.Run("docker run --rm -i --runtime=nvidia --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out3))
 		})
 
 		It("should support the --gpus flag using the nvidia-container-runtime-hook", func(ctx context.Context) {
-			out4, err := runScript("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
+			out4, _, err := r.Run("docker run --rm -i --gpus all nvcr.io/nvidia/k8s/cuda-sample:devicequery-cuda12.5.0")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(referenceOutput).To(Equal(out4))
 		})