Merge branch 'update-container-deployment' into 'main'
Update the container deployment to use the nvidia-vgpu-dm cli

See merge request nvidia/cloud-native/vgpu-device-manager!8
cdesiniotis committed Aug 30, 2022
2 parents f6a6a74 + 8bc451f commit 0efd7cf
Showing 3 changed files with 62 additions and 322 deletions.
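Note: the component built from this repository reconfigures the vGPU devices on a node whenever the nvidia.com/vgpu.config node label changes (see the label constants and the watch loop in the main.go diff below). As an illustrative sketch only, with the node name and configuration name as placeholders, a node's configuration is typically selected by setting that label:

$ kubectl label node <node-name> nvidia.com/vgpu.config=<config-name> --overwrite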
9 changes: 6 additions & 3 deletions deployments/container/Dockerfile.ubi8
@@ -21,13 +21,16 @@ FROM golang:${GOLANG_VERSION} AS build
 
 WORKDIR /build
 COPY . .
-RUN GOOS=linux go build -o /artifacts/vgpu-device-manager .
+RUN GOOS=linux go build -o /artifacts/nvidia-vgpu-dm ./cmd/nvidia-vgpu-dm
+RUN GOOS=linux go build -o /artifacts/k8s-nvidia-vgpu-dm ./deployments/container/
+
 
 FROM nvcr.io/nvidia/cuda:${CUDA_VERSION}-base-${BASE_DIST}
 
 ENV NVIDIA_VISIBLE_DEVICES=void
 
-COPY --from=build /artifacts/vgpu-device-manager /usr/bin/vgpu-device-manager
+COPY --from=build /artifacts/nvidia-vgpu-dm /usr/bin/nvidia-vgpu-dm
+COPY --from=build /artifacts/k8s-nvidia-vgpu-dm /usr/bin/k8s-nvidia-vgpu-dm
 
 LABEL version="${VERSION}"
 LABEL release="N/A"
@@ -45,4 +48,4 @@ RUN if [ -n "${CVE_UPDATES}" ]; then \
         rm -rf /var/cache/yum/*; \
     fi
 
-ENTRYPOINT ["vgpu-device-manager"]
+ENTRYPOINT ["k8s-nvidia-vgpu-dm"]
70 changes: 56 additions & 14 deletions main.go → deployments/container/main.go
@@ -24,9 +24,9 @@ import (
     "k8s.io/client-go/tools/cache"
     "k8s.io/client-go/tools/clientcmd"
     "os"
+    "os/exec"
 
     "context"
-    dm "gitlab.com/nvidia/cloud-native/vgpu-device-manager/pkg/devicemanager"
     corev1 "k8s.io/api/core/v1"
     apierrors "k8s.io/apimachinery/pkg/api/errors"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -37,6 +37,7 @@ import (
 )
 
 const (
+    cliName = "nvidia-vgpu-dm"
     resourceNodes = "nodes"
     vGPUConfigLabel = "nvidia.com/vgpu.config"
     vGPUConfigStateLabel = "nvidia.com/vgpu.config.state"
@@ -179,11 +180,6 @@ func start(c *cli.Context) error {
         return fmt.Errorf("error building kubernetes clientset from config: %s", err)
     }
 
-    m, err := dm.NewVGPUDeviceManager(configFileFlag)
-    if err != nil {
-        return fmt.Errorf("error creating new VGPUDeviceManager: %v", err)
-    }
-
     vGPUConfig := NewSyncableVGPUConfig()
 
     stop := continuouslySyncVGPUConfigChanges(clientset, vGPUConfig)
@@ -204,7 +200,7 @@ }
     }
 
     log.Infof("Updating to vGPU config: %s", selectedConfig)
-    err = updateConfig(clientset, m, selectedConfig)
+    err = updateConfig(clientset, selectedConfig)
     if err != nil {
         log.Errorf("ERROR: %v", err)
     } else {
@@ -216,7 +212,7 @@ }
         log.Infof("Waiting for change to '%s' label", vGPUConfigLabel)
         value := vGPUConfig.Get()
         log.Infof("Updating to vGPU config: %s", value)
-        err = updateConfig(clientset, m, value)
+        err = updateConfig(clientset, value)
         if err != nil {
             log.Errorf("ERROR: %v", err)
             continue
@@ -254,17 +250,24 @@ func continuouslySyncVGPUConfigChanges(clientset *kubernetes.Clientset, vGPUConf
     return stop
 }
 
-func updateConfig(clientset *kubernetes.Clientset, m *dm.VGPUDeviceManager, selectedConfig string) error {
+func updateConfig(clientset *kubernetes.Clientset, selectedConfig string) error {
     defer setVGPUConfigStateLabel(clientset)
     vGPUConfigState = "failed"
 
     log.Info("Asserting that the requested configuration is present in the configuration file")
-    ok := m.AssertValidConfig(selectedConfig)
-    if !ok {
-        return fmt.Errorf("%s is not a valid config", selectedConfig)
+    err := assertValidConfig(selectedConfig)
+    if err != nil {
+        return fmt.Errorf("Unable to validate the selected vGPU configuration")
     }
 
+    log.Info("Checking if the selected vGPU device configuration is currently applied or not")
+    err = assertConfig(selectedConfig)
+    if err == nil {
+        vGPUConfigState = "success"
+        return nil
+    }
+
-    err := getNodeStateLabels(clientset)
+    err = getNodeStateLabels(clientset)
     if err != nil {
         return fmt.Errorf("unable to get node state labels: %v", err)
     }
@@ -281,7 +284,8 @@ func updateConfig(clientset *kubernetes.Clientset, m *dm.VGPUDeviceManager, sele
         return fmt.Errorf("unable to shutdown gpu operands: %v", err)
     }
 
-    err = m.ApplyConfig(selectedConfig)
+    log.Info("Applying the selected vGPU device configuration to the node")
+    err = applyConfig(selectedConfig)
     if err != nil {
         return fmt.Errorf("unable to apply config '%s': %v", selectedConfig, err)
     }
@@ -296,6 +300,44 @@ func updateConfig(clientset *kubernetes.Clientset, m *dm.VGPUDeviceManager, sele
     return nil
 }
 
+func assertValidConfig(config string) error {
+    args := []string{
+        "assert",
+        "--valid-config",
+        "-f", configFileFlag,
+        "-c", config,
+    }
+    cmd := exec.Command(cliName, args...)
+    cmd.Stdout = os.Stdout
+    cmd.Stderr = os.Stderr
+    return cmd.Run()
+}
+
+func assertConfig(config string) error {
+    args := []string{
+        "assert",
+        "-f", configFileFlag,
+        "-c", config,
+    }
+    cmd := exec.Command(cliName, args...)
+    cmd.Stdout = os.Stdout
+    cmd.Stderr = os.Stderr
+    return cmd.Run()
+}
+
+func applyConfig(config string) error {
+    args := []string{
+        "-d",
+        "apply",
+        "-f", configFileFlag,
+        "-c", config,
+    }
+    cmd := exec.Command(cliName, args...)
+    cmd.Stdout = os.Stdout
+    cmd.Stderr = os.Stderr
+    return cmd.Run()
+}
+
 func getNodeStateLabels(clientset *kubernetes.Clientset) error {
     node, err := clientset.CoreV1().Nodes().Get(context.TODO(), nodeNameFlag, metav1.GetOptions{})
     if err != nil {
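The helpers added above (assertValidConfig, assertConfig, applyConfig) shell out to the nvidia-vgpu-dm binary rather than importing the device-manager package directly. Reconstructed from the argument slices in the diff, and with the config file path and config name as placeholders, the equivalent command-line invocations look roughly like this:

$ nvidia-vgpu-dm assert --valid-config -f <config-file> -c <config-name>   # is the requested config present in the config file?
$ nvidia-vgpu-dm assert -f <config-file> -c <config-name>                  # is the selected config already applied on the node?
$ nvidia-vgpu-dm -d apply -f <config-file> -c <config-name>                # apply the selected config to the node

The comments mirror the log messages in updateConfig; the -d flag is simply passed ahead of the apply subcommand in applyConfig, and its exact meaning is not stated in this diff.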