diff --git a/Makefile b/Makefile
index 3a5b6f11..20a8ccbe 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
 BUILD_DIR=$(shell pwd)/bin
 COMPONENTS?=device-plugin status-updater kwok-gpu-device-plugin status-exporter topology-server mig-faker jupyter-notebook
 
-DOCKER_REPO_BASE=gcr.io/run-ai-lab/fake-gpu-operator
+DOCKER_REPO_BASE?=gcr.io/run-ai-lab/fake-gpu-operator
 DOCKER_TAG?=0.0.0-dev
 
 NAMESPACE=gpu-operator
diff --git a/cmd/nvidia-smi/main.go b/cmd/nvidia-smi/main.go
index 35813558..d8553b62 100644
--- a/cmd/nvidia-smi/main.go
+++ b/cmd/nvidia-smi/main.go
@@ -40,11 +40,14 @@ func main() {
 		fmt.Println("Debug mode enabled")
 	}
 
-	err := os.Setenv(constants.EnvTopologyCmNamespace, "gpu-operator")
-	if err != nil {
-		panic(err)
+	if _, ok := os.LookupEnv(constants.EnvTopologyCmNamespace); !ok {
+		err := os.Setenv(constants.EnvTopologyCmNamespace, "gpu-operator")
+		if err != nil {
+			panic(err)
+		}
 	}
-	err = os.Setenv(constants.EnvTopologyCmName, "topology")
+
+	err := os.Setenv(constants.EnvTopologyCmName, "topology")
 	if err != nil {
 		panic(err)
 	}
@@ -66,7 +69,8 @@ func getNvidiaSmiArgs() (args nvidiaSmiArgs) {
 	}
 
 	// Send http request to topology-server to get the topology
-	topologyUrl := "http://topology-server.gpu-operator/topology/nodes/" + nodeName
+	topologyUrl := fmt.Sprintf("http://topology-server.%s/topology/nodes/%s",
+		os.Getenv(constants.EnvTopologyCmNamespace), nodeName)
 	if conf.Debug {
 		fmt.Printf("Requesting topology from: %s\n", topologyUrl)
 	}
diff --git a/deploy/fake-gpu-operator/templates/runtime-class.yml b/deploy/fake-gpu-operator/templates/runtime-class.yml
index 5a36705d..2ec80672 100644
--- a/deploy/fake-gpu-operator/templates/runtime-class.yml
+++ b/deploy/fake-gpu-operator/templates/runtime-class.yml
@@ -1,5 +1,5 @@
 apiVersion: node.k8s.io/v1
 kind: RuntimeClass
 metadata:
-  name: nvidia
+  name: {{ .Values.runtimeClass.name | default "fake-nvidia" }}
 handler: runc
diff --git a/deploy/fake-gpu-operator/values.yaml b/deploy/fake-gpu-operator/values.yaml
index 730617c9..2e85858c 100644
--- a/deploy/fake-gpu-operator/values.yaml
+++ b/deploy/fake-gpu-operator/values.yaml
@@ -92,3 +92,6 @@ topology:
       gpuMemory: 11441
   nodePoolLabelKey: run.ai/simulated-gpu-node-pool
   migStrategy: mixed
+
+runtimeClass:
+  name: fake-nvidia
diff --git a/internal/deviceplugin/real_node.go b/internal/deviceplugin/real_node.go
index 95c43b1d..001df638 100644
--- a/internal/deviceplugin/real_node.go
+++ b/internal/deviceplugin/real_node.go
@@ -10,11 +10,13 @@ import (
 	"time"
 
 	"github.com/google/uuid"
-	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
 	"golang.org/x/net/context"
 	"google.golang.org/grpc"
 	"google.golang.org/grpc/credentials/insecure"
 	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
 )
 
 const (
@@ -170,11 +172,17 @@ func (m *RealNodeDevicePlugin) GetPreferredAllocation(context.Context, *pluginap
 }
 
 func (m *RealNodeDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
+	ns := os.Getenv(constants.EnvTopologyCmNamespace)
+	if ns == "" {
+		ns = "gpu-operator"
+	}
+
	responses := pluginapi.AllocateResponse{}
 	for _, req := range reqs.ContainerRequests {
 		response := pluginapi.ContainerAllocateResponse{
 			Envs: map[string]string{
-				"MOCK_NVIDIA_VISIBLE_DEVICES": strings.Join(req.DevicesIDs, ","),
+				"MOCK_NVIDIA_VISIBLE_DEVICES":    strings.Join(req.DevicesIDs, ","),
+				constants.EnvTopologyCmNamespace: ns,
 			},
 			Mounts: []*pluginapi.Mount{
 				{
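
Note (outside the diff): with this change, both cmd/nvidia-smi and the device plugin resolve the topology-server namespace from constants.EnvTopologyCmNamespace and fall back to "gpu-operator" when it is unset. A minimal Go sketch of that shared lookup, assuming it lives inside this module; the helper names topologyNamespace and topologyNodeURL are hypothetical and not part of the patch:

package deviceplugin

import (
	"fmt"
	"os"

	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
)

// topologyNamespace prefers the namespace passed via the environment
// (constants.EnvTopologyCmNamespace) and falls back to the chart default.
func topologyNamespace() string {
	if ns, ok := os.LookupEnv(constants.EnvTopologyCmNamespace); ok && ns != "" {
		return ns
	}
	return "gpu-operator"
}

// topologyNodeURL mirrors the per-node topology endpoint built in
// cmd/nvidia-smi/main.go above.
func topologyNodeURL(nodeName string) string {
	return fmt.Sprintf("http://topology-server.%s/topology/nodes/%s", topologyNamespace(), nodeName)
}

The new runtimeClass.name value introduced in values.yaml can likewise be overridden at install time, e.g. via helm's --set runtimeClass.name=<name>.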