Skip to content

Commit

Permalink
Add DRA driver for IMEX
Browse files Browse the repository at this point in the history
Signed-off-by: Christopher Desiniotis <[email protected]>
  • Loading branch information
cdesiniotis committed Dec 3, 2024
1 parent fe9595a commit 2f5127f
Show file tree
Hide file tree
Showing 18 changed files with 922 additions and 2 deletions.
68 changes: 68 additions & 0 deletions api/nvidia/v1/clusterpolicy_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@ type ClusterPolicySpec struct {
Toolkit ToolkitSpec `json:"toolkit"`
// DevicePlugin component spec
DevicePlugin DevicePluginSpec `json:"devicePlugin"`
// DRADriver component spec
DRADriver DRADriverSpec `json:"draDriver"`
// DCGMExporter spec
DCGMExporter DCGMExporterSpec `json:"dcgmExporter"`
// DCGM component spec
Expand Down Expand Up @@ -841,6 +843,60 @@ type SandboxDevicePluginSpec struct {
Env []EnvVar `json:"env,omitempty"`
}

// DRADriverSpec defines the properties for the NVIDIA DRA Driver deployment.
// Repository, Image and Version are the three image components that ImagePath
// passes to imagePath for this spec type.
// TODO: add 'controller' and 'kubeletPlugin' structs to allow for per-component configuration
type DRADriverSpec struct {
// Enabled indicates if the deployment of NVIDIA DRA Driver through the operator is enabled.
// When left unset (nil), IsEnabled reports the driver as enabled.
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Enable NVIDIA DRA Driver deployment through GPU Operator"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:booleanSwitch"
Enabled *bool `json:"enabled,omitempty"`

// NVIDIA DRA Driver image repository
// +kubebuilder:validation:Optional
Repository string `json:"repository,omitempty"`

// NVIDIA DRA Driver image name (letters, digits and '-' per the validation pattern)
// +kubebuilder:validation:Pattern=[a-zA-Z0-9\-]+
Image string `json:"image,omitempty"`

// NVIDIA DRA Driver image tag
// +kubebuilder:validation:Optional
Version string `json:"version,omitempty"`

// Image pull policy
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image Pull Policy"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:imagePullPolicy"
ImagePullPolicy string `json:"imagePullPolicy,omitempty"`

// Image pull secrets
// +kubebuilder:validation:Optional
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Image pull secrets"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:io.kubernetes:Secret"
ImagePullSecrets []string `json:"imagePullSecrets,omitempty"`

// Optional: Define resources requests and limits for each pod
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Resource Requirements"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:resourceRequirements"
Resources *ResourceRequirements `json:"resources,omitempty"`

// Optional: List of arguments
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Arguments"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Args []string `json:"args,omitempty"`

// Optional: List of environment variables
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors=true
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.displayName="Environment Variables"
// +operator-sdk:gen-csv:customresourcedefinitions.specDescriptors.x-descriptors="urn:alm:descriptor:com.tectonic.ui:advanced,urn:alm:descriptor:com.tectonic.ui:text"
Env []EnvVar `json:"env,omitempty"`
}

// DCGMExporterSpec defines the properties for NVIDIA DCGM Exporter deployment
type DCGMExporterSpec struct {
// Enabled indicates if deployment of NVIDIA DCGM Exporter through operator is enabled
Expand Down Expand Up @@ -1764,6 +1820,9 @@ func ImagePath(spec interface{}) (string, error) {
case *SandboxDevicePluginSpec:
config := spec.(*SandboxDevicePluginSpec)
return imagePath(config.Repository, config.Image, config.Version, "SANDBOX_DEVICE_PLUGIN_IMAGE")
case *DRADriverSpec:
config := spec.(*DRADriverSpec)
return imagePath(config.Repository, config.Image, config.Version, "DRA_DRIVER_IMAGE")
case *DCGMExporterSpec:
config := spec.(*DCGMExporterSpec)
return imagePath(config.Repository, config.Image, config.Version, "DCGM_EXPORTER_IMAGE")
Expand Down Expand Up @@ -1872,6 +1931,15 @@ func (p *DevicePluginSpec) IsEnabled() bool {
return *p.Enabled
}

// IsEnabled reports whether the DRA driver is to be deployed through
// gpu-operator. An explicit Enabled value is returned as-is; when the user
// left the field unset, the driver counts as enabled.
func (d *DRADriverSpec) IsEnabled() bool {
	if d.Enabled != nil {
		return *d.Enabled
	}
	// Nothing specified by the user: fall back to the default of true.
	return true
}

// IsEnabled returns true if dcgm-exporter is enabled(default) through gpu-operator
func (e *DCGMExporterSpec) IsEnabled() bool {
if e.Enabled == nil {
Expand Down
41 changes: 41 additions & 0 deletions api/nvidia/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 5 additions & 0 deletions assets/state-dra-driver/0100_service_account.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# ServiceAccount "nvidia-dra-driver".
# The namespace value below is a placeholder string; per its text it is
# expected to be replaced by the operator before the object is applied.
apiVersion: v1
kind: ServiceAccount
metadata:
name: nvidia-dra-driver
namespace: "FILLED BY THE OPERATOR"
15 changes: 15 additions & 0 deletions assets/state-dra-driver/0200_clusterrole.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# ClusterRole "nvidia-dra-driver".
# As written, the single rule grants every verb ('*') on every resource ('*')
# in four API groups: core (""), apps, resource.k8s.io and gpu.nvidia.com.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
name: nvidia-dra-driver
rules:
# TODO: restrict RBAC for DRA driver
- apiGroups:
- ""
- apps
- resource.k8s.io
- gpu.nvidia.com
resources:
- '*'
verbs:
- '*'
12 changes: 12 additions & 0 deletions assets/state-dra-driver/0300_clusterrolebinding.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# ClusterRoleBinding "nvidia-dra-driver".
# Binds the ClusterRole named nvidia-dra-driver to the ServiceAccount named
# nvidia-dra-driver. The subject's namespace is a placeholder string that,
# per its text, is expected to be replaced by the operator.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
name: nvidia-dra-driver
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: ClusterRole
name: nvidia-dra-driver
subjects:
- kind: ServiceAccount
name: nvidia-dra-driver
namespace: "FILLED BY THE OPERATOR"
8 changes: 8 additions & 0 deletions assets/state-dra-driver/0400_deviceclass-imex.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# DeviceClass "imex.nvidia.com" (resource.k8s.io/v1alpha3).
# The CEL selector matches devices whose driver is 'gpu.nvidia.com' and whose
# 'type' attribute in the 'gpu.nvidia.com' domain equals 'imex-channel'.
apiVersion: resource.k8s.io/v1alpha3
kind: DeviceClass
metadata:
name: imex.nvidia.com
spec:
selectors:
- cel:
expression: "device.driver == 'gpu.nvidia.com' && device.attributes['gpu.nvidia.com'].type == 'imex-channel'"
44 changes: 44 additions & 0 deletions assets/state-dra-driver/0500_deployment.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Deployment "nvidia-dra-driver-controller": a single-replica Deployment that
# runs the "controller" container under the nvidia-dra-driver service account.
# The namespace and container image are placeholder strings that, per their
# text, are expected to be replaced by the operator.
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app: nvidia-dra-driver-controller
name: nvidia-dra-driver-controller
namespace: "FILLED BY THE OPERATOR"
spec:
replicas: 1
selector:
matchLabels:
app: nvidia-dra-driver-controller
template:
metadata:
labels:
app: nvidia-dra-driver-controller
spec:
priorityClassName: system-node-critical
serviceAccountName: nvidia-dra-driver
# Tolerate the NoSchedule taints keyed on the master / control-plane node roles.
tolerations:
- effect: NoSchedule
key: node-role.kubernetes.io/master
operator: Exists
- effect: NoSchedule
key: node-role.kubernetes.io/control-plane
operator: Exists
containers:
- name: controller
image: "FILLED BY THE OPERATOR"
# NOTE(review): the pull policy is fixed in this manifest; confirm whether the
# operator overrides it from the ClusterPolicy imagePullPolicy field.
imagePullPolicy: IfNotPresent
# "-v 6" is presumably a log verbosity level; verify against nvidia-dra-controller.
command: ["nvidia-dra-controller", "-v", "6"]
env:
- name: DEVICE_CLASSES
value: imex
# POD_NAME and NAMESPACE are populated from the pod's own metadata via fieldRef.
- name: POD_NAME
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.name
- name: NAMESPACE
valueFrom:
fieldRef:
apiVersion: v1
fieldPath: metadata.namespace
34 changes: 34 additions & 0 deletions assets/state-dra-driver/0600_configmap.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# ConfigMap "nvidia-dra-driver-kubelet-plugin-entrypoint": carries a single
# key, entrypoint.sh, holding a bash script. The namespace is a placeholder
# string that, per its text, is expected to be replaced by the operator.
#
# What entrypoint.sh does, in order:
#   1. Polls every 5 seconds until /run/nvidia/validations/driver-ready exists.
#   2. Enables allexport, prints that file, then sources it so the variables
#      it assigns are exported to the environment.
#   3. Exports CONTAINER_DRIVER_ROOT from DRIVER_ROOT_CTR_PATH (presumably set
#      by the sourced driver-ready file; verify against whatever writes it).
#   4. If MASK_NVIDIA_DRIVER_PARAMS is "true": copies
#      /proc/driver/nvidia/params to root/gpu-params, rewrites
#      "ModifyDeviceFiles: 1" to "ModifyDeviceFiles: 0" in the copy, and
#      bind-mounts the copy over /proc/driver/nvidia/params.
#      NOTE(review): root/gpu-params is a relative path, so it resolves
#      against the container's working directory; confirm that is intended.
#   5. Replaces the shell with nvidia-dra-plugin via exec.
#
# Comments are kept out of the script body on purpose: every line under
# "entrypoint.sh: |-" is part of the ConfigMap data.
apiVersion: v1
kind: ConfigMap
metadata:
name: nvidia-dra-driver-kubelet-plugin-entrypoint
namespace: "FILLED BY THE OPERATOR"
labels:
app: nvidia-dra-driver-kubelet-plugin
data:
entrypoint.sh: |-
#!/bin/bash
until [[ -f /run/nvidia/validations/driver-ready ]]
do
echo "waiting for the driver validations to be ready..."
sleep 5
done
set -o allexport
cat /run/nvidia/validations/driver-ready
. /run/nvidia/validations/driver-ready
# TODO: add an alias for DRIVER_ROOT_CTR_PATH in the k8s-dra-driver and remove the below export
export CONTAINER_DRIVER_ROOT=$DRIVER_ROOT_CTR_PATH
# Conditionally mask the params file to prevent this container from
# recreating any missing GPU device nodes. This is necessary, for
# example, when running under nvkind to limit the set GPUs governed
# by the plugin even though it has cgroup access to all of them.
if [ "${MASK_NVIDIA_DRIVER_PARAMS}" = "true" ]; then
cp /proc/driver/nvidia/params root/gpu-params
sed -i 's/^ModifyDeviceFiles: 1$/ModifyDeviceFiles: 0/' root/gpu-params
mount --bind root/gpu-params /proc/driver/nvidia/params
fi
echo "Starting nvidia-dra-plugin"
exec nvidia-dra-plugin
Loading

0 comments on commit 2f5127f

Please sign in to comment.