From 0c946479ab3618ee15952ec67b83cae011733f59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Thu, 19 Jun 2025 13:25:26 +0200 Subject: [PATCH 1/5] Update operator instructions --- gpu/README.md | 60 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 54 insertions(+), 6 deletions(-) diff --git a/gpu/README.md b/gpu/README.md index 57dd45d21c5ea..d7986238bfe11 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -200,13 +200,63 @@ spec: env: # add this env var, if using operator version 1.14.x - name: DD_ENABLE_NVML_DETECTION - value: "true" + value: "true" # add this env var, if using operator versions 1.14.x or 1.15.x - name: DD_COLLECT_GPU_TAGS - value: "true" + value: "true" ``` -For **mixed environments**, use the [DatadogAgentProfiles feature](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md) of the operator, which allows different configurations to be deployed for different nodes. In this case, it is not necessary to modify the DatadogAgent manifest. Instead, create a profile that enables the configuration on GPU nodes only: +For **mixed environments**, use the [DatadogAgentProfiles (DAP) feature](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md) of the operator, which allows different configurations to be deployed for different nodes. Note that this feature is disabled by default, so it needs to be enabled [as described here](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md#enabling-datadogagentprofiles). + +Modifying the DatadogAgent manifest is necessary to enable certain features that are not supported by the DAP yet. First, the existing configuration should enable the `system-probe` container in the datadog-agent pods (this can be easily checked by looking at the list of containers when running `kubectl describe pod -n `). 
Because the DAP feature does not yet support conditionally enabling containers, a feature that uses `system-probe` needs to be enabled for all agent pods. We recommend enabling the `oomKill` integration, as it is lightweight and does not require any additional configuration or extra cost. + +Additionally, the agent needs to be configured so that the NVIDIA container runtime exposes GPUs to the agent. This can be done via environment variables or volume mounts, depending on whether the `accept-nvidia-visible-devices-as-volume-mounts` parameter is set to `true` or `false` in the NVIDIA container runtime configuration. We recommend configuring the agent both ways, as it reduces the chance of misconfiguration and there are no side effects to having both. + +Also, the PodResources socket needs to be exposed to the agent too to integrate with the Kubernetes Device Plugin. Again, this needs to be done globally as the DAP does not yet support conditional volume mounts. + +In summary, the changes that need to be applied to the DatadogAgent manifest are the following: + +```yaml +spec: + features: + oomKill: # Only enable this feature if there is nothing else that requires the system-probe container in all agent pods + enabled: true + +override: + nodeAgent: + volumes: + - name: nvidia-devices + hostPath: + path: /dev/null + - name: pod-resources + hostPath: + path: /var/lib/kubelet/pod-resources + containers: + agent: + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + volumeMounts: + - name: nvidia-devices + mountPath: /dev/nvidia-visible-devices + readOnly: true + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources + readOnly: true + system-probe: + env: + - name: NVIDIA_VISIBLE_DEVICES + value: "all" + volumeMounts: + - name: nvidia-devices + mountPath: /dev/nvidia-visible-devices + readOnly: true + - name: pod-resources + mountPath: /var/lib/kubelet/pod-resources + readOnly: true +``` + +Once the DatadogAgent configuration is changed, create a profile 
that enables the GPU feature configuration on GPU nodes only: ```yaml apiVersion: datadoghq.com/v1alpha1 @@ -229,12 +279,10 @@ spec: env: - name: DD_GPU_MONITORING_ENABLED value: "true" - # add this env var, if using operator version 1.14.x agent: env: - name: DD_ENABLE_NVML_DETECTION - value: "true" - # add this env var, if using operator versions 1.14.x or 1.15.x + value: "true" - name: DD_COLLECT_GPU_TAGS value: "true" ``` From c387439e4e7902f0b00fa39400ab220a2a037c19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Julia=CC=81n?= Date: Mon, 23 Jun 2025 12:59:26 +0200 Subject: [PATCH 2/5] Fix read-only paths --- gpu/README.md | 4 ---- 1 file changed, 4 deletions(-) diff --git a/gpu/README.md b/gpu/README.md index d7986238bfe11..46e496ce3e3b2 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -239,10 +239,8 @@ override: volumeMounts: - name: nvidia-devices mountPath: /dev/nvidia-visible-devices - readOnly: true - name: pod-resources mountPath: /var/lib/kubelet/pod-resources - readOnly: true system-probe: env: - name: NVIDIA_VISIBLE_DEVICES @@ -250,10 +248,8 @@ override: volumeMounts: - name: nvidia-devices mountPath: /dev/nvidia-visible-devices - readOnly: true - name: pod-resources mountPath: /var/lib/kubelet/pod-resources - readOnly: true ``` Once the DatadogAgent configuration is changed, create a profile that enables the GPU feature configuration on GPU nodes only: From 3feb791047936b84307a2c171f7e5c642b356bda Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Juli=C3=A1n?= Date: Mon, 23 Jun 2025 15:11:19 +0200 Subject: [PATCH 3/5] Update gpu/README.md Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> --- gpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/README.md b/gpu/README.md index 46e496ce3e3b2..ac79bea121f33 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -206,7 +206,7 @@ spec: value: "true" ``` -For **mixed environments**, use the [DatadogAgentProfiles (DAP) 
feature](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md) of the operator, which allows different configurations to be deployed for different nodes. Note that this feature is disabled by default, so it needs to be enabled [as described here](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md#enabling-datadogagentprofiles). +For **mixed environments**, use the [DatadogAgentProfiles (DAP) feature](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md) of the operator, which allows different configurations to be deployed for different nodes. Note that this feature is disabled by default, so it needs to be enabled. For more information, see [Enabling DatadogAgentProfiles](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md#enabling-datadogagentprofiles). Modifying the DatadogAgent manifest is necessary to enable certain features that are not supported by the DAP yet. First, the existing configuration should enable the `system-probe` container in the datadog-agent pods (this can be easily checked by looking at the list of containers when running `kubectl describe pod -n `). Because the DAP feature does not yet support conditionally enabling containers, a feature that uses `system-probe` needs to be enabled for all agent pods. We recommend enabling the `oomKill` integration, as it is lightweight and does not require any additional configuration or extra cost. 
From 58486021c15327090ebe0514e629b70ddda7557f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Juli=C3=A1n?= Date: Mon, 23 Jun 2025 15:11:28 +0200 Subject: [PATCH 4/5] Update gpu/README.md Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> --- gpu/README.md | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/gpu/README.md b/gpu/README.md index ac79bea121f33..09a05c4a24699 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -208,11 +208,15 @@ spec: For **mixed environments**, use the [DatadogAgentProfiles (DAP) feature](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md) of the operator, which allows different configurations to be deployed for different nodes. Note that this feature is disabled by default, so it needs to be enabled. For more information, see [Enabling DatadogAgentProfiles](https://github.com/DataDog/datadog-operator/blob/main/docs/datadog_agent_profiles.md#enabling-datadogagentprofiles). -Modifying the DatadogAgent manifest is necessary to enable certain features that are not supported by the DAP yet. First, the existing configuration should enable the `system-probe` container in the datadog-agent pods (this can be easily checked by looking at the list of containers when running `kubectl describe pod -n `). Because the DAP feature does not yet support conditionally enabling containers, a feature that uses `system-probe` needs to be enabled for all agent pods. We recommend enabling the `oomKill` integration, as it is lightweight and does not require any additional configuration or extra cost. - -Additionally, the agent needs to be configured so that the NVIDIA container runtime exposes GPUs to the agent. This can be done via environment variables or volume mounts, depending on whether the `accept-nvidia-visible-devices-as-volume-mounts` parameter is set to `true` or `false` in the NVIDIA container runtime configuration. 
We recommend configuring the agent both ways, as it reduces the chance of misconfiguration and there are no side effects to having both. - -Also, the PodResources socket needs to be exposed to the agent too to integrate with the Kubernetes Device Plugin. Again, this needs to be done globally as the DAP does not yet support conditional volume mounts. +Modifying the DatadogAgent manifest is necessary to enable certain features that are not supported by the DAP yet: +- In the existing configuration, enable the `system-probe` container in the datadog-agent pods. Because the DAP feature does not yet support conditionally enabling containers, a feature that uses `system-probe` needs to be enabled for all Agent pods. + - You can check this by looking at the list of containers when running `kubectl describe pod <datadog-agent-pod-name> -n <namespace>`. + - Datadog recommends enabling the `oomKill` integration, as it is lightweight and does not require any additional configuration or cost. +- Configure the Agent so that the NVIDIA container runtime exposes GPUs to the Agent. + - You can do this using environment variables or volume mounts, depending on whether the `accept-nvidia-visible-devices-as-volume-mounts` parameter is set to `true` or `false` in the NVIDIA container runtime configuration. + - Datadog recommends configuring the Agent both ways, as it reduces the chance of misconfiguration. There are no side effects to having both. +- Expose the PodResources socket to the Agent to integrate with the Kubernetes Device Plugin. + - This needs to be done globally, as the DAP does not yet support conditional volume mounts. 
In summary, the changes that need to be applied to the DatadogAgent manifest are the following: From b13b272c7b4e8ec6a5b36fe0cad499d3f2c6437f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillermo=20Juli=C3=A1n?= Date: Mon, 23 Jun 2025 15:11:39 +0200 Subject: [PATCH 5/5] Update gpu/README.md Co-authored-by: Janine Chan <64388808+janine-c@users.noreply.github.com> --- gpu/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gpu/README.md b/gpu/README.md index 09a05c4a24699..859b4a5608b4b 100644 --- a/gpu/README.md +++ b/gpu/README.md @@ -223,7 +223,7 @@ In summary, the changes that need to be applied to the DatadogAgent manifest are ```yaml spec: features: - oomKill: # Only enable this feature if there is nothing else that requires the system-probe container in all agent pods + oomKill: # Only enable this feature if there is nothing else that requires the system-probe container in all Agent pods enabled: true override: