Enable NaN checks on tensor arguments to kernel launches (#4029)

q10 · facebook-github-bot · commit 5ef7c95c554e · 2025-05-02T19:07:14.000-07:00
Summary: X-link: facebookresearch/FBGEMM#1113 Pull Request resolved: #4029 - Enable NaN checks on tensor arguments to kernel launches Reviewed By: sryap, spcyppt Differential Revision: D73698678 fbshipit-source-id: e87e374c178bfef59db4477aca3874125099eb32
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh b/fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh
@@ -62,14 +62,35 @@ decltype(auto) transform_kernel_arg(const SourceContext& context, T&& arg) {
   }
 }
 
+////////////////////////////////////////////////////////////////////////////////
+// Verify Kernel Argument
+//
+// Verify certain arguments before and after kernel invocation
+////////////////////////////////////////////////////////////////////////////////
+
+template <typename T>
+decltype(auto) check_kernel_arg(const SourceContext& context, T&& arg) {
+  if constexpr (!is_tensor_accessor_builder_v<std::decay_t<T>>) {
+    // Anything that is not a TensorAccessorBuilder has nothing to verify, so
+    // it is perfect-forwarded back to the caller untouched.
+    return std::forward<T>(arg);
+  } else {
+    // A TensorAccessorBuilder gets its value checks run (e.g. NaN checks).
+    return arg.checkValues(context.description());
+  }
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // GPU Kernel Launcher
 //
 // This class encapsulates the common ceremonial pre- and post-execution
 // routines when launching GPU kernels.
 ////////////////////////////////////////////////////////////////////////////////
 
-template <bool EnableDSA = false, bool EnableBarrierIsolation = false>
+template <
+    bool EnableDSA = false,
+    bool EnableBarrierIsolation = false,
+    bool EnableNaNChecks = false>
 struct KernelLauncher {
   const SourceContext context;
 
@@ -234,6 +255,21 @@ struct KernelLauncher {
     // device associated with the compute stream
     checkSharedMemoryPerBlockNotExceeded(properties, shared_mem_per_block);
 
+    // If NaN checks are enabled, run verifications on all kernel arguments that
+    // are tensors
+    if constexpr (EnableNaNChecks) {
+      const auto summary = std::string(context.summary) + " (pre-execution)";
+      (check_kernel_arg(context.withSummary(summary), std::forward<Args>(args)),
+       ...);
+    }
+
+    // If barrier isolation is enabled, synchronize the whole device first
+    // before launching the kernel.  This has roughly the same effect as
+    // setting `CUDA_LAUNCH_BLOCKING=1` as an environment variable.
+    if constexpr (EnableBarrierIsolation) {
+      cudaDeviceSynchronize();
+    }
+
     if constexpr (EnableDSA) {
       // This launch code here is essentially the same as the contents of
       // TORCH_USE_CUDA_DSA macro, but with the addition of kernel argument
@@ -251,13 +287,6 @@ struct KernelLauncher {
           c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
 #endif
 
-      // If barrier isolation is enabled, synchronize the stream first before
-      // launching the kernel.  This has roughly the same effect as setting
-      // `CUDA_LAUNCH_BLOCKING=1` as an environment variable.
-      if constexpr (EnableBarrierIsolation) {
-        cudaDeviceSynchronize();
-      }
-
       // Launch the kernel
       kernel<<<grid, block, shared_mem_per_block, stream>>>(
           // Transform arguments to the kernel before forwarding them.
@@ -285,6 +314,14 @@ struct KernelLauncher {
 
     // Check for CUDA errors
     C10_CUDA_KERNEL_LAUNCH_CHECK();
+
+    // If NaN checks are enabled, run post-kernel verifications on all kernel
+    // arguments that are tensors
+    if constexpr (EnableNaNChecks) {
+      const auto summary = std::string(context.summary) + " (post-execution)";
+      (check_kernel_arg(context.withSummary(summary), std::forward<Args>(args)),
+       ...);
+    }
   }
 };
 
@@ -320,30 +357,38 @@ struct KernelLauncher {
 #define _FKL_TFILE_ ""
 #endif
 
-#ifdef FBGEMM_GPU_KERNEL_DEBUG
-#define _FKL_KDEBUG_ true
+#ifdef FBGEMM_GPU_ISOLATE_KERNEL_LAUNCH
+#define _FKL_BLOCKING_ true
+#else
+#define _FKL_BLOCKING_ false
+#endif
+
+#ifdef FBGEMM_GPU_TENSORCHECK
+#define _FKL_TENSORCHECK_ true
 #else
-#define _FKL_KDEBUG_ false
+#define _FKL_TENSORCHECK_ false
 #endif
 
-#define FBGEMM_LAUNCH_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...)    \
-  ([&] {                                                                \
-    using source_location = fbgemm_gpu::utils::source_location;         \
-    constexpr auto location = source_location::current();               \
-    decltype(KERNEL)& kernel = KERNEL;                                  \
-                                                                        \
-    return fbgemm_gpu::utils::KernelLauncher<false, _FKL_KDEBUG_>(      \
-               location, #KERNEL, _FKL_TFILE_)                          \
-        .launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
+#define FBGEMM_LAUNCH_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...)        \
+  ([&] {                                                                    \
+    using source_location = fbgemm_gpu::utils::source_location;             \
+    constexpr auto location = source_location::current();                   \
+    decltype(KERNEL)& kernel = KERNEL;                                      \
+                                                                            \
+    return fbgemm_gpu::utils::                                              \
+        KernelLauncher<false, _FKL_BLOCKING_, _FKL_TENSORCHECK_>(           \
+               location, #KERNEL, _FKL_TFILE_)                              \
+            .launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
   }())
 
-#define FBGEMM_LAUNCH_DSA_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
-  ([&] {                                                                 \
-    using source_location = fbgemm_gpu::utils::source_location;          \
-    constexpr auto location = source_location::current();                \
-    decltype(KERNEL)& kernel = KERNEL;                                   \
-                                                                         \
-    return fbgemm_gpu::utils::KernelLauncher<true, _FKL_KDEBUG_>(        \
-               location, #KERNEL, _FKL_TFILE_)                           \
-        .launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__);  \
+#define FBGEMM_LAUNCH_DSA_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...)    \
+  ([&] {                                                                    \
+    using source_location = fbgemm_gpu::utils::source_location;             \
+    constexpr auto location = source_location::current();                   \
+    decltype(KERNEL)& kernel = KERNEL;                                      \
+                                                                            \
+    return fbgemm_gpu::utils::                                              \
+        KernelLauncher<true, _FKL_BLOCKING_, _FKL_TENSORCHECK_>(            \
+               location, #KERNEL, _FKL_TFILE_)                              \
+            .launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
   }())
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/source_context.h b/fbgemm_gpu/include/fbgemm_gpu/utils/source_context.h
@@ -80,6 +80,11 @@ struct SourceContext {
 
     return *desc_;
   }
+
+  // Same locations, new summary. NOTE(review): `sum_` may need to outlive it.
+  inline SourceContext withSummary(std::string_view sum_) const noexcept {
+    return SourceContext(location, sum_, secondaryLocation);
+  }
 };
 
 } // namespace fbgemm_gpu::utils
diff --git a/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor_builder.h b/fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor_builder.h
@@ -220,6 +220,26 @@ struct TensorAccessorBuilder {
       return build_ta(context);
     }
   }
+
+  //////////////////////////////////////////////////////////////////////////////
+  // Check Tensor values for NaN and (+/-) Inf; TORCH_CHECK fails if any found
+  //////////////////////////////////////////////////////////////////////////////
+
+  C10_ALWAYS_INLINE void checkValues(std::string_view context) const {
+    TORCH_CHECK(
+        !at::isnan(tensor).any().item<bool>(),
+        context,
+        ": Tensor '",
+        name,
+        "' contains NaN values!");
+
+    TORCH_CHECK(
+        !at::isinf(tensor).any().item<bool>(),
+        context,
+        ": Tensor '",
+        name,
+        "' contains (+/-) Inf values!");
+  }
 };
 
 } // namespace fbgemm_gpu::utils
diff --git a/fbgemm_gpu/test/utils/kernel_launcher_test.cu b/fbgemm_gpu/test/utils/kernel_launcher_test.cu
@@ -10,6 +10,9 @@
 // FBGEMM codebase to denote the template source file in auto-generated code.
 #define __TEMPLATE_SOURCE_FILE__ "FOO/BAR/BAZ-123.cpp"
 
+// Enable tensor value checking before and after executing kernels
+#define FBGEMM_GPU_TENSORCHECK
+
 #include <ATen/ATen.h>
 #include <c10/cuda/CUDADeviceAssertion.h>
 #include <cuda.h>
@@ -71,6 +74,44 @@ __global__ void tensor_sum_kernel(
   }
 }
 
+__device__ unsigned int xor128_rand_int(uint32_t seed) {
+  // Mix the seed with the flat thread index, then apply three xorshifts.
+  auto state = seed ^ (blockIdx.x * blockDim.x + threadIdx.x);
+  state ^= state << 13;
+  state ^= state >> 17;
+  return state ^ (state << 5);
+}
+
+template <typename T>
+__global__ void tensor_sum_kernel_bad_output(
+    pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> C,
+    const pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> A,
+    const pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> B,
+    TORCH_DSA_KERNEL_ARGS) {
+  const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
+  auto seed = xor128_rand_int(42);
+
+  if (idx < C.size(0)) {
+    if (seed = xor128_rand_int(seed); seed % 100 != 0) {
+      // 99% chance of normal value: write the regular element-wise sum
+      C[idx] = A[idx] + B[idx];
+
+    } else {
+      seed = xor128_rand_int(seed);
+      // Otherwise poison the output with one of NaN, +Inf or -Inf
+      if (seed % 3 == 0) {
+        C[idx] = std::numeric_limits<T>::quiet_NaN();
+
+      } else if (seed % 3 == 1) {
+        C[idx] = std::numeric_limits<T>::infinity();
+
+      } else {
+        C[idx] = -std::numeric_limits<T>::infinity();
+      }
+    }
+  }
+}
+
 __global__ void always_fail_assertion_kernel(
     const int a,
     TORCH_DSA_KERNEL_ARGS) {
@@ -197,7 +238,7 @@ TEST(KernelLauncherTest, array_kernel_launch_dsa) {
   });
 }
 
-TEST(KernelLauncherTest, tensor_array_kernel_launch) {
+TEST(KernelLauncherTest, tensor_kernel_launch) {
   const auto size = 1024;
   // Not using structured bindings bc it fails on ROCm with:
   // `capturing a structured binding is not yet supported in OpenMP`
@@ -277,8 +318,8 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
       {
         FBGEMM_LAUNCH_DSA_KERNEL(
             tensor_sum_kernel<float>,
-            // Both grid and block dims conform, but the total number of threads
-            // exceeds the max
+            // Both grid and block dims conform, but the total number of
+            // threads exceeds the max
             {U32(grid_max[0]), U32(grid_max[1]), U32(grid_max[2])},
             {U32(block_max[0]), U32(block_max[1]), U32(block_max[2])},
             0,
@@ -311,6 +352,88 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
       std::exception);
 }
 
+TEST(KernelLauncherTest, tensor_value_checks) {
+  const auto size = 1024;
+  // Not using structured bindings bc it fails on ROCm with:
+  // `capturing a structured binding is not yet supported in OpenMP`
+  at::Tensor A, B, C;
+  std::tie(A, B, C) = sample_tensors(size);
+
+  {
+    // Bad INPUT tensors: the pre-execution check is expected to throw
+
+    const float values[] = {
+        std::numeric_limits<float>::quiet_NaN(),
+        std::numeric_limits<float>::infinity(),
+        -std::numeric_limits<float>::infinity(),
+    };
+
+    for (const auto value : values) {
+      // Poison one randomly chosen element of A
+      auto i = rand() % size;
+      A[i] = value;
+
+      EXPECT_THROW(
+          {
+            FBGEMM_LAUNCH_DSA_KERNEL(
+                tensor_sum_kernel<float>,
+                8,
+                1024,
+                0,
+                at::cuda::getCurrentCUDAStream(),
+                PTA_B(C, float, 1, 64),
+                PTA_B(A, float, 1, 64),
+                PTA_B(B, float, 1, 64));
+          },
+          std::exception);
+
+      // Restore a finite value so later launches see a clean A
+      A[i] = 1;
+    }
+
+    for (const auto value : values) {
+      // Poison one randomly chosen element of B
+      auto i = rand() % size;
+      B[i] = value;
+
+      EXPECT_THROW(
+          {
+            FBGEMM_LAUNCH_DSA_KERNEL(
+                tensor_sum_kernel<float>,
+                8,
+                1024,
+                0,
+                at::cuda::getCurrentCUDAStream(),
+                PTA_B(C, float, 1, 64),
+                PTA_B(A, float, 1, 64),
+                PTA_B(B, float, 1, 64));
+          },
+          std::exception);
+
+      // Restore a finite value so later launches see a clean B
+      B[i] = 1;
+    }
+  }
+
+  {
+    // Bad OUTPUT tensors: the poisoned inputs were restored above, but this
+    // kernel writes NaN/Inf into C, so the launch is still expected to throw
+    EXPECT_THROW(
+        {
+          FBGEMM_LAUNCH_DSA_KERNEL(
+              tensor_sum_kernel_bad_output<float>,
+              8,
+              1024,
+              0,
+              at::cuda::getCurrentCUDAStream(),
+              PTA_B(C, float, 1, 64),
+              PTA_B(A, float, 1, 64),
+              PTA_B(B, float, 1, 64));
+        },
+        std::exception);
+  }
+}
+
 // NOTE: This test currently fails in fbcode CI for HIP with the following
 // error (but runs without issues on both NVIDIA and AMD machines):
 //