Skip to content

Commit 5ef7c95

Browse files
q10facebook-github-bot
authored andcommitted
Enable NaN checks on tensor arguments to kernel launches (#4029)
Summary: X-link: facebookresearch/FBGEMM#1113 Pull Request resolved: #4029 - Enable NaN checks on tensor arguments to kernel launches Reviewed By: sryap, spcyppt Differential Revision: D73698678 fbshipit-source-id: e87e374c178bfef59db4477aca3874125099eb32
1 parent 0911c94 commit 5ef7c95

File tree

4 files changed

+225
-32
lines changed

4 files changed

+225
-32
lines changed

fbgemm_gpu/include/fbgemm_gpu/utils/kernel_launcher.cuh

Lines changed: 74 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -62,14 +62,35 @@ decltype(auto) transform_kernel_arg(const SourceContext& context, T&& arg) {
6262
}
6363
}
6464

65+
////////////////////////////////////////////////////////////////////////////////
66+
// Verify Kernel Argument
67+
//
68+
// Verify certain arguments before and after kernel invocation
69+
////////////////////////////////////////////////////////////////////////////////
70+
71+
template <typename T>
72+
decltype(auto) check_kernel_arg(const SourceContext& context, T&& arg) {
73+
if constexpr (is_tensor_accessor_builder_v<std::decay_t<T>>) {
74+
// If the arg is a TensorAccessorBuilder, run verifications on the tensor it
75+
// is ref-wrapping, e.g. NaN value checks.
76+
return arg.checkValues(context.description());
77+
} else {
78+
// Otherwise, perfect-forward the argument as is
79+
return std::forward<T>(arg);
80+
}
81+
}
82+
6583
////////////////////////////////////////////////////////////////////////////////
6684
// GPU Kernel Launcher
6785
//
6886
// This class encapsulates the common ceremonial pre- and post-execution
6987
// routines when launching GPU kernels.
7088
////////////////////////////////////////////////////////////////////////////////
7189

72-
template <bool EnableDSA = false, bool EnableBarrierIsolation = false>
90+
template <
91+
bool EnableDSA = false,
92+
bool EnableBarrierIsolation = false,
93+
bool EnableNaNChecks = false>
7394
struct KernelLauncher {
7495
const SourceContext context;
7596

@@ -234,6 +255,21 @@ struct KernelLauncher {
234255
// device associated with the compute stream
235256
checkSharedMemoryPerBlockNotExceeded(properties, shared_mem_per_block);
236257

258+
// If NaN checks are enabled, run verifications on all kernel arguments that
259+
// are tensors
260+
if constexpr (EnableNaNChecks) {
261+
const auto summary = std::string(context.summary) + " (pre-execution)";
262+
(check_kernel_arg(context.withSummary(summary), std::forward<Args>(args)),
263+
...);
264+
}
265+
266+
// If barrier isolation is enabled, synchronize the stream first before
267+
// launching the kernel. This has roughly the same effect as setting
268+
// `CUDA_LAUNCH_BLOCKING=1` as an environment variable.
269+
if constexpr (EnableBarrierIsolation) {
270+
cudaDeviceSynchronize();
271+
}
272+
237273
if constexpr (EnableDSA) {
238274
// This launch code here is essentially the same as the contents of
239275
// TORCH_USE_CUDA_DSA macro, but with the addition of kernel argument
@@ -251,13 +287,6 @@ struct KernelLauncher {
251287
c10::cuda::CUDAKernelLaunchRegistry::get_singleton_ref();
252288
#endif
253289

254-
// If barrier isolation is enabled, synchronize the stream first before
255-
// launching the kernel. This has roughly the same effect as setting
256-
// `CUDA_LAUNCH_BLOCKING=1` as an environment variable.
257-
if constexpr (EnableBarrierIsolation) {
258-
cudaDeviceSynchronize();
259-
}
260-
261290
// Launch the kernel
262291
kernel<<<grid, block, shared_mem_per_block, stream>>>(
263292
// Transform arguments to the kernel before forwarding them.
@@ -285,6 +314,14 @@ struct KernelLauncher {
285314

286315
// Check for CUDA errors
287316
C10_CUDA_KERNEL_LAUNCH_CHECK();
317+
318+
// If NaN checks are enabled, run post-kernel verifications on all kernel
319+
// arguments that are tensors
320+
if constexpr (EnableNaNChecks) {
321+
const auto summary = std::string(context.summary) + " (post-execution)";
322+
(check_kernel_arg(context.withSummary(summary), std::forward<Args>(args)),
323+
...);
324+
}
288325
}
289326
};
290327

@@ -320,30 +357,38 @@ struct KernelLauncher {
320357
#define _FKL_TFILE_ ""
321358
#endif
322359

323-
#ifdef FBGEMM_GPU_KERNEL_DEBUG
324-
#define _FKL_KDEBUG_ true
360+
#ifdef FBGEMM_GPU_ISOLATE_KERNEL_LAUNCH
361+
#define _FKL_BLOCKING_ true
362+
#else
363+
#define _FKL_BLOCKING_ false
364+
#endif
365+
366+
#ifdef FBGEMM_GPU_TENSORCHECK
367+
#define _FKL_TENSORCHECK_ true
325368
#else
326-
#define _FKL_KDEBUG_ false
369+
#define _FKL_TENSORCHECK_ false
327370
#endif
328371

329-
#define FBGEMM_LAUNCH_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
330-
([&] { \
331-
using source_location = fbgemm_gpu::utils::source_location; \
332-
constexpr auto location = source_location::current(); \
333-
decltype(KERNEL)& kernel = KERNEL; \
334-
\
335-
return fbgemm_gpu::utils::KernelLauncher<false, _FKL_KDEBUG_>( \
336-
location, #KERNEL, _FKL_TFILE_) \
337-
.launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
372+
#define FBGEMM_LAUNCH_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
373+
([&] { \
374+
using source_location = fbgemm_gpu::utils::source_location; \
375+
constexpr auto location = source_location::current(); \
376+
decltype(KERNEL)& kernel = KERNEL; \
377+
\
378+
return fbgemm_gpu::utils:: \
379+
KernelLauncher<false, _FKL_BLOCKING_, _FKL_TENSORCHECK_>( \
380+
location, #KERNEL, _FKL_TFILE_) \
381+
.launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
338382
}())
339383

340-
#define FBGEMM_LAUNCH_DSA_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
341-
([&] { \
342-
using source_location = fbgemm_gpu::utils::source_location; \
343-
constexpr auto location = source_location::current(); \
344-
decltype(KERNEL)& kernel = KERNEL; \
345-
\
346-
return fbgemm_gpu::utils::KernelLauncher<true, _FKL_KDEBUG_>( \
347-
location, #KERNEL, _FKL_TFILE_) \
348-
.launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
384+
#define FBGEMM_LAUNCH_DSA_KERNEL(KERNEL, GRID, BLOCK, SMEM, STREAM, ...) \
385+
([&] { \
386+
using source_location = fbgemm_gpu::utils::source_location; \
387+
constexpr auto location = source_location::current(); \
388+
decltype(KERNEL)& kernel = KERNEL; \
389+
\
390+
return fbgemm_gpu::utils:: \
391+
KernelLauncher<true, _FKL_BLOCKING_, _FKL_TENSORCHECK_>( \
392+
location, #KERNEL, _FKL_TFILE_) \
393+
.launch_kernel(kernel, GRID, BLOCK, SMEM, STREAM, __VA_ARGS__); \
349394
}())

fbgemm_gpu/include/fbgemm_gpu/utils/source_context.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,11 @@ struct SourceContext {
8080

8181
return *desc_;
8282
}
83+
84+
inline SourceContext withSummary(
85+
const std::string_view& sum_) const noexcept {
86+
return SourceContext(location, sum_, secondaryLocation);
87+
}
8388
};
8489

8590
} // namespace fbgemm_gpu::utils

fbgemm_gpu/include/fbgemm_gpu/utils/tensor_accessor_builder.h

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,6 +220,26 @@ struct TensorAccessorBuilder {
220220
return build_ta(context);
221221
}
222222
}
223+
224+
//////////////////////////////////////////////////////////////////////////////
225+
// Check Tensor values for NaN
226+
//////////////////////////////////////////////////////////////////////////////
227+
228+
C10_ALWAYS_INLINE void checkValues(const std::string_view& context) const {
229+
TORCH_CHECK(
230+
!at::isnan(tensor).any().item<bool>(),
231+
context,
232+
": Tensor '",
233+
name,
234+
"' contains NaN values!");
235+
236+
TORCH_CHECK(
237+
!at::isinf(tensor).any().item<bool>(),
238+
context,
239+
": Tensor '",
240+
name,
241+
"' contains (+/-) Inf values!");
242+
}
223243
};
224244

225245
} // namespace fbgemm_gpu::utils

fbgemm_gpu/test/utils/kernel_launcher_test.cu

Lines changed: 126 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,9 @@
1010
// FBGEMM codebase to denote the template source file in auto-generated code.
1111
#define __TEMPLATE_SOURCE_FILE__ "FOO/BAR/BAZ-123.cpp"
1212

13+
// Enable tensor value checking before and after executing kernels
14+
#define FBGEMM_GPU_TENSORCHECK
15+
1316
#include <ATen/ATen.h>
1417
#include <c10/cuda/CUDADeviceAssertion.h>
1518
#include <cuda.h>
@@ -71,6 +74,44 @@ __global__ void tensor_sum_kernel(
7174
}
7275
}
7376

77+
__device__ unsigned int xor128_rand_int(uint32_t seed) {
78+
auto x = seed ^ (blockIdx.x * blockDim.x + threadIdx.x);
79+
x ^= x << 13;
80+
x ^= x >> 17;
81+
x ^= x << 5;
82+
return x;
83+
}
84+
85+
template <typename T>
86+
__global__ void tensor_sum_kernel_bad_output(
87+
pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> C,
88+
const pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> A,
89+
const pta::PackedTensorAccessor64<T, 1, at::RestrictPtrTraits> B,
90+
TORCH_DSA_KERNEL_ARGS) {
91+
const auto idx = blockIdx.x * blockDim.x + threadIdx.x;
92+
auto seed = xor128_rand_int(42);
93+
94+
if (idx < C.size(0)) {
95+
if (seed = xor128_rand_int(seed); seed % 100 != 0) {
96+
// 99% chance of normal value
97+
C[idx] = A[idx] + B[idx];
98+
99+
} else {
100+
seed = xor128_rand_int(seed);
101+
102+
if (seed % 3 == 0) {
103+
C[idx] = std::numeric_limits<T>::quiet_NaN();
104+
105+
} else if (seed % 3 == 1) {
106+
C[idx] = std::numeric_limits<T>::infinity();
107+
108+
} else {
109+
C[idx] = std::numeric_limits<T>::infinity();
110+
}
111+
}
112+
}
113+
}
114+
74115
__global__ void always_fail_assertion_kernel(
75116
const int a,
76117
TORCH_DSA_KERNEL_ARGS) {
@@ -197,7 +238,7 @@ TEST(KernelLauncherTest, array_kernel_launch_dsa) {
197238
});
198239
}
199240

200-
TEST(KernelLauncherTest, tensor_array_kernel_launch) {
241+
TEST(KernelLauncherTest, tensor_kernel_launch) {
201242
const auto size = 1024;
202243
// Not using structured bindings bc it fails on ROCm with:
203244
// `capturing a structured binding is not yet supported in OpenMP`
@@ -277,8 +318,8 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
277318
{
278319
FBGEMM_LAUNCH_DSA_KERNEL(
279320
tensor_sum_kernel<float>,
280-
// Both grid and block dims conform, but the total number of threads
281-
// exceeds the max
321+
// Both grid and block dims conform, but the total number of
322+
// threads exceeds the max
282323
{U32(grid_max[0]), U32(grid_max[1]), U32(grid_max[2])},
283324
{U32(block_max[0]), U32(block_max[1]), U32(block_max[2])},
284325
0,
@@ -311,6 +352,88 @@ TEST(KernelLauncherTest, kernel_launch_checks) {
311352
std::exception);
312353
}
313354

355+
TEST(KernelLauncherTest, tensor_value_checks) {
356+
const auto size = 1024;
357+
// Not using structured bindings bc it fails on ROCm with:
358+
// `capturing a structured binding is not yet supported in OpenMP`
359+
at::Tensor A, B, C;
360+
std::tie(A, B, C) = sample_tensors(size);
361+
362+
{
363+
// Test for bad INPUT tensors
364+
365+
const float values[] = {
366+
std::numeric_limits<float>::quiet_NaN(),
367+
std::numeric_limits<float>::infinity(),
368+
-std::numeric_limits<float>::infinity(),
369+
};
370+
371+
for (const auto value : values) {
372+
// Set a bad value
373+
auto i = rand() % size;
374+
A[i] = value;
375+
376+
EXPECT_THROW(
377+
{
378+
FBGEMM_LAUNCH_DSA_KERNEL(
379+
tensor_sum_kernel<float>,
380+
8,
381+
1024,
382+
0,
383+
at::cuda::getCurrentCUDAStream(),
384+
PTA_B(C, float, 1, 64),
385+
PTA_B(A, float, 1, 64),
386+
PTA_B(B, float, 1, 64));
387+
},
388+
std::exception);
389+
390+
// Unset the bad value
391+
A[i] = 1;
392+
}
393+
394+
for (const auto value : values) {
395+
// Set a bad value
396+
auto i = rand() % size;
397+
B[i] = value;
398+
399+
EXPECT_THROW(
400+
{
401+
FBGEMM_LAUNCH_DSA_KERNEL(
402+
tensor_sum_kernel<float>,
403+
8,
404+
1024,
405+
0,
406+
at::cuda::getCurrentCUDAStream(),
407+
PTA_B(C, float, 1, 64),
408+
PTA_B(A, float, 1, 64),
409+
PTA_B(B, float, 1, 64));
410+
},
411+
std::exception);
412+
413+
// Unset the bad value
414+
B[i] = 1;
415+
}
416+
}
417+
418+
{
419+
// Test for bad OUTPUT tensors
420+
421+
EXPECT_THROW(
422+
{
423+
FBGEMM_LAUNCH_DSA_KERNEL(
424+
tensor_sum_kernel_bad_output<float>,
425+
8,
426+
1024,
427+
0,
428+
at::cuda::getCurrentCUDAStream(),
429+
PTA_B(C, float, 1, 64),
430+
PTA_B(A, float, 1, 64),
431+
PTA_B(B, float, 1, 64));
432+
},
433+
std::exception);
434+
}
435+
}
436+
314437
// NOTE: This test currently fails in fbcode CI for HIP with the following
315438
// error (but runs without issues on both NVIDIA and AMD machines):
316439
//

0 commit comments

Comments
 (0)