Is this a duplicate?
Type of Bug
Runtime Error
Component
CUB
Describe the bug
Environment
- GPU: NVIDIA RTX PRO 6000 Blackwell Server Edition (Compute Capability 12.0)
- Driver: 580.95.05
- CUDA: 13.1 (V13.1.115)
- CCCL Version: 3.4.0 (CCCL_VERSION 3004000)
- OS: Ubuntu 24.04.4 LTS
Description
cub::DeviceScan::ExclusiveScan produces incorrect results when using a custom binary operator that propagates values conditionally.
Additional Notes
cub::DeviceScan::InclusiveScan exhibits the same bug at the same positions
How to Reproduce
Minimal Reproducible Example
#include <cub/cub.cuh>
#include <cuda_runtime.h>
#include <numeric>
#include <vector>
#include <cstdio>
// Custom operator that propagates the last "write" value through "read" values
template <typename StackSymbolT>
struct PropagateLastWrite {
__host__ __device__ StackSymbolT operator()(StackSymbolT const& lhs,
StackSymbolT const& rhs) const
{
bool is_rhs_read = (rhs == read_symbol);
bool is_lhs_write = (lhs != read_symbol);
return (is_rhs_read && is_lhs_write) ? lhs : rhs;
}
StackSymbolT read_symbol;
};
int main()
{
constexpr size_t num_elements = 8160;
constexpr char read_symbol = 'x';
constexpr char empty_stack_symbol = '_';
// Create input - mostly read_symbol with scattered write values
std::vector<char> h_input(num_elements, read_symbol);
// Key write values that trigger the bug
h_input[8073] = '[';
h_input[8074] = '{';
h_input[8075] = '[';
h_input[8076] = '{'; // Last '{' before positions 8077-8147 (all 'x')
h_input[8148] = '_';
h_input[8151] = '{';
h_input[8152] = '_';
h_input[8154] = '[';
h_input[8155] = '_';
h_input[8157] = '[';
h_input[8159] = '_';
// Earlier scattered values
h_input[8020] = '_';
h_input[8023] = '{';
h_input[8057] = '[';
h_input[8060] = '{';
h_input[8061] = '[';
h_input[8068] = '{';
// Compute CPU reference
std::vector<char> h_expected(num_elements);
std::exclusive_scan(h_input.begin(),
h_input.end(),
h_expected.begin(),
empty_stack_symbol,
PropagateLastWrite<char>{read_symbol});
// Allocate device memory
char* d_input;
char* d_output;
cudaMalloc(&d_input, num_elements);
cudaMalloc(&d_output, num_elements);
cudaMemcpy(d_input, h_input.data(), num_elements, cudaMemcpyHostToDevice);
// Get temp storage size
size_t temp_storage_bytes = 0;
PropagateLastWrite<char> op{read_symbol};
cub::DeviceScan::ExclusiveScan(nullptr, temp_storage_bytes,
d_input, d_output, op,
empty_stack_symbol, num_elements);
char* d_temp;
cudaMalloc(&d_temp, temp_storage_bytes);
// Run GPU scan
cub::DeviceScan::ExclusiveScan(d_temp, temp_storage_bytes,
d_input, d_output, op,
empty_stack_symbol, num_elements);
// Copy back and compare
std::vector<char> h_output(num_elements);
cudaMemcpy(h_output.data(), d_output, num_elements, cudaMemcpyDeviceToHost);
int mismatches = 0;
for (size_t i = 0; i < num_elements; i++) {
if (h_output[i] != h_expected[i]) {
mismatches++;
}
}
printf("Total mismatches: %d\n", mismatches);
cudaFree(d_input);
cudaFree(d_output);
cudaFree(d_temp);
return mismatches > 0 ? 1 : 0;
}
Actual Output on Blackwell
Mismatch at 8127: GPU='[' (0x5B), CPU='{' (0x7B)
Mismatch at 8128: GPU='[' (0x5B), CPU='{' (0x7B)
...
Mismatch at 8148: GPU='[' (0x5B), CPU='{' (0x7B)
Total mismatches: 22
Analysis
- Position 8076 contains
'{' (the last write before a gap)
- Positions 8077-8147 contain
'x' (read_symbol)
- The exclusive scan should propagate
'{' through all read positions
- Instead, GPU produces
'[' (from position 8075) starting at position 8127
Expected behavior
Expected Output
All positions should match (0 mismatches).
Is this a duplicate?
Type of Bug
Runtime Error
Component
CUB
Describe the bug
Environment
Description
cub::DeviceScan::ExclusiveScanproduces incorrect results when using a custom binary operator that propagates values conditionally.Additional Notes
cub::DeviceScan::InclusiveScanexhibits the same bug at the same positionsHow to Reproduce
Minimal Reproducible Example
Actual Output on Blackwell
Analysis
'{'(the last write before a gap)'x'(read_symbol)'{'through all read positions'['(from position 8075) starting at position 8127Expected behavior
Expected Output
All positions should match (0 mismatches).