diff --git a/Src/Base/AMReX_IntSuperAccumulator.H b/Src/Base/AMReX_IntSuperAccumulator.H new file mode 100644 index 0000000000..8186a40b24 --- /dev/null +++ b/Src/Base/AMReX_IntSuperAccumulator.H @@ -0,0 +1,292 @@ +#ifndef AMREX_INT_SUPERACCUMULATOR_H_ +#define AMREX_INT_SUPERACCUMULATOR_H_ + +#include + +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace amrex { + +/** + * \brief Integer superaccumulator backed by BaseFab storage. + * + * The accumulator stores the exact sum of single-precision floating-point + * numbers by representing the scaled integer \f$\sum_i x_i 2^{-e_\text{min}}\f$ + * in a radix-\f$2^{30}\f$ positional system. Each digit is stored as a 64-bit signed + * integer in a `BaseFab` component. This simplified variant maintains a single logical + * entry whose digits occupy the `digits_per_entry` components. Both + * `accumulate` and `finalize` operate in single precision; callers may cast to + * wider types if needed. All summands are assumed non-negative; subnormal + * inputs are treated as zero to reduce the number of required digits per entry. + */ +class IntSuperAccumulatorFab +{ +public: + static constexpr int digit_bits = 30; //!< Width of each digit in bits. + static constexpr int digit_base = 1 << digit_bits; //!< Base of each digit. + static constexpr int min_exponent = -149; //!< Smallest exponent of scaled mantissa (binary32 minimum normal). + static constexpr int max_exponent = 104; //!< Largest exponent of scaled mantissa (binary32 maximum normal). + static constexpr int significand_bits = 24; //!< Binary32 significand bits including hidden bit. + static constexpr int extra_digits = 1; //!< Safety margin for carry propagation. + static constexpr int digits_for_shift = ((max_exponent - min_exponent) / digit_bits) + 1; // covers exponent window + static constexpr int digits_for_value = 2; //!< Worst-case mantissa occupies two digits at radix 2^30. + static constexpr int digits_per_entry = digits_for_shift + digits_for_value + extra_digits; //!< 12 digits per entry. + + IntSuperAccumulatorFab () = default; + + explicit IntSuperAccumulatorFab (Arena* arena) noexcept + : m_storage(arena) {} + + IntSuperAccumulatorFab (const Box& box, Arena* arena=nullptr) + : m_storage(box, digits_per_entry, arena) {} + + [[nodiscard]] static constexpr int entries () noexcept { return 1; } + + void define (const Box& box, Arena* arena=nullptr) + { + m_storage.resize(box, digits_per_entry, arena); + } + + [[nodiscard]] Array4 array () noexcept { return m_storage.array(); } + [[nodiscard]] Array4 const_array () const noexcept { return m_storage.const_array(); } + + void reset () + { + if (amrex::Gpu::inLaunchRegion()) { + m_storage.template setVal(0); + } else { + m_storage.template setVal(0); + } + } + + void reset (Box const& bx) + { + constexpr int ncomp = digits_per_entry; + if (amrex::Gpu::inLaunchRegion()) { + m_storage.template setVal(0, bx, 0, ncomp); + } else { + m_storage.template setVal(0, bx, 0, ncomp); + } + } + + /** + * \brief Accumulate a floating-point value into the superaccumulator. + * + * The caller must provide the digit array for the single logical entry. + * All operations are carried out using integers; the routine does not + * allocate additional storage. + */ + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + accumulate (Array const& digits, int i, int j, int k, float value) noexcept + { + if (value <= 0.0F) { + if (value < 0.0F) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::accumulate: negative input not supported"); +#endif + } + return; + } + + if (!amrex::Math::isfinite(value)) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::accumulate: non-finite input"); +#else + return; +#endif + } + + const auto bits = detail::decompose(value); + if (bits.is_zero) { return; } + + const int shift = bits.exponent - min_exponent; + int digit_index = shift / digit_bits; + const int bit_offset = shift % digit_bits; + + const unsigned long long magnitude = bits.magnitude; + unsigned long long chunk_value = magnitude << bit_offset; + + constexpr int component_base = 0; + constexpr int component_limit = digits_per_entry; + + while (chunk_value != 0ULL) { + const unsigned long long chunk = chunk_value & static_cast(digit_base - 1); + chunk_value >>= digit_bits; + if (component_base + digit_index >= component_limit) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab: digit overflow"); +#else + break; +#endif + } + detail::add_to_digit(digits, i, j, k, + component_base + digit_index, + component_limit, + chunk); + ++digit_index; + } + } + + /** + * \brief Finalize the accumulator entry and return a binary32 result. + */ + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static float + finalize (Array const& digits, int i, int j, int k) noexcept + { + double result = 0.0; + for (int d = digits_per_entry - 1; d >= 0; --d) { + const Long digit = digits(i,j,k, d); + if (digit == 0) { continue; } + const int exponent = min_exponent + d * digit_bits; + result += std::ldexp(static_cast(digit), exponent); + } + return static_cast(result); + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + clear_entry (Array const& digits, int i, int j, int k) noexcept + { + for (int d = 0; d < digits_per_entry; ++d) { + digits(i,j,k, d) = 0; + } + } + +private: + struct Decomposed + { + unsigned long long magnitude; + int exponent; + bool is_zero; + }; + + struct detail + { + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static void + add_to_digit (Array const& digits, int i, int j, int k, + int component, int component_limit, unsigned long long value) noexcept + { + unsigned long long carry = value; + int current_component = component; + constexpr auto base = static_cast(digit_base); + + while (carry != 0ULL) { + if (current_component >= component_limit) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab: carry overflow"); +#else + break; +#endif + } + +#if defined(AMREX_USE_GPU) && AMREX_DEVICE_COMPILE + // When we compile for the device we cannot rely on a carry-save digit layout, so + // each thread performs an immediate normalization using GPU atomics. + Long* const digit_ptr = &(digits(i,j,k,current_component)); + + // Break the carry into the current radix chunk and the carry to forward. + const unsigned long long chunk = carry & (base - 1ULL); + carry >>= digit_bits; + + // Atomically accumulate this chunk. Multiple threads may land on the same digit, + // which is safe because atomicAdd enforces a total order. + if (chunk != 0ULL) { + amrex::Gpu::Atomic::Add(digit_ptr, static_cast(chunk)); + } + + // Track how many bases we successfully removed so they can be forwarded upward. + unsigned long long propagated = 0ULL; + while (true) { + // Snapshot the digit atomically without changing it. This avoids racing + // against other threads that may be normalizing the same slot. + const Long current_value = amrex::Gpu::Atomic::Add(digit_ptr, static_cast(0)); + if (current_value < digit_base) { + break; + } + + // Attempt to drain one full base from the digit. The return value is the + // pre-subtraction contents, so we know whether we actually claimed the carry. + const Long before = amrex::Gpu::Atomic::Add(digit_ptr, static_cast(-digit_base)); + if (before >= digit_base) { + // We consumed one unit of overflow and should forward it to the next digit. + ++propagated; + continue; + } else { + // Another thread already normalized the digit between the peek and our + // subtract; restore the value and exit. + amrex::Gpu::Atomic::Add(digit_ptr, static_cast(digit_base)); + break; + } + } + + // Fold any carries we successfully removed into the higher-order digits. + carry += propagated; +#else + const unsigned long long sum = static_cast(digits(i,j,k,current_component)) + carry; + const unsigned long long remainder = sum & (base - 1ULL); + const unsigned long long quotient = sum >> digit_bits; + + digits(i,j,k,current_component) = static_cast(remainder); + carry = quotient; +#endif + ++current_component; + } + } + + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE static Decomposed + decompose (float value) noexcept + { + Decomposed bits{}; + + if (value <= 0.0F) { + bits.is_zero = true; + return bits; + } + + union { + float f; + std::uint32_t u; + } pun{}; + pun.f = value; + const std::uint32_t u = pun.u; + + const std::uint32_t exponent_field = (u >> 23) & 0xFFU; + const std::uint32_t fraction = u & ((1U << 23) - 1U); + + if (exponent_field == 0xFFU) { +#ifndef AMREX_USE_GPU + amrex::Abort("IntSuperAccumulatorFab::decompose: NaN or Inf not supported (float)"); +#else + Decomposed tmp{}; tmp.is_zero = true; return tmp; +#endif + } else if (exponent_field == 0U) { + // Treat subnormals as zero; caller is expected to pre-round. + bits.is_zero = true; + return bits; + } else { + bits.magnitude = (1U << 23) | fraction; + bits.exponent = static_cast(exponent_field) - 127 - significand_bits + 1; + } + + bits.is_zero = false; + return bits; + } + }; + + BaseFab m_storage; +}; + +} // namespace amrex + +#endif diff --git a/Src/Base/CMakeLists.txt b/Src/Base/CMakeLists.txt index 89f406013d..07802d1d6b 100644 --- a/Src/Base/CMakeLists.txt +++ b/Src/Base/CMakeLists.txt @@ -137,6 +137,7 @@ foreach(D IN LISTS AMReX_SPACEDIM) AMReX_BaseFab.H AMReX_BaseFab.cpp AMReX_Array4.H + AMReX_IntSuperAccumulator.H AMReX_MakeType.H AMReX_TypeTraits.H AMReX_FabDataType.H diff --git a/Src/Base/Make.package b/Src/Base/Make.package index 675d37b2cc..7b7851076e 100644 --- a/Src/Base/Make.package +++ b/Src/Base/Make.package @@ -163,6 +163,7 @@ C$(AMREX_BASE)_headers += AMReX_TypeTraits.H C$(AMREX_BASE)_headers += AMReX_FabDataType.H C$(AMREX_BASE)_headers += AMReX_Array4.H +C$(AMREX_BASE)_headers += AMReX_IntSuperAccumulator.H C$(AMREX_BASE)_sources += AMReX_BaseFab.cpp C$(AMREX_BASE)_headers += AMReX_BaseFab.H AMReX_BaseFabUtility.H C$(AMREX_BASE)_headers += AMReX_FabFactory.H diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt index 3db6cccbf8..c0a4b82e25 100644 --- a/Tests/CMakeLists.txt +++ b/Tests/CMakeLists.txt @@ -126,7 +126,7 @@ else() # List of subdirectories to search for CMakeLists. # set( AMREX_TESTS_SUBDIRS Amr AsyncOut CallNoinline CLZ CommType CTOParFor DeviceGlobal - Enum HeatEquation MultiBlock MultiPeriod ParmParse Parser + Enum HeatEquation MultiBlock MultiPeriod Numerics ParmParse Parser Parser2 ParserUserFn Reinit RoundoffDomain SmallMatrix) if (AMReX_PARTICLES) diff --git a/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt new file mode 100644 index 0000000000..7e8d9a8080 --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp) + set(_input_files ) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/Numerics/IntSuperAccumulator/GNUmakefile b/Tests/Numerics/IntSuperAccumulator/GNUmakefile new file mode 100644 index 0000000000..a42d27db03 --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/GNUmakefile @@ -0,0 +1,26 @@ +AMREX_HOME ?= ../../../amrex + +PRECISION = FLOAT + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_MPI = FALSE +USE_OMP = FALSE +USE_CUDA = FALSE +USE_HIP = FALSE +USE_SYCL = FALSE + +BL_NO_FORT = TRUE + +TINY_PROFILE = FALSE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/Numerics/IntSuperAccumulator/Make.package b/Tests/Numerics/IntSuperAccumulator/Make.package new file mode 100644 index 0000000000..6b4b865e8f --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/Make.package @@ -0,0 +1 @@ +CEXE_sources += main.cpp diff --git a/Tests/Numerics/IntSuperAccumulator/main.cpp b/Tests/Numerics/IntSuperAccumulator/main.cpp new file mode 100644 index 0000000000..b9577a26ed --- /dev/null +++ b/Tests/Numerics/IntSuperAccumulator/main.cpp @@ -0,0 +1,155 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace +{ + +struct Range +{ + float min_value; + float max_value; + int count; + int seed; +}; + +} // namespace + +int +main (int argc, char* argv[]) +{ + amrex::Initialize(argc, argv); + int status = 0; + + { + using amrex::IntSuperAccumulatorFab; + + const amrex::Box bx(amrex::IntVect::TheZeroVector(), amrex::IntVect::TheZeroVector()); + IntSuperAccumulatorFab accumulator(bx); + + const float lowest_exponent_min = std::numeric_limits::min(); + const float lowest_exponent_max = std::nextafter(std::ldexp(1.0F, -125), 0.0F); + const float highest_exponent_min = std::ldexp(1.0F, 125); + const float highest_exponent_max = std::ldexp(1.0F, 126); // OVERFLOWS TO INF: std::numeric_limits::max(); + + const std::array ranges = {{ + {lowest_exponent_min, lowest_exponent_max, 32768, 3}, + {highest_exponent_min, highest_exponent_max, 4, 5}, + {1.0e-12F, 1.0e-3F, 32768, 11}, + {1.0e-4F, 1.0F, 65536, 23}, + {1.0F, 1.0e6F, 65536, 37}, + {1.0e-2F, 1.0e8F, 131072, 41} + }}; + + auto compute_reference = [] (amrex::Gpu::HostVector const& values) -> float { + volatile double sum = 0.0; + volatile double corr = 0.0; + for (float v : values) { + const volatile double y = static_cast(v) - corr; + const volatile double t = sum + y; + corr = (t - sum) - y; + sum = t; + } + return static_cast(sum); + }; + + auto accumulate_values = [&] (amrex::Gpu::HostVector const& values) -> float { + accumulator.reset(bx); + auto digits = accumulator.array(); + constexpr int i = 0; + constexpr int j = 0; + constexpr int k = 0; + + const int count = static_cast(values.size()); + + if (amrex::Gpu::inLaunchRegion()) { + amrex::Gpu::DeviceVector d_values(count); + amrex::Gpu::copy(amrex::Gpu::hostToDevice, values.begin(), values.end(), d_values.begin()); + + auto *ptr = d_values.data(); + auto arr = digits; + + amrex::ParallelFor(count, [=] AMREX_GPU_DEVICE (int n) { + IntSuperAccumulatorFab::accumulate(arr, i, j, k, ptr[n]); + }); + + amrex::Gpu::streamSynchronize(); + + amrex::Gpu::DeviceScalar d_result; + auto *res_ptr = d_result.dataPtr(); + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) { + res_ptr[0] = IntSuperAccumulatorFab::finalize(arr, i, j, k); + }); + amrex::Gpu::streamSynchronize(); + return d_result.dataValue(); + } else { + for (float v : values) { + IntSuperAccumulatorFab::accumulate(digits, i, j, k, v); + } + return IntSuperAccumulatorFab::finalize(digits, i, j, k); + } + }; + + for (const auto& cfg : ranges) { + std::mt19937 gen(cfg.seed); + std::uniform_real_distribution dist(cfg.min_value, cfg.max_value); + + amrex::Gpu::HostVector values(cfg.count); + for (int n = 0; n < cfg.count; ++n) { + values[n] = dist(gen); + } + + const float reference = compute_reference(values); + const float result = accumulate_values(values); + + amrex::Print() << "Range [" << cfg.min_value << ", " << cfg.max_value + << "] with count " << cfg.count + << ": reference=" << reference + << " accumulator=" << result << '\n'; + + if (result != reference) { + amrex::Print() << "Mismatch for range [" << cfg.min_value << ", " + << cfg.max_value << "] with count " << cfg.count + << ": reference=" << reference + << " accumulator=" << result << '\n'; + status = 1; + break; + } + } + + if (status == 0) { + constexpr float value = 0.125F; + constexpr int repeats = 200000; + + amrex::Gpu::HostVector values(repeats); + for (int n = 0; n < repeats; ++n) { + values[n] = value; + } + + const float reference = compute_reference(values); + const float result = accumulate_values(values); + + amrex::Print() << "Constant accumulation: reference=" << reference + << " accumulator=" << result << '\n'; + + if (result != reference) { + amrex::Print() << "Mismatch for constant accumulation: reference=" + << reference << " accumulator=" << result << '\n'; + status = 1; + } + } + } + + amrex::Finalize(); + return status; +}