Skip to content

Commit 86deef3

Browse files
authored
Add SarBp optimizations and accuracy improvements (#1140)
* Add SarBp optimizations and accuracy improvements This PR includes several SarBp performance optimizations for the fltflt case: - Adds fltflt_sqrt_fast(), which uses fewer operations at a very slight accuracy cost. - Adds fltflt_norm3d(), which uses fewer normalizations and calls fltflt_sqrt_fast() - Splits the calculation of bin such that more terms can be precomputed and stored in shared memory. Reduced inner loop bin calculation from ~24 FLOPs to ~18. In addition, this PR adjusts the computation of the weight w to preserve more bits. Previously, the mixed and fltflt implementations computed bin as: bin = static_cast<loose_compute_t>(diffR * dr_inv) + bin_offset; w = bin - ::floor(bin) However, with large bin counts, this can leave relatively few bits of precision for w. The fltflt and mixed variants have been adjusted to preserve more accuracy at the cost of performance. All told, the fltflt version is ~15% faster due to the optimizations, but the mixed-precision version is slower due to increased use of FP64. In the future, a new option may be added to reduce the precision of the bin calculation for scenarios/ranges where that makes sense. Signed-off-by: Thomas Benson <tbenson@nvidia.com>
1 parent 5bf9643 commit 86deef3

File tree

7 files changed

+467
-47
lines changed

7 files changed

+467
-47
lines changed

bench/00_misc/fltflt_arithmetic.cu

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -409,6 +409,157 @@ NVBENCH_BENCH_TYPES(fltflt_bench_sqrt, NVBENCH_TYPE_AXES(precision_types))
409409
.add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
410410
.add_int64_axis("Iterations", {250});
411411

412+
//==============================================================================
// Square Root Fast Benchmark
// For float/double, this is identical to the sqrt benchmark (sqrtf/sqrt).
// For fltflt, this uses fltflt_sqrt_fast instead of fltflt_sqrt.
//
// Each thread carries ILP_FACTOR independent sqrt dependency chains so the
// benchmark measures throughput with instruction-level parallelism; the final
// lane sum keeps every chain live and produces one output element per thread.
//==============================================================================
template <typename T>
__global__ void iterative_sqrt_fast_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
{
  // Promote blockIdx.x to 64-bit before the multiply: the unsigned 32-bit
  // product blockIdx.x * blockDim.x can wrap for very large grids even though
  // the destination variable is int64_t.
  const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < size) {
    T val[ILP_FACTOR];
    const T init_val = static_cast<T>(2.718281828);

    // First iteration seeds each lane from the constant initial value.
#pragma unroll
    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
      if constexpr (std::is_same_v<T, fltflt>) {
        val[ilp] = fltflt_sqrt_fast(init_val);
      } else {
        val[ilp] = sqrt(init_val);
      }
    }

    // Remaining iterations form a serial dependency chain per lane.
#pragma unroll ITER_UNROLL_FACTOR
    for (int32_t i = 1; i < iterations; i++) {
#pragma unroll
      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
        if constexpr (std::is_same_v<T, fltflt>) {
          val[ilp] = fltflt_sqrt_fast(val[ilp]);
        } else {
          val[ilp] = sqrt(val[ilp]);
        }
      }
    }

    // Reduce the lanes so the compiler cannot discard any chain.
    T result_val = val[0];
#pragma unroll
    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
      result_val = result_val + val[ilp];
    }
    result[idx] = result_val;
  }
}
454+
455+
// Host-side nvbench driver for the fast-sqrt benchmark. Reads the array-size
// and iteration-count axes, allocates the output tensor, registers throughput
// counters, and launches iterative_sqrt_fast_kernel on the nvbench stream.
template <typename PrecisionType>
void fltflt_bench_sqrt_fast(nvbench::state &state, nvbench::type_list<PrecisionType>)
{
  const index_t num_elems = static_cast<index_t>(state.get_int64("Array Size"));
  const int32_t num_iters = static_cast<int32_t>(state.get_int64("Iterations"));
  cudaExecutor exec{0};

  auto result = make_tensor<PrecisionType>({num_elems});

  state.add_element_count(num_elems, "NumElements");
  state.add_global_memory_writes<PrecisionType>(num_elems);

  constexpr int threads_per_block = 256;
  const int num_blocks = static_cast<int>((num_elems + threads_per_block - 1) / threads_per_block);

  // Make sure allocation work has completed before the timed region starts.
  exec.sync();

  state.exec([&](nvbench::launch &launch) {
    const auto stream = (cudaStream_t)launch.get_stream();
    iterative_sqrt_fast_kernel<<<num_blocks, threads_per_block, 0, stream>>>(
      result.Data(), num_elems, num_iters);
  });
}

NVBENCH_BENCH_TYPES(fltflt_bench_sqrt_fast, NVBENCH_TYPE_AXES(precision_types))
  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
  .add_int64_axis("Iterations", {250});
481+
482+
//==============================================================================
// 3D Norm Benchmark: sqrt(dx^2 + dy^2 + dz^2)
// Each ILP lane has distinct dx values that depend on the previous iteration's
// result, creating a true dependency chain that prevents CSE across lanes.
//==============================================================================
template <typename T>
__global__ void iterative_norm3d_kernel(T* __restrict__ result, int64_t size, int32_t iterations)
{
  // Promote blockIdx.x to 64-bit before the multiply: the unsigned 32-bit
  // product blockIdx.x * blockDim.x can wrap for very large grids even though
  // the destination variable is int64_t.
  const int64_t idx = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  if (idx < size) {
    // Per-lane dx values create independent dependency chains
    T dx[ILP_FACTOR];
    const T dy = static_cast<T>(-487293.18274);
    const T dz = static_cast<T>(183649.27391);

#pragma unroll
    for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
      dx[ilp] = static_cast<T>(312847.91837) + static_cast<T>(ilp * 0.1);
    }

#pragma unroll ITER_UNROLL_FACTOR
    for (int32_t i = 0; i < iterations; i++) {
#pragma unroll
      for (int ilp = 0; ilp < ILP_FACTOR; ilp++) {
        T norm;
        if constexpr (std::is_same_v<T, fltflt>) {
          norm = fltflt_norm3d(dx[ilp], dy, dz);
        } else {
          norm = sqrt(dx[ilp] * dx[ilp] + dy * dy + dz * dz);
        }
        // Feed result back into dx to create a dependency chain.
        // Add the computed norm and subtract off the approximate
        // expected norm to keep dx in a stable range while preventing
        // the compiler from optimizing away the computation.
        if constexpr (std::is_same_v<T, fltflt>) {
          // fltflt addition/subtraction is expensive and we do not want to bias the benchmark
          // too much, so at least keep the expected norm as a float rather than fltflt to
          // reduce the cost of the subtraction.
          dx[ilp] = dx[ilp] + (norm - 607499.4f);
        } else {
          dx[ilp] = dx[ilp] + (norm - static_cast<T>(607499.4));
        }
      }
    }

    // Reduce the lanes so the compiler cannot discard any chain.
    T result_val = dx[0];
#pragma unroll
    for (int ilp = 1; ilp < ILP_FACTOR; ilp++) {
      result_val = result_val + dx[ilp];
    }
    result[idx] = result_val;
  }
}
535+
536+
// Host-side nvbench driver for the 3D norm benchmark. Reads the array-size
// and iteration-count axes, allocates the output tensor, registers throughput
// counters, and launches iterative_norm3d_kernel on the nvbench stream.
template <typename PrecisionType>
void fltflt_bench_norm3d(nvbench::state &state, nvbench::type_list<PrecisionType>)
{
  const index_t num_elems = static_cast<index_t>(state.get_int64("Array Size"));
  const int32_t num_iters = static_cast<int32_t>(state.get_int64("Iterations"));
  cudaExecutor exec{0};

  auto result = make_tensor<PrecisionType>({num_elems});

  state.add_element_count(num_elems, "NumElements");
  state.add_global_memory_writes<PrecisionType>(num_elems);

  constexpr int threads_per_block = 256;
  const int num_blocks = static_cast<int>((num_elems + threads_per_block - 1) / threads_per_block);

  // Make sure allocation work has completed before the timed region starts.
  exec.sync();

  state.exec([&](nvbench::launch &launch) {
    const auto stream = (cudaStream_t)launch.get_stream();
    iterative_norm3d_kernel<<<num_blocks, threads_per_block, 0, stream>>>(
      result.Data(), num_elems, num_iters);
  });
}

NVBENCH_BENCH_TYPES(fltflt_bench_norm3d, NVBENCH_TYPE_AXES(precision_types))
  .add_int64_power_of_two_axis("Array Size", nvbench::range(24, 24, 1))
  .add_int64_axis("Iterations", {250});
562+
412563
//==============================================================================
413564
// Absolute Value Benchmark
414565
//==============================================================================

bench/scripts/run_fltflt_benchmarks.py

Lines changed: 51 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,6 +203,49 @@ def parse_benchmark_output(output, verbose=False):
203203
return results
204204

205205

206+
def parse_benchmark_output_no_type(output, verbose=False):
    """
    Parse nvbench output for benchmarks without a type axis (fltflt-only).
    Returns a dict with a single 'fltflt' key.
    """
    results = {}
    lines = strip_ansi(output).strip().split('\n')

    # Locate the 'GPU Time' column index from the markdown table header row.
    gpu_time_col_idx = None
    for line in lines:
        if 'GPU Time' not in line or '|' not in line:
            continue
        cols = [col.strip() for col in line.split('|')]
        for j, col in enumerate(cols):
            if col == 'GPU Time':
                gpu_time_col_idx = j
                break
        if gpu_time_col_idx is not None:
            if verbose:
                print(f" Found GPU Time at column index {gpu_time_col_idx} in: {line.rstrip()}")
            break

    if gpu_time_col_idx is None:
        print(" Warning: Could not find GPU Time column in output")
        return results

    # Scan data rows (skipping the header and separator rows) and take the
    # first parseable GPU time: these benchmarks have a single fltflt variant.
    for line in lines:
        if '|' not in line or 'GPU Time' in line or '---' in line:
            continue
        cols = [col.strip() for col in line.split('|')]
        if len(cols) <= gpu_time_col_idx:
            continue
        gpu_time_str = cols[gpu_time_col_idx]
        gpu_time_ms = parse_time_value(gpu_time_str)
        if gpu_time_ms is not None:
            if verbose:
                print(f" Parsed: type=fltflt, gpu_time_col={gpu_time_str!r}, value={gpu_time_ms:.6f} ms")
            results['fltflt'] = gpu_time_ms
            break

    return results
247+
248+
206249
def format_time(time_ms):
207250
"""Format a time in ms with appropriate precision and units."""
208251
if time_ms is None:
@@ -254,7 +297,7 @@ def print_summary(results, relative):
254297
print("-" * 66)
255298

256299
# Order benchmarks - use the canonical order but only show benchmarks that were actually run
257-
bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
300+
bench_order = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
258301
# Filter to only benchmarks present in results
259302
bench_order = [b for b in bench_order if b in results]
260303

@@ -386,7 +429,9 @@ def main():
386429
print()
387430

388431
# List of benchmarks to run
389-
all_benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
432+
all_benchmarks = ['add', 'sub', 'mul', 'div', 'sqrt', 'sqrt_fast', 'norm3d', 'abs', 'fma', 'madd', 'round', 'trunc', 'floor', 'fmod', 'cast2dbl', 'cast2fltflt']
433+
# Benchmarks that only have a fltflt variant (no float/double type axis)
434+
fltflt_only_benchmarks = set()
390435
benchmarks = args.benchmarks if args.benchmarks is not None else all_benchmarks
391436

392437
# Validate user-provided benchmarks
@@ -410,7 +455,10 @@ def main():
410455
continue
411456

412457
# Parse results
413-
results = parse_benchmark_output(output, verbose=args.verbose)
458+
if bench in fltflt_only_benchmarks:
459+
results = parse_benchmark_output_no_type(output, verbose=args.verbose)
460+
else:
461+
results = parse_benchmark_output(output, verbose=args.verbose)
414462

415463
if not results:
416464
print(f" Warning: Could not parse results for {bench}")

include/matx/kernels/fltflt.h

Lines changed: 62 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -53,8 +53,10 @@ struct alignas(8) fltflt {
5353
float hi;
5454
float lo;
5555

56-
// The default constructor does not initialize the components, so the value is indeterminate.
57-
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt() = default;
56+
// The default constructor does not initialize the components, so the value is indeterminate. Some versions of
57+
// nvcc will warn about __host__ and __device__ annotations on default constructors because default
58+
// constructors will not run in all conditions (e.g., in static shared memory CUDA kernel allocations).
59+
__MATX_INLINE__ fltflt() = default;
5860
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ constexpr explicit fltflt(double x)
5961
: hi(static_cast<float>(x)), lo(static_cast<float>(x - static_cast<double>(hi))) {}
6062
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ constexpr explicit fltflt(float x) : hi(x), lo(0.0f) {}
@@ -142,6 +144,10 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ float fdividef_rn(float a,
142144
static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ float fltflt_rsqrt(float x)
143145
{
144146
#if defined(__CUDA_ARCH__)
147+
// rsqrtf has up to 2 ULP of error. This is less precise than 1.0f / ::sqrtf(x), which
148+
// would be 0.5 ULP of error. We currently use rsqrtf() because it is significantly faster
149+
// while maintaining 44+ bits of precision in testing thus far, but we may need to revisit
150+
// this in the future.
145151
return rsqrtf(x);
146152
#else
147153
return 1.0f / ::sqrtf(x);
@@ -519,6 +525,60 @@ static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_sqrt(fltflt a
519525
return fltflt_add(prod, yn);
520526
}
521527

528+
// fltflt_sqrt_fast() is a faster approximation of fltflt_sqrt() that uses a single FMA to
529+
// compute the residual a - yn^2 instead of full fltflt subtraction. The FMA computes
530+
// a.hi - yn*yn exactly (exact multiply, single rounding), and adding a.lo recovers the
531+
// input's low-order bits. The result has precision comparable to fltflt_sqrt for most
532+
// values at roughly 1/5 the cost (~7 FLOPs vs ~35+). We do see differences for some
533+
// inputs. For example, for 1e9*pi + sqrt(2), fltflt_sqrt() matches the fp64
534+
// baseline in all mantissa bits and fltflt_sqrt_fast() matches the first 45 mantissa bits.
535+
// This function may eventually become the default sqrt() implementation.
536+
static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_sqrt_fast(fltflt a) {
537+
// Initial reciprocal-sqrt estimate; the a.hi == 0.0f guard avoids calling
// fltflt_rsqrt with a zero argument.
const float xn = (a.hi == 0.0f) ? 0.0f : detail::fltflt_rsqrt(a.hi);
538+
// yn ~= sqrt(a.hi), formed as a.hi * (1/sqrt(a.hi)).
const float yn = detail::fmul_rn(a.hi, xn);
539+
// residual = (a.hi - yn*yn) + a.lo; the FMA evaluates a.hi - yn*yn with a single rounding.
const float residual = detail::fadd_rn(
540+
detail::fmaf_rn(-yn, yn, a.hi), a.lo);
541+
// Newton-style correction: residual * (0.5 * xn) ~= residual / (2*sqrt(a.hi)).
const float correction = detail::fmul_rn(
542+
detail::fmul_rn(xn, 0.5f), residual);
543+
// Renormalize the (yn, correction) pair into a canonical hi/lo result.
return fltflt_fast_two_sum(yn, correction);
544+
}
545+
546+
// fltflt_norm3d() computes sqrt(dx^2 + dy^2 + dz^2) with minimal intermediate
547+
// normalizations. Instead of the separate fltflt_mul + fltflt_fma + fltflt_fma + fltflt_sqrt_fast
548+
// chain (5 normalizations, ~50 ops), this function computes all three exact squares,
549+
// accumulates with a single normalization, and applies fltflt_sqrt_fast (~39 ops).
550+
// The three inputs are assumed to be normalized fltflt values.
551+
static __MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt fltflt_norm3d(fltflt dx, fltflt dy, fltflt dz) {
552+
// Exact squares of hi components (each captures full rounding error)
553+
const fltflt px = fltflt_two_prod_fma(dx.hi, dx.hi);
554+
const fltflt py = fltflt_two_prod_fma(dy.hi, dy.hi);
555+
const fltflt pz = fltflt_two_prod_fma(dz.hi, dz.hi);
556+
557+
// Sum the three .hi values using two_sum to capture rounding errors
558+
const fltflt s = fltflt_two_sum(px.hi, py.hi);
559+
const fltflt t = fltflt_two_sum(s.hi, pz.hi);
560+
561+
// Accumulate all eight low-order terms into a single float:
562+
// - two_sum rounding errors: s.lo, t.lo
563+
// - two_prod_fma error terms: px.lo, py.lo, pz.lo
564+
// - cross terms from squaring: 2*dx.hi*dx.lo, 2*dy.hi*dy.lo, 2*dz.hi*dz.lo
565+
// All terms are O(eps) relative to t.hi, so their sum is at most 8*eps*|t.hi|.
566+
// This may result in slight precision loss due to potential overlap between
567+
// lo and t.hi, but this should still be valid for ~44 bits prior to the sqrt.
568+
float lo = detail::fadd_rn(t.lo, s.lo);
569+
lo = detail::fadd_rn(lo, px.lo);
570+
lo = detail::fadd_rn(lo, py.lo);
571+
lo = detail::fadd_rn(lo, pz.lo);
572+
// hi + hi doubles exactly in binary floating point; each FMA then folds the
// 2*hi*lo cross term of (hi + lo)^2 into the accumulator with one rounding.
lo = detail::fmaf_rn(detail::fadd_rn(dx.hi, dx.hi), dx.lo, lo);
573+
lo = detail::fmaf_rn(detail::fadd_rn(dy.hi, dy.hi), dy.lo, lo);
574+
lo = detail::fmaf_rn(detail::fadd_rn(dz.hi, dz.hi), dz.lo, lo);
575+
576+
// Single normalization before sqrt
577+
const fltflt sum_sq = fltflt_fast_two_sum(t.hi, lo);
578+
579+
// Square root of the accumulated sum via the fast single-FMA-residual variant.
return fltflt_sqrt_fast(sum_sq);
580+
}
581+
522582
// Scalar sqrt overload so unary operator dispatch can handle fltflt expressions
__MATX_HOST__ __MATX_DEVICE__ __MATX_INLINE__ fltflt sqrt(fltflt a)
{
  // Delegate to the full-precision double-float square root.
  return fltflt_sqrt(a);
}
524584

0 commit comments

Comments
 (0)