GPU support, BitPacking extension

AntonOresten · AntonOresten · commit 40783c155f3e · 2025-09-08T21:29:32.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,11 +1,17 @@
 name = "Microfloats"
 uuid = "31c70f10-a750-4521-b13c-797315ae2933"
 authors = ["Anton Oresten <antonoresten@gmail.com> and contributors"]
-version = "0.0.6"
+version = "0.0.7"
 
 [deps]
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 
+[weakdeps]
+BitPacking = "b58c8408-13c4-4787-8733-ac52107ded21"
+
+[extensions]
+BitPackingExt = "BitPacking"
+
 [compat]
 Random = "1"
 julia = "1.10"
diff --git a/ext/BitPackingExt.jl b/ext/BitPackingExt.jl
@@ -0,0 +1,8 @@
+module BitPackingExt
+# Package extension: BitPacking.bitwidth of a Microfloat type is Microfloats.n_bits.
+using Microfloats
+using BitPacking
+
+BitPacking.bitwidth(::Type{T}) where {T<:Microfloat} = Microfloats.n_bits(T)
+
+end
diff --git a/src/MX/MX.jl b/src/MX/MX.jl
@@ -2,7 +2,7 @@ abstract type MX <: Variant end
 
 const MX_Microfloat{S,E,M} = Microfloat{S,E,M,MX}
 
-const MX_E5M2 = IEEEMicrofloat{1,5,2}
+const MX_E5M2 = MX_Microfloat{1,5,2} # technically IEEE 754 compliant
 const MX_E4M3 = MX_Microfloat{1,4,3}
 const MX_E3M2 = MX_Microfloat{1,3,2}
 const MX_E2M3 = MX_Microfloat{1,2,3}
@@ -15,7 +15,7 @@ const NO_NAN_OR_INF = Union{MX_E3M2, MX_E2M3, MX_E2M1}
 
 Base.isinf(::NO_INF) = false
 Base.isnan(::NO_NAN) = false
-nan(::Type{T}) where T<:NO_NAN = throw(DomainError(T, "$T has no NaN values"))
+nan(::Type{T}) where T<:NO_NAN = zero(T)  # no NaN encoding: return zero rather than throw
 
 Base.floatmax(::Type{T}) where T<:NO_NAN_OR_INF = reinterpret(T, exponent_mask(T) | mantissa_mask(T))
 
@@ -33,8 +33,8 @@ nan(::Type{MX_E8M0}) = reinterpret(MX_E8M0, 0xff)
 # Float32 conversion for MX variants:
 # - exp=all-ones is "normal" except for the MX NaN sentinel(s)
 # - otherwise identical mapping as IEEE
-function _float32(x::T) where {T<:MX_Microfloat}
-    T isa MX_E8M0 && reinterpret(UInt8, x) == 0xff && return NaN32
+function _float32(x::T) where {T<:Union{MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0}}
+    T <: MX_E8M0 && reinterpret(UInt8, x) === 0xff && return NaN32
 
     sgn = UInt32(right_aligned_sign(x))
     exp = UInt32(right_aligned_exponent(x))
@@ -73,7 +73,7 @@ function _float32(x::T) where {T<:MX_Microfloat}
 end
 
 # Saturating to_microfloat tables for MX (no Infs; overflow -> ±floatmax)
-function create_base_shifttable(::Type{T}) where {T<:MX_Microfloat}
+function create_base_shifttable(::Type{T}) where {T<:Union{MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0}}
     basetable = Vector{T}(undef, 512)
     shifttable = Vector{UInt8}(undef, 512)
 
@@ -89,7 +89,7 @@ function create_base_shifttable(::Type{T}) where {T<:MX_Microfloat}
             shifttable[i|0x000+1] = -e + e_shift_subnorm
             shifttable[i|0x100+1] = -e + e_shift_subnorm
         elseif e < e_overflow_mx
-            basebits = (e + Int(bias(T))) << exponent_offset(T)
+            basebits = (e + Int(exponent_bias(T))) << exponent_offset(T)
             basetable[i|0x000+1] = reinterpret(T, UInt8(basebits))
             basetable[i|0x100+1] = reinterpret(T, UInt8(basebits | Int(sign_mask(T))))
             shifttable[i|0x000+1] = n_mantissa_bits(Float32) - n_mantissa_bits(T)
@@ -106,7 +106,7 @@ function create_base_shifttable(::Type{T}) where {T<:MX_Microfloat}
             shifttable[i|0x100+1] = n_mantissa_bits(Float32) - n_mantissa_bits(T)
         end
     end
-    return reinterpret(UInt8, basetable), shifttable
+    return (reinterpret(UInt8, basetable)...,), (shifttable...,)
 end
 
 # Saturating bounds for MX: use finite extrema
diff --git a/src/conversion/conversion.jl b/src/conversion/conversion.jl
@@ -1,5 +1,4 @@
-bias(::Type{T}) where T<:Microfloat = UInt32(2^(n_exponent_bits(T) - 1) - 1)
-bias_difference(::Type{T}) where T<:Microfloat = UInt32(127 - bias(T))
+bias_difference(::Type{T}) where T<:Microfloat = UInt32(exponent_bias(Float32) - exponent_bias(T))
 
 include("to_microfloat.jl")
 include("from_microfloat.jl")
diff --git a/src/conversion/to_microfloat.jl b/src/conversion/to_microfloat.jl
@@ -1,6 +1,6 @@
-e_subnormal(T) = 1 - bias(T) - n_mantissa_bits(T)
-e_normal(T) = 1 - bias(T)
-e_overflow(T) = (2^n_exponent_bits(T) - 2) - bias(T) + 1
+e_subnormal(T) = 1 - exponent_bias(T) - n_mantissa_bits(T)
+e_normal(T) = 1 - exponent_bias(T)
+e_overflow(T) = (2^n_exponent_bits(T) - 2) - exponent_bias(T) + 1
 
 function create_base_shifttable(::Type{T}) where {T<:Microfloat}
 
@@ -20,7 +20,7 @@ function create_base_shifttable(::Type{T}) where {T<:Microfloat}
             shifttable[i|0x000+1] = -e+e_shift_subnorm
             shifttable[i|0x100+1] = -e+e_shift_subnorm
         elseif e < e_overflow(T)                # Normal numbers just lose precision
-            basebits = (e + Int(bias(T))) << exponent_offset(T)
+            basebits = (e + Int(exponent_bias(T))) << exponent_offset(T)
             basetable[i|0x000+1] = reinterpret(T, UInt8(basebits))
             basetable[i|0x100+1] = reinterpret(T, UInt8(basebits | Int(sign_mask(T))))
             shifttable[i|0x000+1] = n_mantissa_bits(Float32)-n_mantissa_bits(T)
@@ -49,7 +49,6 @@ end
     basetable, shifttable = create_base_shifttable(T)
 
     quote
-        isnan(x) && return nan(T) # TODO retain the significant bits for NaN?
         f = reinterpret(UInt32, x)
     
         # exponent+sign index into 512-entry tables (9 bits), 1-based
@@ -75,6 +74,6 @@ end
                 h = h + (UInt8(1) << mantissa_offset(T))
             end
         end
-        return reinterpret(T, h)
+        return ifelse(isnan(x), nan(T), reinterpret(T, h)) # TODO retain the significant bits for NaN?
     end
 end
diff --git a/src/float-bits.jl b/src/float-bits.jl
@@ -7,11 +7,9 @@ uint(::Type{T}) where T<:Unsigned = T
 as_uint(x::T) where T<:AbstractFloat = reinterpret(uint(T), x)
 bit_ones(N, T=UInt8) = (one(uint(T)) << N) - one(uint(T))
 
-n_total_bits(::Type{T}) where T<:AbstractFloat = sizeof(T) * 8
-n_utilized_bits(::Type{T}) where T<:AbstractFloat = n_sign_bits(T) + n_exponent_bits(T) + n_mantissa_bits(T)
-n_rpad_bits(::Type{T}) where T<:AbstractFloat = 0
+n_bits(::Type{T}) where T<:AbstractFloat = n_sign_bits(T) + n_exponent_bits(T) + n_mantissa_bits(T)
 
-mantissa_offset(::Type{T}) where T<:AbstractFloat = n_rpad_bits(T)
+mantissa_offset(::Type{T}) where T<:AbstractFloat = 0  # mantissa field starts at bit 0
 exponent_offset(::Type{T}) where T<:AbstractFloat = n_mantissa_bits(T) + mantissa_offset(T)
 sign_offset(::Type{T}) where T<:AbstractFloat = n_exponent_bits(T) + exponent_offset(T)
 
@@ -27,6 +25,8 @@ right_aligned_sign(x::T) where T<:AbstractFloat = only_sign(x) >> sign_offset(T)
 right_aligned_exponent(x::T) where T<:AbstractFloat = only_exponent(x) >> exponent_offset(T)
 right_aligned_mantissa(x::T) where T<:AbstractFloat = only_mantissa(x) >> mantissa_offset(T)
 
+exponent_bias(::Type{T}) where T<:AbstractFloat = UInt32(2^(n_exponent_bits(T) - 1) - 1)
+
 # right_aligned_sign_mask(::Type{T}) where T<:AbstractFloat = bit_ones(n_sign_bits(T), T)
 right_aligned_exponent_mask(::Type{T}) where T<:AbstractFloat = bit_ones(n_exponent_bits(T), T)
 right_aligned_mantissa_mask(::Type{T}) where T<:AbstractFloat = bit_ones(n_mantissa_bits(T), T)
diff --git a/test/MX/MX_compliance.jl b/test/MX/MX_compliance.jl
@@ -8,7 +8,7 @@
         @testset "FP8" begin
 
             @testset "E4M3" begin
-                @test Microfloats.bias(MX_E4M3) == 7
+                @test Microfloats.exponent_bias(MX_E4M3) == 7
 
                 @test isfinite(reinterpret(MX_E4M3, 0b0_1111_000))
                 @test isfinite(reinterpret(MX_E4M3, 0b1_1111_000))
@@ -38,7 +38,7 @@
             end
 
             @testset "E5M2" begin
-                @test Microfloats.bias(MX_E5M2) == 15
+                @test Microfloats.exponent_bias(MX_E5M2) == 15
 
                 @test reinterpret(UInt8, MX_E5M2(Inf)) == 0b0_11111_00
                 @test reinterpret(UInt8, MX_E5M2(-Inf)) == 0b1_11111_00
@@ -71,7 +71,7 @@
         @testset "FP6" begin
 
             @testset "E2M3" begin
-                @test Microfloats.bias(MX_E2M3) == 1
+                @test Microfloats.exponent_bias(MX_E2M3) == 1
 
                 @test isfinite(reinterpret(MX_E2M3, 0b0_11_000))
                 @test isfinite(reinterpret(MX_E2M3, 0b1_11_000))
@@ -98,7 +98,7 @@
             end
 
             @testset "E3M2" begin
-                @test Microfloats.bias(MX_E3M2) == 3
+                @test Microfloats.exponent_bias(MX_E3M2) == 3
 
                 @test isfinite(reinterpret(MX_E3M2, 0b0_111_00))
                 @test isfinite(reinterpret(MX_E3M2, 0b1_111_00))
@@ -129,7 +129,7 @@
         @testset "FP4" begin
 
             @testset "E2M1" begin
-                @test Microfloats.bias(MX_E2M1) == 1
+                @test Microfloats.exponent_bias(MX_E2M1) == 1
 
                 @test isfinite(reinterpret(MX_E2M1, 0b0_11_0))
                 @test isfinite(reinterpret(MX_E2M1, 0b1_11_0))
@@ -158,7 +158,7 @@
 
         # arithmetic not yet supported for unsigned microfloats
         @testset "E8M0" begin
-            @test Microfloats.bias(MX_E8M0) == 127
+            @test Microfloats.exponent_bias(MX_E8M0) == 127
 
             #@test floatmax(E8M0) == floatmax(Float32) / 2
 
diff --git a/test/MX/MX_properties.jl b/test/MX/MX_properties.jl
@@ -76,21 +76,23 @@ end
 end
 
 @testset "MX: saturation and NaN/Inf mapping from Float32" begin
-    for T in (MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0)
+    for T in (MX_E5M2, MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0)
         @testset "$T" begin
             fmax = floatmax(T)
             # +Inf/-Inf map to ±floatmax (unsigned maps both to +floatmax)
-            @test T(Inf32) == fmax
-            if Microfloats.n_sign_bits(T) == 0
-                @test T(-Inf32) == fmax
-            else
-                @test T(-Inf32) == -fmax
+            if !(T <: MX_E5M2)
+                @test T(Inf32) == fmax
+                if Microfloats.n_sign_bits(T) == 0
+                    @test T(-Inf32) == fmax
+                else
+                    @test T(-Inf32) == -fmax
+                end
             end
-            # NaN maps to sentinel for E4M3/E8M0, else saturates to floatmax
+            # NaN maps to a NaN sentinel for E4M3/E5M2/E8M0; types without NaN map to zero
             if T <: Union{MX_E4M3, MX_E5M2, MX_E8M0}
                 @test isnan(T(NaN32))
             else
-                @test_throws DomainError T(NaN32)
+                @test iszero(T(NaN32))
             end
             # Values just beyond floatmax saturate
             big = nextfloat(Float32(fmax))
@@ -110,7 +112,7 @@ end
             if Microfloats.has_mantissa(T)
                 u = UInt8(1) << Microfloats.mantissa_offset(T)
                 x = reinterpret(T, u)
-                expected = Float32(2.0)^(1 - Microfloats.bias(T) - Microfloats.n_mantissa_bits(T))
+                expected = Float32(2.0)^(1 - Microfloats.exponent_bias(T) - Microfloats.n_mantissa_bits(T))
                 @test Float32(x) == expected
             end
         end
diff --git a/test/Microfloat.jl b/test/Microfloat.jl
@@ -72,7 +72,7 @@ end
 
 @testset "IEEE microfloats: subnormals and rounding" begin
     @testset for T in TYPES
-        bias = Microfloats.bias(T)
+        bias = Microfloats.exponent_bias(T)
         M = Microfloats.n_mantissa_bits(T)
         mo = Microfloats.mantissa_offset(T)
         # Encoding for the minimum positive subnormal (mantissa LSB only)