add NVFP4, remove rpad by aligning bits to right

AntonOresten · AntonOresten · commit 3abb5f639956 · 2025-09-01T12:11:21.000+02:00
diff --git a/Project.toml b/Project.toml
@@ -1,7 +1,7 @@
 name = "Microfloats"
 uuid = "31c70f10-a750-4521-b13c-797315ae2933"
 authors = ["Anton Oresten <antonoresten@gmail.com> and contributors"]
-version = "0.0.3"
+version = "0.0.4"
 
 [deps]
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
diff --git a/README.md b/README.md
@@ -36,6 +36,8 @@ const E2M1 = Microfloat(1, 2, 1, :MX)
 const E8M0 = Microfloat(0, 8, 0, :MX)
 ```
 
+For `INT8`, see `FixedPointNumbers.Q1f6`.
+
 ## Installation
 
 ```julia
@@ -49,3 +51,4 @@ Pkg.add("Microfloats")
 - [MicroFloatingPoints.jl](https://github.com/goualard-f/MicroFloatingPoints.jl)
 - [DLFP8Types.jl](https://github.com/chengchingwen/DLFP8Types.jl)
 - [Float8s.jl](https://github.com/JuliaMath/Float8s.jl)
+- [FixedPointNumbers.jl](https://github.com/JuliaMath/FixedPointNumbers.jl)
diff --git a/src/Microfloat.jl b/src/Microfloat.jl
@@ -1,18 +1,18 @@
-primitive type Microfloat{S,E,M,V} <: AbstractFloat 8 end
+abstract type Variant end
+abstract type IEEE <: Variant end
 
-const SignedMicrofloat = Microfloat{1}
-const UnsignedMicrofloat = Microfloat{0}
+primitive type Microfloat{S,E,M,V} <: AbstractFloat 8 end
 
 """
-    Microfloat(S, E, M, V=:IEEE)
+    Microfloat(S, E, M, V=IEEE)
 
 Create a new `Microfloat` type with `S` sign bits, `E` exponent bits, and `M` mantissa bits.
 
 This "type constructor" ensures that the resulting type is legal.
 
-The `V` argument can be set to `:MX` to create a Microscaling Format (MX) type.
+The `V` argument can be set to `MX` to create a Microscaling Format (MX) type.
 """
-function Microfloat(S::Int, E::Int, M::Int, V::Symbol=:IEEE)
+function Microfloat(S::Int, E::Int, M::Int, V::Type{<:Variant}=IEEE)
     S in (0, 1) || throw(ArgumentError("sign bit must be 0 or 1"))
     E >= 1 || throw(ArgumentError("number of exponent bits must be non-negative"))
     M >= 0 || throw(ArgumentError("number of mantissa bits must be non-negative"))
@@ -47,7 +47,7 @@ Base.floatmin(::Type{T}) where T<:Microfloat = n_exponent_bits(T) > 1 ? reinterp
 Base.floatmax(::Type{T}) where T<:Microfloat = reinterpret(T, bit_ones(n_exponent_bits(T) - 1) << (exponent_offset(T) + 1) | mantissa_mask(T))
 
 Base.typemin(::Type{T}) where T<:Microfloat = -inf(T)
-Base.typemin(::Type{T}) where T<:UnsignedMicrofloat = zero(T)
+Base.typemin(::Type{T}) where T<:Microfloat{0} = zero(T)
 
 Base.typemax(::Type{T}) where T<:Microfloat = inf(T)
 
diff --git a/src/Microfloats.jl b/src/Microfloats.jl
@@ -4,9 +4,11 @@ include("float-bits.jl")
 
 include("Microfloat.jl")
 export Microfloat
-export SignedMicrofloat, UnsignedMicrofloat
+export IEEE
 
-include("MX/MX.jl")
+include("microscaled/microscaled.jl")
+export MX, NV
+export MX_E5M2, MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0, NV_E2M1
 
 include("conversion/conversion.jl")
 
diff --git a/src/conversion/to_microfloat.jl b/src/conversion/to_microfloat.jl
@@ -43,7 +43,9 @@ function create_base_shifttable(::Type{T}) where {T<:Microfloat}
     return reinterpret(UInt8, basetable), shifttable
 end
 
-@generated function (::Type{T})(x::Float32) where {S,E,M,T<:Microfloat{S,E,M}}
+(::Type{T})(x::Float32) where {S,E,M,T<:Microfloat{S,E,M}} = T{IEEE}(x)
+
+@generated function (::Type{T})(x::Float32) where {S,E,M,V,T<:Microfloat{S,E,M,V}}
     basetable, shifttable = create_base_shifttable(T)
 
     quote
diff --git a/src/float-bits.jl b/src/float-bits.jl
@@ -9,9 +9,9 @@ bit_ones(N, T=UInt8) = (one(uint(T)) << N) - one(uint(T))
 
 n_total_bits(::Type{T}) where T<:AbstractFloat = sizeof(T) * 8
 n_utilized_bits(::Type{T}) where T<:AbstractFloat = n_sign_bits(T) + n_exponent_bits(T) + n_mantissa_bits(T)
-n_padding_bits(::Type{T}) where T<:AbstractFloat = n_total_bits(T) - n_utilized_bits(T)
+n_rpad_bits(::Type{T}) where T<:AbstractFloat = 0
 
-mantissa_offset(::Type{T}) where T<:AbstractFloat = n_padding_bits(T)
+mantissa_offset(::Type{T}) where T<:AbstractFloat = n_rpad_bits(T)
 exponent_offset(::Type{T}) where T<:AbstractFloat = n_mantissa_bits(T) + mantissa_offset(T)
 sign_offset(::Type{T}) where T<:AbstractFloat = n_exponent_bits(T) + exponent_offset(T)
 
diff --git a/src/microscaled/microscaled.jl b/src/microscaled/microscaled.jl
@@ -1,25 +1,28 @@
-# src/MX/MX.jl
+abstract type MX <: Variant end
+abstract type NV <: Variant end
+const Microscaled = Union{MX, NV}
 
-const MXMicrofloat{S,E,M} = Microfloat{S,E,M,:MX}
-const SignedMXMicrofloat = MXMicrofloat{1}
-const UnsignedMXMicrofloat = MXMicrofloat{0}
+const MXMicrofloat{S,E,M} = Microfloat{S,E,M,MX}
+const NVMicrofloat{S,E,M} = Microfloat{S,E,M,NV}
+const MicroscaledMicrofloat{S,E,M} = Microfloat{S,E,M,<:Microscaled}
 
 const MX_E5M2 = MXMicrofloat{1,5,2}
 const MX_E4M3 = MXMicrofloat{1,4,3}
 const MX_E3M2 = MXMicrofloat{1,3,2}
 const MX_E2M3 = MXMicrofloat{1,2,3}
 const MX_E2M1 = MXMicrofloat{1,2,1}
 const MX_E8M0 = MXMicrofloat{0,8,0}
+const NV_E2M1 = NVMicrofloat{1,2,1}
 
-const MX_NO_INF = Union{MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0}
-const MX_NO_NAN = Union{MX_E3M2, MX_E2M3, MX_E2M1}
-const MX_NO_NAN_OR_INF = Union{MX_E3M2, MX_E2M3, MX_E2M1}
+const NO_INF = Union{MX_E4M3, MX_E3M2, MX_E2M3, MX_E2M1, MX_E8M0, NV_E2M1}
+const NO_NAN = Union{MX_E3M2, MX_E2M3, MX_E2M1, NV_E2M1}
+const NO_NAN_OR_INF = Union{MX_E3M2, MX_E2M3, MX_E2M1, NV_E2M1}
 
-Base.isinf(::MX_NO_INF) = false
-Base.isnan(::MX_NO_NAN) = false
-nan(::Type{T}) where T<:MX_NO_NAN = throw(DomainError(T, "$T has no NaN values"))
+Base.isinf(::NO_INF) = false
+Base.isnan(::NO_NAN) = false
+nan(::Type{T}) where T<:NO_NAN = throw(DomainError(T, "$T has no NaN values"))
 
-Base.floatmax(::Type{T}) where T<:MX_NO_NAN_OR_INF = reinterpret(T, exponent_mask(T) | mantissa_mask(T))
+Base.floatmax(::Type{T}) where T<:NO_NAN_OR_INF = reinterpret(T, exponent_mask(T) | mantissa_mask(T))
 
 # E4M3 (MX): no Infs; only mantissa == 111 at exp=1111 is NaN
 nan(::Type{T}) where T<:MX_E4M3 = reinterpret(T, exponent_mask(T) | mantissa_mask(T))
@@ -35,7 +38,7 @@ nan(::Type{MX_E8M0}) = reinterpret(MX_E8M0, 0xff)
 # Float32 conversion for MX variants:
 # - exp=all-ones is "normal" except for the MX NaN sentinel(s)
 # - otherwise identical mapping as IEEE
-function _float32(x::T) where {T<:MXMicrofloat}
+function _float32(x::T) where {T<:MicroscaledMicrofloat}
     T isa MX_E8M0 && reinterpret(UInt8, x) == 0xff && return NaN32
 
     sgn = UInt32(right_aligned_sign(x))
@@ -75,7 +78,7 @@ function _float32(x::T) where {T<:MXMicrofloat}
 end
 
 # Saturating to_microfloat tables for MX (no Infs; overflow -> ±floatmax)
-function create_base_shifttable(::Type{T}) where {T<:MXMicrofloat}
+function create_base_shifttable(::Type{T}) where {T<:MicroscaledMicrofloat}
     basetable = Vector{T}(undef, 512)
     shifttable = Vector{UInt8}(undef, 512)
 
@@ -112,5 +115,5 @@ function create_base_shifttable(::Type{T}) where {T<:MXMicrofloat}
 end
 
 # Saturating bounds for MX: use finite extrema
-Base.typemax(::Type{T}) where {S,E,M,T<:MXMicrofloat{S,E,M}} = floatmax(T)
-Base.typemin(::Type{T}) where {S,E,M,T<:MXMicrofloat{S,E,M}} = ifelse(n_sign_bits(T) == 0, zero(T), -floatmax(T))
+Base.typemax(::Type{T}) where {S,E,M,T<:MicroscaledMicrofloat{S,E,M}} = floatmax(T)
+Base.typemin(::Type{T}) where {S,E,M,T<:MicroscaledMicrofloat{S,E,M}} = ifelse(n_sign_bits(T) == 0, zero(T), -floatmax(T))
diff --git a/test/MX/MX_compliance.jl b/test/MX/MX_compliance.jl
@@ -8,7 +8,7 @@
         @testset "FP8" begin
 
             @testset "E4M3" begin
-                E4M3 = Microfloat(1, 4, 3, :MX)
+                E4M3 = Microfloat(1, 4, 3, MX)
 
                 @test Microfloats.bias(E4M3) == 7
 
@@ -74,90 +74,90 @@
         @testset "FP6" begin
 
             @testset "E2M3" begin
-                E2M3 = Microfloat(1, 2, 3, :MX)
+                E2M3 = Microfloat(1, 2, 3, MX)
 
                 @test Microfloats.bias(E2M3) == 1
 
-                @test isfinite(reinterpret(E2M3, 0b0_11_000_00))
-                @test isfinite(reinterpret(E2M3, 0b1_11_000_00))
+                @test isfinite(reinterpret(E2M3, 0b0_11_000))
+                @test isfinite(reinterpret(E2M3, 0b1_11_000))
 
                 for i in 0b001:0b111
-                    @test isfinite(reinterpret(E2M3, 0b0_11_000_00 | i << 2))
-                    @test isfinite(reinterpret(E2M3, 0b1_11_000_00 | i << 2))
+                    @test isfinite(reinterpret(E2M3, 0b0_11_000 | i))  # mantissa is right-aligned now: no shift
+                    @test isfinite(reinterpret(E2M3, 0b1_11_000 | i))
                 end
 
-                @test iszero(reinterpret(E2M3, 0b0_00_000_00))
-                @test iszero(reinterpret(E2M3, 0b1_00_000_00))
+                @test iszero(reinterpret(E2M3, 0b0_00_000))
+                @test iszero(reinterpret(E2M3, 0b1_00_000))
 
-                @test reinterpret(E2M3, 0b0_11_111_00) == 2^2 * 1.875
-                @test reinterpret(E2M3, 0b1_11_111_00) == -2^2 * 1.875
+                @test reinterpret(E2M3, 0b0_11_111) == 2^2 * 1.875
+                @test reinterpret(E2M3, 0b1_11_111) == -2^2 * 1.875
 
-                @test reinterpret(E2M3, 0b0_01_000_00) == 2^0 * 1.0
-                @test reinterpret(E2M3, 0b1_01_000_00) == -2^0 * 1.0
+                @test reinterpret(E2M3, 0b0_01_000) == 2^0 * 1.0
+                @test reinterpret(E2M3, 0b1_01_000) == -2^0 * 1.0
 
-                @test reinterpret(E2M3, 0b0_00_111_00) == 2^0 * 0.875
-                @test reinterpret(E2M3, 0b1_00_111_00) == -2^0 * 0.875
+                @test reinterpret(E2M3, 0b0_00_111) == 2^0 * 0.875
+                @test reinterpret(E2M3, 0b1_00_111) == -2^0 * 0.875
 
-                @test reinterpret(E2M3, 0b0_00_001_00) == 2^0 * 0.125
-                @test reinterpret(E2M3, 0b1_00_001_00) == -2^0 * 0.125
+                @test reinterpret(E2M3, 0b0_00_001) == 2^0 * 0.125
+                @test reinterpret(E2M3, 0b1_00_001) == -2^0 * 0.125
 
             end
 
             @testset "E3M2" begin
-                E3M2 = Microfloat(1, 3, 2, :MX)
+                E3M2 = Microfloat(1, 3, 2, MX)
 
                 @test Microfloats.bias(E3M2) == 3
 
-                @test isfinite(reinterpret(E3M2, 0b0_111_00_00))
-                @test isfinite(reinterpret(E3M2, 0b1_111_00_00))
+                @test isfinite(reinterpret(E3M2, 0b0_111_00))
+                @test isfinite(reinterpret(E3M2, 0b1_111_00))
 
                 for i in 0b01:0b11
-                    @test isfinite(reinterpret(E3M2, 0b0_111_00_00 | i << 2))
-                    @test isfinite(reinterpret(E3M2, 0b1_111_00_00 | i << 2))
+                    @test isfinite(reinterpret(E3M2, 0b0_111_00 | i))  # mantissa is right-aligned now: no shift
+                    @test isfinite(reinterpret(E3M2, 0b1_111_00 | i))
                 end
 
-                @test iszero(reinterpret(E3M2, 0b0_000_00_00))
-                @test iszero(reinterpret(E3M2, 0b1_000_00_00))
+                @test iszero(reinterpret(E3M2, 0b0_000_00))
+                @test iszero(reinterpret(E3M2, 0b1_000_00))
 
-                @test reinterpret(E3M2, 0b0_111_11_00) == 2^4 * 1.75
-                @test reinterpret(E3M2, 0b1_111_11_00) == -2^4 * 1.75
+                @test reinterpret(E3M2, 0b0_111_11) == 2^4 * 1.75
+                @test reinterpret(E3M2, 0b1_111_11) == -2^4 * 1.75
 
-                @test reinterpret(E3M2, 0b0_001_00_00) == 2^-2 * 1.0
-                @test reinterpret(E3M2, 0b1_001_00_00) == -2^-2 * 1.0
+                @test reinterpret(E3M2, 0b0_001_00) == 2^-2 * 1.0
+                @test reinterpret(E3M2, 0b1_001_00) == -2^-2 * 1.0
 
-                @test reinterpret(E3M2, 0b0_000_11_00) == 2^-2 * 0.75
-                @test reinterpret(E3M2, 0b1_000_11_00) == -2^-2 * 0.75
+                @test reinterpret(E3M2, 0b0_000_11) == 2^-2 * 0.75
+                @test reinterpret(E3M2, 0b1_000_11) == -2^-2 * 0.75
 
-                @test reinterpret(E3M2, 0b0_000_01_00) == 2^-2 * 0.25
-                @test reinterpret(E3M2, 0b1_000_01_00) == -2^-2 * 0.25
+                @test reinterpret(E3M2, 0b0_000_01) == 2^-2 * 0.25
+                @test reinterpret(E3M2, 0b1_000_01) == -2^-2 * 0.25
             end
 
         end
 
         @testset "FP4" begin
 
             @testset "E2M1" begin
-                E2M1 = Microfloat(1, 2, 1, :MX)
+                E2M1 = Microfloat(1, 2, 1, MX)
 
                 @test Microfloats.bias(E2M1) == 1
 
-                @test isfinite(reinterpret(E2M1, 0b0_11_0_0000))
-                @test isfinite(reinterpret(E2M1, 0b1_11_0_0000))
+                @test isfinite(reinterpret(E2M1, 0b0_11_0))
+                @test isfinite(reinterpret(E2M1, 0b1_11_0))
 
-                @test isfinite(reinterpret(E2M1, 0b0_11_1_0000))
-                @test isfinite(reinterpret(E2M1, 0b1_11_1_0000))
+                @test isfinite(reinterpret(E2M1, 0b0_11_1))
+                @test isfinite(reinterpret(E2M1, 0b1_11_1))
 
-                @test iszero(reinterpret(E2M1, 0b0_00_0_0000))
-                @test iszero(reinterpret(E2M1, 0b1_00_0_0000))
+                @test iszero(reinterpret(E2M1, 0b0_00_0))
+                @test iszero(reinterpret(E2M1, 0b1_00_0))
 
-                @test reinterpret(E2M1, 0b0_11_1_0000) == 2^2 * 1.5
-                @test reinterpret(E2M1, 0b1_11_1_0000) == -2^2 * 1.5
+                @test reinterpret(E2M1, 0b0_11_1) == 2^2 * 1.5
+                @test reinterpret(E2M1, 0b1_11_1) == -2^2 * 1.5
 
-                @test reinterpret(E2M1, 0b0_01_0_0000) == 2^0 * 1.0
-                @test reinterpret(E2M1, 0b1_01_0_0000) == -2^0 * 1.0
+                @test reinterpret(E2M1, 0b0_01_0) == 2^0 * 1.0
+                @test reinterpret(E2M1, 0b1_01_0) == -2^0 * 1.0
 
-                @test reinterpret(E2M1, 0b0_00_1_0000) == 2^0 * 0.5
-                @test reinterpret(E2M1, 0b1_00_1_0000) == -2^0 * 0.5
+                @test reinterpret(E2M1, 0b0_00_1) == 2^0 * 0.5
+                @test reinterpret(E2M1, 0b1_00_1) == -2^0 * 0.5
             end
 
         end
@@ -168,7 +168,7 @@
 
         # arithmetic not yet supported for unsigned microfloats
         @testset "E8M0" begin
-            E8M0 = Microfloat(0, 8, 0, :MX)
+            E8M0 = Microfloat(0, 8, 0, MX)
 
             @test Microfloats.bias(E8M0) == 127
 
diff --git a/test/MX/MX_properties.jl b/test/MX/MX_properties.jl
@@ -1,9 +1,9 @@
-const E4M3 = Microfloat(1, 4, 3, :MX)
-const E5M2 = Microfloat(1, 5, 2, :MX)
-const E3M2 = Microfloat(1, 3, 2, :MX)
-const E2M3 = Microfloat(1, 2, 3, :MX)
-const E2M1 = Microfloat(1, 2, 1, :MX)
-const E8M0 = Microfloat(0, 8, 0, :MX)
+const E4M3 = Microfloat(1, 4, 3, MX)
+const E5M2 = Microfloat(1, 5, 2, MX)
+const E3M2 = Microfloat(1, 3, 2, MX)
+const E2M3 = Microfloat(1, 2, 3, MX)
+const E2M1 = Microfloat(1, 2, 1, MX)
+const E8M0 = Microfloat(0, 8, 0, MX)
 
 @testset "MX: no Infs" begin
     for T in (E4M3, E3M2, E2M3, E2M1, E8M0)
@@ -27,8 +27,8 @@ end
         maxm = UInt8((UInt16(1) << nm) - 1)
         for s in (UInt8(0), sm)
             for mv in UInt8(0):maxm
-                m = mv << mo
-                x = reinterpret(T, s | em | m)
+                m = (mv << mo) & mm
+                x = reinterpret(T, (s & sm) | em | m)
                 if m == mm
                     @test isnan(x)
                 else
@@ -45,11 +45,12 @@ end
             sm = UInt8(Microfloats.sign_mask(T))
             mo = Microfloats.mantissa_offset(T)
             nm = Microfloats.n_mantissa_bits(T)
+            mm = UInt8(Microfloats.mantissa_mask(T))
             maxm = UInt8((UInt16(1) << nm) - 1)
             for s in (UInt8(0), sm)
                 for mv in UInt8(0):maxm
-                    m = mv << mo
-                    x = reinterpret(T, s | em | m)
+                    m = (mv << mo) & mm
+                    x = reinterpret(T, (s & sm) | em | m)
                     @test isfinite(x)
                     @test !isnan(x)
                 end
@@ -69,11 +70,10 @@ end
 @testset "MX: round-trip via Float32 preserves bits (canonical encodings)" begin
     for T in (E4M3, E5M2, E3M2, E2M3, E2M1, E8M0)
         @testset "$T" begin
-            mshift = Microfloats.mantissa_offset(T)
-            mmask  = UInt8(Microfloats.mantissa_mask(T))
+            used_mask = UInt8(Microfloats.sign_mask(T) | Microfloats.exponent_mask(T) | Microfloats.mantissa_mask(T))
             for u in UInt8(0):UInt8(0xff)
                 # Only test canonical encodings where mantissa padding bits are zero
-                (u & ~mmask) != (u & ~mmask & ~(UInt8(1)<<mshift - UInt8(1))) && continue
+                (u & ~used_mask) != 0x00 && continue
                 x = reinterpret(T, u)
                 y = T(Float32(x))
                 @test y ≡ x
@@ -170,10 +170,9 @@ end
             for u in UInt8(0):UInt8(0xff)
                 x = reinterpret(T, u)
                 isnan(x) && continue
-                # Only include canonical encodings
-                mshift = Microfloats.mantissa_offset(T)
-                mmask  = UInt8(Microfloats.mantissa_mask(T))
-                (u & ~mmask) != (u & ~mmask & ~(UInt8(1)<<mshift - UInt8(1))) && continue
+                # Only include canonical encodings: padding bits outside fields are zero
+                used_mask = UInt8(Microfloats.sign_mask(T) | Microfloats.exponent_mask(T) | Microfloats.mantissa_mask(T))
+                (u & ~used_mask) != 0x00 && continue
                 push!(vals, (u, Float32(x), x))
             end
             sort!(vals, by = t -> t[2])
diff --git a/test/Microfloat.jl b/test/Microfloat.jl