Add Quantized GEMM kernel for Arm NEON on macOS ARM (#249)

Copilot · web-flow · commit d9a43a8fd02b · 2026-03-28T21:20:11.000+09:00
diff --git a/src/ArchAvailable.h b/src/ArchAvailable.h
@@ -59,7 +59,7 @@ namespace kiwi
 		static_cast<std::ptrdiff_t>(ArchType::sse4_1)
 #endif
 #if CPUINFO_ARCH_ARM64
-		//static_cast<std::ptrdiff_t>(ArchType::neon)
+		static_cast<std::ptrdiff_t>(ArchType::neon)
 #endif
 #else
 #ifdef KIWI_ARCH_X86_64
@@ -72,7 +72,7 @@ namespace kiwi
 		static_cast<std::ptrdiff_t>(ArchType::sse4_1)
 #endif
 #ifdef KIWI_ARCH_ARM64
-		//static_cast<std::ptrdiff_t>(ArchType::neon)
+		static_cast<std::ptrdiff_t>(ArchType::neon)
 #endif
 #endif
 	>;
diff --git a/src/CoNgramModel.cpp b/src/CoNgramModel.cpp
@@ -1,5 +1,7 @@
-﻿#include <iostream>
+#include <iostream>
 #include <fstream>
+#include <cstring>
+#include <limits>
 #include "PathEvaluator.hpp"
 #include "Joiner.hpp"
 #include "Kiwi.hpp"
@@ -626,7 +628,8 @@ namespace kiwi
 				if constexpr (quantized)
 				{
 					float scale;
-					eptr += requantizePackedInts<arch>(optr, scale, eptr, header.dim, header.qbit, header.qgroup, true);
+					const bool toUint8 = arch != ArchType::neon;
+					eptr += requantizePackedInts<arch>(optr, scale, eptr, header.dim, header.qbit, header.qgroup, toUint8);
 					optr += header.dim;
 					*reinterpret_cast<float*>(optr) = scale;
 					optr += sizeof(float);
@@ -678,11 +681,22 @@ namespace kiwi
 
 			if constexpr (quantized)
 			{
-				qgemm::invNormU8<arch>(
-					header.contextSize, header.dim,
-					getContextQuantEmb(0), contextEmbStride(),
-					const_cast<float*>(invNormContextPtr)
-				);
+				if constexpr (arch == ArchType::neon)
+				{
+					qgemm::invNormS8<arch>(
+						header.contextSize, header.dim,
+						getContextQuantEmbS8(0), contextEmbStride(),
+						const_cast<float*>(invNormContextPtr)
+					);
+				}
+				else
+				{
+					qgemm::invNormU8<arch>(
+						header.contextSize, header.dim,
+						getContextQuantEmb(0), contextEmbStride(),
+						const_cast<float*>(invNormContextPtr)
+					);
+				}
 				qgemm::invNormS8<arch>(
 					header.vocabSize, header.dim,
 					getOutputQuantEmb(0), outputEmbStride(),
@@ -711,7 +725,8 @@ namespace kiwi
 					if (quantized)
 					{
 						float scale;
-						eptr += requantizePackedInts<arch>(optr, scale, eptr, header.dim, header.qbit, header.qgroup, true);
+						const bool toUint8 = arch != ArchType::neon;
+						eptr += requantizePackedInts<arch>(optr, scale, eptr, header.dim, header.qbit, header.qgroup, toUint8);
 						optr += header.dim;
 						*reinterpret_cast<float*>(optr) = scale;
 						optr += sizeof(float);
@@ -771,6 +786,7 @@ namespace kiwi
 					eptr += sizeof(uint16_t);
 				}
 			}
+
 		}
 
 		template<ArchType arch, class KeyType, class VlKeyType, size_t windowSize, bool quantized>
@@ -856,13 +872,25 @@ namespace kiwi
 				{
 					const auto* contextPtr = getContextQuantEmb(unpackedContextId);
 					const auto* outputPtr = getOutputQuantEmb(next);
-					int32_t acc = qgemm::dotprod<arch>(contextPtr, outputPtr, header.dim);
-					const float contextScale = *reinterpret_cast<const float*>(contextPtr + header.dim),
-						outputScale = *reinterpret_cast<const float*>(outputPtr + header.dim),
+					float contextBias;
+					if constexpr (arch == ArchType::neon)
+					{
+						const auto* contextPtrS8 = getContextQuantEmbS8(unpackedContextId);
+						const auto* contextRaw = reinterpret_cast<const uint8_t*>(contextPtrS8);
+						const float score = qgemm::dotS8S8<arch>(header.dim, contextPtrS8, outputPtr);
+						contextBias = *reinterpret_cast<const float*>(contextRaw + header.dim + sizeof(float));
+						ll = score + contextBias;
+					}
+					else
+					{
+						int32_t acc = qgemm::dotprod<arch>(contextPtr, outputPtr, header.dim);
+						const float contextScale = *reinterpret_cast<const float*>(contextPtr + header.dim);
+						const float outputScale = *reinterpret_cast<const float*>(outputPtr + header.dim);
 						contextBias = *reinterpret_cast<const float*>(contextPtr + header.dim + sizeof(float));
-					const int32_t hsum = *reinterpret_cast<const int32_t*>(outputPtr + header.dim + sizeof(float));
-					acc -= hsum;
-					ll = acc * contextScale * outputScale + contextBias;
+						const int32_t hsum = *reinterpret_cast<const int32_t*>(outputPtr + header.dim + sizeof(float));
+						acc -= hsum;
+						ll = acc * contextScale * outputScale + contextBias;
+					}
 					if (outputEmbBiasPtr) ll += outputEmbBiasPtr[next];
 				}
 				else
@@ -2474,11 +2502,24 @@ namespace kiwi
 
 			if constexpr (quantized)
 			{
-				qgemm::gemvU8U8<arch>(
-					header.contextSize, header.dim,
-					getContextQuantEmb(contextId),
-					getContextQuantEmb(0), contextEmbStride(),
-					scores);
+				if constexpr (arch == ArchType::neon)
+				{
+					qgemm::gemvS8S8<arch>(
+						header.contextSize, header.dim,
+						getContextQuantEmbS8(contextId),
+						getContextQuantEmbS8(0), contextEmbStride(),
+						scores
+					);
+				}
+				else
+				{
+					qgemm::gemvU8U8<arch>(
+						header.contextSize, header.dim,
+						getContextQuantEmb(contextId),
+						getContextQuantEmb(0), contextEmbStride(),
+						scores
+					);
+				}
 			}
 			else
 			{
@@ -2525,10 +2566,20 @@ namespace kiwi
 			float result = 0;
 			if constexpr (quantized)
 			{
-				result = qgemm::dotU8U8<arch>(
-					header.dim,
-					getContextQuantEmb(contextId1), getContextQuantEmb(contextId2)
-				);
+				if constexpr (arch == ArchType::neon)
+				{
+					result = qgemm::dotS8S8<arch>(
+						header.dim,
+						getContextQuantEmbS8(contextId1), getContextQuantEmbS8(contextId2)
+					);
+				}
+				else
+				{
+					result = qgemm::dotU8U8<arch>(
+						header.dim,
+						getContextQuantEmb(contextId1), getContextQuantEmb(contextId2)
+					);
+				}
 			}
 			else
 			{
@@ -2554,12 +2605,24 @@ namespace kiwi
 			float* scores = resultBuf.data() + header.vocabSize;
 			if constexpr (quantized)
 			{
-				qgemm::gemv<arch>(
-					header.vocabSize, header.dim,
-					getContextQuantEmb(contextId),
-					getOutputQuantEmb(0), outputEmbStride(),
-					scores
-				);
+				if constexpr (arch == ArchType::neon)
+				{
+					qgemm::gemvS8S8<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmbS8(contextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						scores
+					);
+				}
+				else
+				{
+					qgemm::gemv<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmb(contextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						scores
+					);
+				}
 			}
 			else
 			{
@@ -2606,18 +2669,36 @@ namespace kiwi
 			float* scores = resultBuf.data() + header.vocabSize;
 			if constexpr (quantized)
 			{
-				qgemm::gemv<arch>(
-					header.vocabSize, header.dim,
-					getContextQuantEmb(bgContextId),
-					getOutputQuantEmb(0), outputEmbStride(),
-					resultBuf.data()
-				);
-				qgemm::gemv<arch>(
-					header.vocabSize, header.dim,
-					getContextQuantEmb(contextId),
-					getOutputQuantEmb(0), outputEmbStride(),
-					scores
-				);
+				if constexpr (arch == ArchType::neon)
+				{
+					qgemm::gemvS8S8<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmbS8(bgContextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						resultBuf.data()
+					);
+					qgemm::gemvS8S8<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmbS8(contextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						scores
+					);
+				}
+				else
+				{
+					qgemm::gemv<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmb(bgContextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						resultBuf.data()
+					);
+					qgemm::gemv<arch>(
+						header.vocabSize, header.dim,
+						getContextQuantEmb(contextId),
+						getOutputQuantEmb(0), outputEmbStride(),
+						scores
+					);
+				}
 			}
 			else
 			{
diff --git a/src/CoNgramModel.hpp b/src/CoNgramModel.hpp
@@ -51,7 +51,7 @@ namespace kiwi
 			const uint8_t* alignedKeyValueData = nullptr;
 			std::unique_ptr<int32_t[]> allRootValueData;
 			std::unique_ptr<uint8_t[]> allEmbs;
-			const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)]
+			const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)] (quantized NEON: dim stores S8 values)
 			const uint8_t* outputEmbPtr = nullptr; // [numOutputs, (dim + scale? + sum?)]
 			const uint8_t* distantEmbPtr = nullptr; // [numOutputs, (dim + scale? + bias + confid + pad?)]
 			const float* positionConfidPtr = nullptr;
@@ -109,11 +109,16 @@ namespace kiwi
 				return reinterpret_cast<const float*>(contextEmbPtr + idx * contextEmbStride());
 			}
 
-			inline const uint8_t* getContextQuantEmb(uint32_t idx) const
+			inline const uint8_t* getContextQuantEmb(size_t idx) const
 			{
 				return contextEmbPtr + idx * contextEmbStride();
 			}
 
+			inline const int8_t* getContextQuantEmbS8(size_t idx) const
+			{
+				return reinterpret_cast<const int8_t*>(contextEmbPtr + idx * contextEmbStride());
+			}
+
 			inline float getContextBias(uint32_t idx) const
 			{
 				const size_t offset = quantized ?
diff --git a/src/SIMD.hpp b/src/SIMD.hpp
@@ -896,11 +896,18 @@ namespace kiwi
             static STRONG_INLINE int32_t dotprod(const uint8_t* a, const int8_t* b, size_t size)
             {
                 int32x4_t sum = vdupq_n_s32(0);
-				uint16x8_t pa;
-				int8x16_t pb;
                 for (size_t i = 0; i < size; i += 16)
                 {
-					//
+					uint8x16_t pa = vld1q_u8(a + i);
+					int8x16_t pb = vld1q_s8(b + i);
+					// Extend a (uint8, 0-255) to int16 via zero-extend, b (int8) via sign-extend
+					// Product range: 255*(-128) to 255*127 = [-32640, 32385], fits in int16
+					int16x8_t pa_lo = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(pa)));
+					int16x8_t pa_hi = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(pa)));
+					int16x8_t pb_lo = vmovl_s8(vget_low_s8(pb));
+					int16x8_t pb_hi = vmovl_s8(vget_high_s8(pb));
+					sum = vpadalq_s16(sum, vmulq_s16(pa_lo, pb_lo));
+					sum = vpadalq_s16(sum, vmulq_s16(pa_hi, pb_hi));
                 }
 				sum = vpaddq_s32(sum, sum);
 				sum = vpaddq_s32(sum, sum);
diff --git a/src/archImpl/neon.cpp b/src/archImpl/neon.cpp
diff --git a/src/qgemm.hpp b/src/qgemm.hpp

Original file line number	Diff line number	Diff line change
`@@ -51,7 +51,7 @@ namespace kiwi`
`51`	`51`	`const uint8_t* alignedKeyValueData = nullptr;`
`52`	`52`	`std::unique_ptr<int32_t[]> allRootValueData;`
`53`	`53`	`std::unique_ptr<uint8_t[]> allEmbs;`
`54`		`- const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)]`
	`54`	`+ const uint8_t* contextEmbPtr = nullptr; // [numContexts, (dim + scale? + bias + confid + vts)] (quantized NEON: dim stores S8 values)`
`55`	`55`	`const uint8_t* outputEmbPtr = nullptr; // [numOutputs, (dim + scale? + sum?)]`
`56`	`56`	`const uint8_t* distantEmbPtr = nullptr; // [numOutputs, (dim + scale? + bias + confid + pad?)]`
`57`	`57`	`const float* positionConfidPtr = nullptr;`
`@@ -109,11 +109,16 @@ namespace kiwi`
`109`	`109`	`return reinterpret_cast<const float*>(contextEmbPtr + idx * contextEmbStride());`
`110`	`110`	`}`
`111`	`111`
`112`		`- inline const uint8_t* getContextQuantEmb(uint32_t idx) const`
	`112`	`+ inline const uint8_t* getContextQuantEmb(size_t idx) const`
`113`	`113`	`{`
`114`	`114`	`return contextEmbPtr + idx * contextEmbStride();`
`115`	`115`	`}`
`116`	`116`
	`117`	`+ inline const int8_t* getContextQuantEmbS8(size_t idx) const`
	`118`	`+ {`
	`119`	`+ return reinterpret_cast<const int8_t*>(contextEmbPtr + idx * contextEmbStride());`
	`120`	`+ }`
	`121`	`+`
`117`	`122`	`inline float getContextBias(uint32_t idx) const`
`118`	`123`	`{`
`119`	`124`	`const size_t offset = quantized ?`