diff --git a/.gitignore b/.gitignore index 223c5cd..691ef30 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ picoclaw/ # Internal dev docs picoclaw/PICOLM_INTEGRATION.md +.aider* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ee584b2..0bfb2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ Thanks for your interest in PicoLLM! This project is intentionally small (~2,500 ## What We Need Help With ### High Impact -- **SIMD kernels** — AVX2/AVX-512 for x86, optimized NEON for ARM +- **SIMD kernels** — AVX-512 for x86 server CPUs, optimized NEON for ARM - **New quantization formats** — Q5_K fused dot product, IQ formats - **New model architectures** — Mistral, Phi, Gemma (LLaMA-compatible) - **Platform testing** — RISC-V boards, Pi Zero, exotic ARM SBCs @@ -106,12 +106,20 @@ If you're adding SIMD code: // ARM NEON path (Pi 3/4/5) float32x4_t v = vld1q_f32(ptr); ... +#elif defined(PICOLM_AVX2) + // x86 AVX2 path (Haswell+, Excavator+ — 256-bit integer + float) + __m256i v = _mm256_loadu_si256((const __m256i *)ptr); + ... +#elif defined(PICOLM_AVX) + // x86 AVX path (Sandy Bridge+, Bulldozer+ — 8-wide float) + __m256 v = _mm256_loadu_ps(ptr); + ... #elif defined(PICOLM_SSE2) - // x86 SSE2 path (Intel/AMD) + // x86 SSE2 path (any x86-64 — 4-wide float) __m128 v = _mm_loadu_ps(ptr); ... #endif - // Scalar fallback (always works) + // Scalar fallback (always works — also reachable via `make scalar`) for (int i = 0; i < n; i++) { ... } ``` diff --git a/README.md b/README.md index 1d9ad7b..fb5b81f 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ The model file (638MB) stays on disk. PicoLM **memory-maps** it and streams one | **FP16 KV Cache** | Halves KV cache memory (44MB vs 88MB for 2048 context) | | **Flash Attention** | Online softmax — no O(seq_len) attention buffer needed | | **Pre-computed RoPE** | cos/sin lookup tables eliminate transcendentals from hot loop | -| **SIMD Acceleration** | ARM NEON (Pi 3/4/5) and x86 SSE2 (Intel/AMD) auto-detected | +| **SIMD Acceleration** | ARM NEON (Pi 3/4/5), x86 SSE2/SSE3/AVX/AVX2 — auto-detected at compile time | | **Fused Dot Products** | Dequantize + dot-product in one pass — no intermediate buffer | | **Multi-threaded matmul** | Parallel matrix-vector multiply across CPU cores | | **Grammar-Constrained JSON** | `--json` flag forces valid JSON output (for tool calling) | @@ -234,7 +234,10 @@ make model ```cmd cd picolm -build.bat +build.bat :: SSE2 baseline (any x86-64) +build.bat avx2 :: AVX2 (Haswell+ / Excavator+, fastest) +build.bat avx :: AVX (Sandy Bridge+ / Bulldozer+) +build.bat scalar :: no SIMD (portable fallback) picolm.exe model.gguf -p "Hello world" -n 50 ``` @@ -242,6 +245,12 @@ picolm.exe model.gguf -p "Hello world" -n 50 ```bash make native # x86/ARM auto-detect (recommended for local machine) +make x86 # x86-64 safe default (SSE2 only — runs on any x86-64) +make sse2 # x86-64 SSE2 only (same as x86) +make sse3 # x86-64 SSE2+SSE3+SSSE3 (AMD Phenom/Athlon, older Intel) +make avx # x86-64 AVX (Sandy Bridge+, Bulldozer+ — wider SIMD, faster) +make avx2 # x86-64 AVX2 (Haswell+, Excavator+ — widest SIMD, fastest) +make scalar # No SIMD (portable scalar fallback, any architecture) make pi # Raspberry Pi 3/4/5 (64-bit ARM + NEON SIMD) make pi-arm32 # Pi Zero / Pi 1 (32-bit ARM) make cross-pi # Cross-compile for Pi from x86 (static binary) @@ -348,7 +357,7 @@ Measured on TinyLlama 1.1B Q4_K_M (638 MB model): + FP16 KV cache █████████████████░░░ (halve memory bandwidth) + Pre-computed RoPE 
██████████████████░░ (no sin/cos in hot loop) + Flash attention ██████████████████░░ (no O(n) attention alloc) - + NEON/SSE2 SIMD ███████████████████░ (4-wide vector ops) + + NEON/SSE2/AVX SIMD ███████████████████░ (4-wide to 8-wide vector ops) + KV cache persistence ████████████████████ (skip prefill entirely) ``` @@ -477,9 +486,14 @@ PicoLM implements 9 optimizations that brought generation speed from **1.6 tok/s 4-wide float vector operations for all hot paths. Example: dequantizing Q4_K nibbles with `vmovl_u8` → `vmovl_u16` → `vcvtq_f32_u32`, and RoPE with interleaved `vld2q_f32` / `vst2q_f32`. -### 2. x86 SSE2 SIMD +### 2. x86 SIMD (SSE2 / SSE3 / AVX / AVX2) -Auto-detected on Intel/AMD. 4-wide `__m128` operations for dot products, RMSNorm, and vector operations. +Four compile-time tiers for Intel/AMD: + +- **SSE2** (`make sse2` or `make x86`): 4-wide `__m128` operations for dot products, RMSNorm, softmax, RoPE, and element-wise ops. Safe baseline for all x86-64 CPUs. +- **SSE3** (`make sse3`): adds `_mm_addsub_ps` for a cleaner RoPE rotation kernel (no sign-mask workaround needed). +- **AVX** (`make avx`): 8-wide `__m256` float accumulators for all ops. Q4_K and Q6_K dot products widen the float accumulation stage while keeping integer nibble extraction at 128-bit (no AVX2 required). RoPE processes 4 complex pairs per iteration with `_mm256_addsub_ps`. +- **AVX2** (`make avx2`): adds 256-bit integer operations. Q4_K nibble extraction uses `_mm256_cvtepu8_epi32` (8 nibbles → 8 floats in 2 ops vs. 4-step unpack chain). Q6_K weight extraction uses `_mm256_cvtepi8_epi32` (8 int8 → 8 floats in 2 ops vs. 4-instruction macro chain). Targets Haswell+ Intel and Excavator+ AMD. ### 3. FP16 KV Cache @@ -636,7 +650,7 @@ A: llama.cpp is excellent but requires ~200MB+ for the runtime on small models, A: TinyLlama 1.1B is a small model — it handles simple tasks (Q&A, summarization, basic reasoning, JSON generation) well. It won't match GPT-4, but it runs on a $10 board with no internet. For structured output, the `--json` grammar mode guarantees valid JSON regardless of model quality. **Q: What about GPU acceleration?** -A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2) provides meaningful speedup. +A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2/AVX) provides meaningful speedup. **Q: Can I use a different model?** A: Any LLaMA-architecture GGUF model works.
Download from [HuggingFace](https://huggingface.co/models?search=gguf) and point PicoLM at it. Recommended quantizations: Q4_K_M (best quality/size balance) or Q2_K (smallest, lower quality). ## Roadmap -- [ ] AVX2/AVX-512 kernels for x86 (2-4x generation speed on modern CPUs) +- [x] AVX kernels for x86 (`make avx` — 8-wide float ops, ~2x vs SSE2) +- [x] AVX2 kernels for x86 (`make avx2` — 256-bit integer ops for Q4_K and Q6_K quantized paths) +- [ ] AVX-512 kernels for x86 (512-bit ops for server CPUs) - [ ] Speculative decoding with a draft model - [ ] Context sliding window (infinite generation beyond max_seq_len) - [ ] Weight pruning for further memory reduction diff --git a/picolm/Makefile b/picolm/Makefile index 4fd3c7a..6b1cd43 100644 --- a/picolm/Makefile +++ b/picolm/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -O2 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic +CFLAGS = -O3 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic LDFLAGS = -lm -lpthread SRCS = picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c TARGET = picolm @@ -11,11 +11,35 @@ MODEL_DIR ?= /opt/picolm/models native: CFLAGS += -march=native native: $(TARGET) +# --- x86-64 default (SSE2 only, safe for all x86-64) --- +x86: sse2 + +# --- No SIMD (scalar fallback, portable to any architecture) --- +scalar: CFLAGS += -mno-sse2 -mno-avx +scalar: $(TARGET) + +# --- x86-64 with SSE2 only --- +sse2: CFLAGS += -msse2 +sse2: $(TARGET) + +# --- x86-64 with SSE2+SSE3+SSSE3 (covers AMD Phenom/Athlon and similar without AVX) --- +sse3: CFLAGS += -msse2 -msse3 -mssse3 -mpopcnt +sse3: $(TARGET) + +# --- x86-64 with AVX (Sandy Bridge and newer Intel; Bulldozer and newer AMD — these lack FMA3, so no -mfma here) --- +avx: CFLAGS += -mavx -mpopcnt +avx: $(TARGET) + +# --- x86-64 with AVX2 (Haswell and newer Intel; Excavator and newer AMD) --- +avx2: CFLAGS += -mavx2 -mfma -mpopcnt +avx2: $(TARGET) + $(TARGET): $(SRCS) $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) # --- Static build for single-binary deployment --- -static: CFLAGS += -march=native +# Uses SSE2 (not -march=native) so the binary runs on any x86-64, not just the build machine. +static: CFLAGS += -msse2 static: LDFLAGS += -static static: $(TARGET) @@ -70,4 +94,4 @@ model: clean: rm -f $(TARGET) $(TARGET).exe *.obj *.o -.PHONY: native static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean +.PHONY: native x86 scalar sse2 sse3 avx avx2 static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean diff --git a/picolm/build.bat b/picolm/build.bat index 3f65e6c..2df2026 100644 --- a/picolm/build.bat +++ b/picolm/build.bat @@ -1,7 +1,26 @@ @echo off +REM PicoLM Windows build script (MSVC) +REM +REM SIMD targets: +REM build.bat -- SSE2 baseline (safe for any x86-64) +REM build.bat avx2 -- AVX2 (Haswell+ / Excavator+, fastest) +REM build.bat avx -- AVX (Sandy Bridge+ / Bulldozer+) +REM build.bat scalar -- no SIMD (portable scalar fallback) + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 -echo Compiling...
-cl /O2 /W3 /Fe:picolm.exe picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c + +set SIMD_FLAG= +if /I "%1"=="avx2" set SIMD_FLAG=/arch:AVX2 +if /I "%1"=="avx" set SIMD_FLAG=/arch:AVX +if /I "%1"=="scalar" set SIMD_FLAG=/d2archSSE42- + +if "%SIMD_FLAG%"=="" ( + echo Building: SSE2 baseline +) else ( + echo Building: %1 ^(%SIMD_FLAG%^) +) + +cl /O2 /W3 %SIMD_FLAG% /Fe:picolm.exe picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c if %ERRORLEVEL% neq 0 ( echo BUILD FAILED ) else ( diff --git a/picolm/model.c b/picolm/model.c index 4b4040d..51bfbc5 100644 --- a/picolm/model.c +++ b/picolm/model.c @@ -406,7 +406,6 @@ static int parse_gguf(model_t *m, int max_seq_len) { fprintf(stderr, " n_layers=%d, vocab_size=%d, max_seq=%d\n", cfg->n_layers, cfg->vocab_size, cfg->max_seq_len); fprintf(stderr, " head_dim=%d, rope_base=%.1f\n", cfg->head_dim, cfg->rope_freq_base); - free(tinfos); return 0; } diff --git a/picolm/picolm.c b/picolm/picolm.c index c2f1624..f9c713d 100644 --- a/picolm/picolm.c +++ b/picolm/picolm.c @@ -131,6 +131,21 @@ int main(int argc, char **argv) { /* Load model */ fprintf(stderr, "Loading model: %s\n", model_path); + fprintf(stderr, "SIMD: %s\n", +#if defined(PICOLM_AVX2) + "AVX2" +#elif defined(PICOLM_AVX) + "AVX" +#elif defined(PICOLM_SSE3) + "SSE3" +#elif defined(PICOLM_SSE2) + "SSE2" +#elif defined(PICOLM_NEON) + "NEON" +#else + "scalar" +#endif + ); model_t model; if (model_load(&model, model_path, context_override) != 0) { fprintf(stderr, "Failed to load model\n"); diff --git a/picolm/quant.c b/picolm/quant.c index 5a92998..cdea2c6 100644 --- a/picolm/quant.c +++ b/picolm/quant.c @@ -352,6 +352,21 @@ float vec_dot_f32_f32(const void *src, const float *x, int n) { for (; i < n; i++) sum += w[i] * x[i]; return sum; +#elif defined(PICOLM_AVX) + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + int i = 0; + for (; i + 15 < n; i += 16) { + acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i), _mm256_loadu_ps(x + i))); + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_loadu_ps(w + i + 8), _mm256_loadu_ps(x + i + 8))); + } + /* pick up a trailing group of 8 (common: hidden sizes are multiples of 8 not 16) */ + for (; i + 7 < n; i += 8) + acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i), _mm256_loadu_ps(x + i))); + float sum = hsum_avx(_mm256_add_ps(acc0, acc1)); + for (; i < n; i++) sum += w[i] * x[i]; + return sum; + #elif defined(PICOLM_SSE2) __m128 acc0 = _mm_setzero_ps(); __m128 acc1 = _mm_setzero_ps(); @@ -440,6 +455,113 @@ float vec_dot_q4_K_f32(const void *src, const float *x, int n) { float sum_x1 = vaddvq_f32_compat(sum_x1_v); float sum_qx2 = vaddvq_f32_compat(sum_qx2_v); float sum_x2 = vaddvq_f32_compat(sum_x2_v); +#elif defined(PICOLM_AVX2) + /* AVX2: 256-bit integer ops allow zero-extending 8 uint8 nibbles + * to 8 int32 in one _mm256_cvtepu8_epi32 instruction, then a + * single _mm256_cvtepi32_ps — no multi-step unpack chain needed. 
*/ + __m256 sum_qx1_v = _mm256_setzero_ps(); + __m256 sum_x1_v = _mm256_setzero_ps(); + __m256 sum_qx2_v = _mm256_setzero_ps(); + __m256 sum_x2_v = _mm256_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + + for (int l = 0; l < 32; l += 8) { + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + /* AVX2: zero-extend 8 uint8 → 8 int32 → 8 float in 2 ops */ + __m256 qf_lo = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(lo8)); + __m256 qf_hi = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(hi8)); + + __m256 xv_lo = _mm256_loadu_ps(xp + l); + __m256 xv_hi = _mm256_loadu_ps(xp + l + 32); + + sum_qx1_v = _mm256_add_ps(sum_qx1_v, _mm256_mul_ps(qf_lo, xv_lo)); + sum_x1_v = _mm256_add_ps(sum_x1_v, xv_lo); + sum_qx2_v = _mm256_add_ps(sum_qx2_v, _mm256_mul_ps(qf_hi, xv_hi)); + sum_x2_v = _mm256_add_ps(sum_x2_v, xv_hi); + } + float sum_qx1 = hsum_avx(sum_qx1_v); + float sum_x1 = hsum_avx(sum_x1_v); + float sum_qx2 = hsum_avx(sum_qx2_v); + float sum_x2 = hsum_avx(sum_x2_v); +#elif defined(PICOLM_AVX) + /* AVX: 128-bit nibble extraction (no AVX2 int needed), 256-bit float accumulators */ + __m256 sum_qx1_v = _mm256_setzero_ps(); + __m256 sum_x1_v = _mm256_setzero_ps(); + __m256 sum_qx2_v = _mm256_setzero_ps(); + __m256 sum_x2_v = _mm256_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + + /* Combine two __m128 → one __m256 of 8 floats */ + __m256 qf_lo = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i))); + __m256 qf_hi = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i))); + + __m256 xv_lo = _mm256_loadu_ps(xp + l); + __m256 xv_hi = _mm256_loadu_ps(xp + l + 32); + + sum_qx1_v = _mm256_add_ps(sum_qx1_v, _mm256_mul_ps(qf_lo, xv_lo)); + sum_x1_v = _mm256_add_ps(sum_x1_v, xv_lo); + sum_qx2_v = _mm256_add_ps(sum_qx2_v, _mm256_mul_ps(qf_hi, xv_hi)); + sum_x2_v = _mm256_add_ps(sum_x2_v, xv_hi); + } + float sum_qx1 = hsum_avx(sum_qx1_v); + float sum_x1 = hsum_avx(sum_x1_v); + float sum_qx2 = hsum_avx(sum_qx2_v); + float sum_x2 = hsum_avx(sum_x2_v); +#elif defined(PICOLM_SSE2) + /* SSE2: lo nibble → group1 (xp+l), hi nibble → group2 (xp+l+32) */ + __m128 sum_qx1_v = _mm_setzero_ps(); + __m128 sum_x1_v = _mm_setzero_ps(); + __m128 sum_qx2_v = _mm_setzero_ps(); + __m128 sum_x2_v = _mm_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + /* Load 8 quantized bytes -> 8 lo + 8 hi nibbles */ + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + /* Widen uint8 nibbles -> int32 -> float (8 values each) */ + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + __m128 qf_lo0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i)); + __m128 qf_lo1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)); + __m128 qf_hi0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i)); + __m128 qf_hi1 = 
_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)); + + __m128 xv_lo0 = _mm_loadu_ps(xp + l); + __m128 xv_lo1 = _mm_loadu_ps(xp + l + 4); + __m128 xv_hi0 = _mm_loadu_ps(xp + l + 32); + __m128 xv_hi1 = _mm_loadu_ps(xp + l + 36); + + sum_qx1_v = _mm_add_ps(sum_qx1_v, + _mm_add_ps(_mm_mul_ps(qf_lo0, xv_lo0), _mm_mul_ps(qf_lo1, xv_lo1))); + sum_x1_v = _mm_add_ps(sum_x1_v, _mm_add_ps(xv_lo0, xv_lo1)); + sum_qx2_v = _mm_add_ps(sum_qx2_v, + _mm_add_ps(_mm_mul_ps(qf_hi0, xv_hi0), _mm_mul_ps(qf_hi1, xv_hi1))); + sum_x2_v = _mm_add_ps(sum_x2_v, _mm_add_ps(xv_hi0, xv_hi1)); + } + float sum_qx1 = hsum_sse(sum_qx1_v); + float sum_x1 = hsum_sse(sum_x1_v); + float sum_qx2 = hsum_sse(sum_qx2_v); + float sum_x2 = hsum_sse(sum_x2_v); #else float sum_qx1 = 0.0f, sum_x1 = 0.0f; float sum_qx2 = 0.0f, sum_x2 = 0.0f; @@ -476,9 +598,199 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { const int8_t *sc = blocks[i].scales; const float *xp = x + i * 256; - /* Accumulate per-scale-group sums: 16 groups of 16 elements each */ float sums[16] = {0}; +/* sign-extend packed int8 → two __m128 floats; used by AVX and SSE2 paths. + * Idiom: unpacklo_epi8(zero, x) places each byte in the HIGH byte of a 16-bit + * lane; srai_epi16(..., 8) then arithmetic-shifts it down, propagating the sign + * bit — equivalent to a sign-extending byte→int16 widening without SSE4.1. */ +#if defined(PICOLM_AVX) || defined(PICOLM_SSE2) +#define Q6K_CONV(qi8, fa, fb) do { \ + __m128i w16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero_i, qi8), 8); \ + fa = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero_i, w16), 16)); \ + fb = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero_i, w16), 16)); \ +} while (0) +#endif + +#ifdef PICOLM_AVX2 + /* AVX2: _mm256_cvtepi8_epi32 replaces the 4-op Q6K_CONV sign-extension chain */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + /* AVX2: sign-extend 8 int8 → 8 int32 → 8 float in 2 ops */ + __m256 qf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q1_i8)); + __m256 qf2 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q2_i8)); + __m256 qf3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q3_i8)); + __m256 qf4 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q4_i8)); + + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(qf1, _mm256_loadu_ps(xp_c + l))); + acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(qf2, _mm256_loadu_ps(xp_c + l + 32))); + acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(qf3, _mm256_loadu_ps(xp_c + l + 64))); + acc4 = _mm256_add_ps(acc4, _mm256_mul_ps(qf4, _mm256_loadu_ps(xp_c + l + 96))); + } + sums[sidx + 0] += hsum_avx(acc1); + sums[sidx + 2] += hsum_avx(acc2); + sums[sidx + 4] += hsum_avx(acc3); + sums[sidx + 6] += hsum_avx(acc4); + } + } +#elif defined(PICOLM_AVX) + /* AVX: 128-bit integer extraction, 256-bit float accumulators */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_set_m128(qf1b, qf1a), _mm256_loadu_ps(xp_c + l))); + acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_set_m128(qf2b, qf2a), _mm256_loadu_ps(xp_c + l + 32))); + acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_set_m128(qf3b, qf3a), _mm256_loadu_ps(xp_c + l + 64))); + acc4 = _mm256_add_ps(acc4, _mm256_mul_ps(_mm256_set_m128(qf4b, qf4a), _mm256_loadu_ps(xp_c + l + 96))); + } + sums[sidx + 0] += hsum_avx(acc1); + sums[sidx + 2] += hsum_avx(acc2); + sums[sidx + 4] += hsum_avx(acc3); + sums[sidx + 6] += hsum_avx(acc4); + } + } +#elif defined(PICOLM_SSE2) + /* SSE2: 6-bit values = lo4(ql) | hi2(qh)<<4, biased by 32 */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 
32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { /* half=0 -> sums[+0,2,4,6], half=1 -> [+1,3,5,7] */ + int l0 = half * 16; + int sidx = is + half; + __m128 acc1a = _mm_setzero_ps(), acc1b = _mm_setzero_ps(); + __m128 acc2a = _mm_setzero_ps(), acc2b = _mm_setzero_ps(); + __m128 acc3a = _mm_setzero_ps(), acc3b = _mm_setzero_ps(); + __m128 acc4a = _mm_setzero_ps(), acc4b = _mm_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + /* epi16 shifts on qh: avoids byte-lane bleed from epi8 shifts */ + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + acc1a = _mm_add_ps(acc1a, _mm_mul_ps(qf1a, _mm_loadu_ps(xp_c + l))); + acc1b = _mm_add_ps(acc1b, _mm_mul_ps(qf1b, _mm_loadu_ps(xp_c + l + 4))); + acc2a = _mm_add_ps(acc2a, _mm_mul_ps(qf2a, _mm_loadu_ps(xp_c + l + 32))); + acc2b = _mm_add_ps(acc2b, _mm_mul_ps(qf2b, _mm_loadu_ps(xp_c + l + 36))); + acc3a = _mm_add_ps(acc3a, _mm_mul_ps(qf3a, _mm_loadu_ps(xp_c + l + 64))); + acc3b = _mm_add_ps(acc3b, _mm_mul_ps(qf3b, _mm_loadu_ps(xp_c + l + 68))); + acc4a = _mm_add_ps(acc4a, _mm_mul_ps(qf4a, _mm_loadu_ps(xp_c + l + 96))); + acc4b = _mm_add_ps(acc4b, _mm_mul_ps(qf4b, _mm_loadu_ps(xp_c + l + 100))); + } + sums[sidx + 0] += hsum_sse(_mm_add_ps(acc1a, acc1b)); + sums[sidx + 2] += hsum_sse(_mm_add_ps(acc2a, acc2b)); + sums[sidx + 4] += hsum_sse(_mm_add_ps(acc3a, acc3b)); + sums[sidx + 6] += hsum_sse(_mm_add_ps(acc4a, acc4b)); + } + } +#else for (int chunk = 0; chunk < 2; chunk++) { int is = chunk * 8; const uint8_t *ql_c = ql + chunk * 64; @@ -506,6 +818,9 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { sums[is + 7] += (float)q4 * xp_c[l + 96]; } } +#endif + +#undef Q6K_CONV for (int j = 0; j < 16; j++) { sumf += d * (float)sc[j] * sums[j]; diff --git a/picolm/quant.h b/picolm/quant.h index e35095c..3e8118a 100644 --- a/picolm/quant.h +++ b/picolm/quant.h @@ -4,23 +4,61 @@ #include #include -/* ---- SIMD detection ---- */ +/* ---- SIMD detection ---- + * + * Each level explicitly implies all lower levels so that code only needs to + * check a single flag. The order of checks (highest first) lets each block + * upgrade flags that a lower-level check would otherwise miss if the compiler + * only predefines the highest applicable macro. 
+ * + * Hierarchy (x86): + * PICOLM_SSE2 ⊂ PICOLM_SSE3 ⊂ PICOLM_AVX ⊂ PICOLM_AVX2 + * + * ARM: + * PICOLM_NEON (independent) + */ + +/* --- ARM NEON --- */ #if defined(__ARM_NEON) || defined(__ARM_NEON__) -#define PICOLM_NEON 1 -#include <arm_neon.h> +# define PICOLM_NEON 1 +# include <arm_neon.h> static inline float vaddvq_f32_compat(float32x4_t v) { -#if defined(__aarch64__) +# if defined(__aarch64__) return vaddvq_f32(v); -#else +# else float32x2_t r = vadd_f32(vget_low_f32(v), vget_high_f32(v)); return vget_lane_f32(vpadd_f32(r, r), 0); -#endif +# endif } #endif -#if defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64))) -#define PICOLM_SSE2 1 -#include +/* --- x86 SIMD: detect highest level, then propagate downward --- */ + +/* AVX2 implies AVX + SSE3 + SSE2 */ +#if defined(__AVX2__) +# define PICOLM_AVX2 1 +# define PICOLM_AVX 1 +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* AVX implies SSE3 + SSE2 */ +#elif defined(__AVX__) +# define PICOLM_AVX 1 +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* SSE3 implies SSE2 */ +#elif defined(__SSE3__) +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* SSE2 baseline (also the default for all x86-64 targets) */ +#elif defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64))) +# define PICOLM_SSE2 1 +#endif + +/* Include x86 SIMD header once for any x86 SIMD level. + * <immintrin.h> is an umbrella header that exposes all intrinsics + * available for the current -march target. */ +#ifdef PICOLM_SSE2 +# include <immintrin.h> static inline float hsum_sse(__m128 v) { __m128 shuf = _mm_movehl_ps(v, v); __m128 sum = _mm_add_ps(v, shuf); @@ -30,6 +68,14 @@ static inline float hsum_sse(__m128 v) { } #endif +#ifdef PICOLM_AVX +static inline float hsum_avx(__m256 v) { + __m128 lo = _mm256_castps256_ps128(v); + __m128 hi = _mm256_extractf128_ps(v, 1); + return hsum_sse(_mm_add_ps(lo, hi)); +} +#endif + /* GGUF tensor data types */ typedef enum { GGUF_TYPE_F32 = 0, diff --git a/picolm/tensor.c b/picolm/tensor.c index 59a68e6..a12db9d 100644 --- a/picolm/tensor.c +++ b/picolm/tensor.c @@ -137,6 +137,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { } ss = vaddvq_f32_compat(acc); for (; i < size; i++) ss += x[i] * x[i]; +#elif defined(PICOLM_AVX) + __m256 acc = _mm256_setzero_ps(); + int i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + acc = _mm256_add_ps(acc, _mm256_mul_ps(v, v)); + } + ss = hsum_avx(acc); + for (; i < size; i++) ss += x[i] * x[i]; #elif defined(PICOLM_SSE2) __m128 acc = _mm_setzero_ps(); int i = 0; @@ -161,6 +170,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { vst1q_f32(out + i, vmulq_f32(vmulq_f32(v, scale), w)); } for (; i < size; i++) out[i] = x[i] * ss * weight[i]; +#elif defined(PICOLM_AVX) + __m256 scale = _mm256_set1_ps(ss); + i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + __m256 w = _mm256_loadu_ps(weight + i); + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_mul_ps(v, scale), w)); + } + for (; i < size; i++) out[i] = x[i] * ss * weight[i]; #elif defined(PICOLM_SSE2) __m128 scale = _mm_set1_ps(ss); i = 0; @@ -194,6 +212,13 @@ void softmax(float *x, int size) { vst1q_f32(x + i, vmulq_f32(vld1q_f32(x + i), inv_v)); } for (; i < size; i++) x[i] *= inv; +#elif defined(PICOLM_AVX) + __m256 inv_v = _mm256_set1_ps(inv); + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(x + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), inv_v)); + } + for (; i < size; i++) x[i] *= inv; #elif defined(PICOLM_SSE2) __m128
inv_v = _mm_set1_ps(inv); int i = 0; @@ -206,6 +231,64 @@ void softmax(float *x, int size) { #endif } +/* AVX RoPE: 4 complex pairs/iter; addsub handles r*cos-i*sin / r*sin+i*cos in one op */ +#ifdef PICOLM_AVX +static void rope_avx(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; + for (; i + 3 < half; i += 4) { + __m256 v = _mm256_loadu_ps(h + i * 2); + __m128 c4 = _mm_loadu_ps(cos_pos + i); + __m128 s4 = _mm_loadu_ps(sin_pos + i); + __m256 cv = _mm256_set_m128(_mm_unpackhi_ps(c4, c4), _mm_unpacklo_ps(c4, c4)); + __m256 sv = _mm256_set_m128(_mm_unpackhi_ps(s4, s4), _mm_unpacklo_ps(s4, s4)); + __m256 sw = _mm256_permute_ps(v, 0xB1); /* swap r,i within each pair */ + _mm256_storeu_ps(h + i * 2, + _mm256_addsub_ps(_mm256_mul_ps(v, cv), _mm256_mul_ps(sw, sv))); + } + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + +/* SSE2/SSE3 RoPE: 2 pairs/iter; SSE3 uses addsub, SSE2 uses sign-mask to negate even lanes */ +#if defined(PICOLM_SSE2) && !defined(PICOLM_AVX) +static void rope_sse(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; +#ifdef PICOLM_SSE3 + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + _mm_storeu_ps(h + i * 2, _mm_addsub_ps(_mm_mul_ps(v, cv), _mm_mul_ps(sw, sv))); + } +#else + const __m128 sign = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + __m128 a = _mm_mul_ps(v, cv); + __m128 b = _mm_mul_ps(_mm_mul_ps(sign, sw), sv); + _mm_storeu_ps(h + i * 2, _mm_add_ps(a, b)); + } +#endif + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + /* Rotary position encoding using pre-computed cos/sin tables */ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, const float *cos_pos, const float *sin_pos) { @@ -233,6 +316,10 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, qh[i * 2] = q0 * cos_pos[i] - q1 * sin_pos[i]; qh[i * 2 + 1] = q0 * sin_pos[i] + q1 * cos_pos[i]; } +#elif defined(PICOLM_AVX) + rope_avx(qh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(qh, half, cos_pos, sin_pos); #else for (int i = 0; i < half; i++) { float q0 = qh[i * 2]; @@ -246,12 +333,18 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, /* Apply RoPE to all KV heads */ for (int h = 0; h < n_kv_heads; h++) { float *kh = k + h * head_dim; +#ifdef PICOLM_AVX + rope_avx(kh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(kh, half, cos_pos, sin_pos); +#else for (int i = 0; i < half; i++) { float k0 = kh[i * 2]; float k1 = kh[i * 2 
+ 1]; kh[i * 2] = k0 * cos_pos[i] - k1 * sin_pos[i]; kh[i * 2 + 1] = k0 * sin_pos[i] + k1 * cos_pos[i]; } +#endif } } @@ -268,6 +361,12 @@ void elemwise_mul(float *out, const float *a, const float *b, int size) { vst1q_f32(out + i, vmulq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) out[i] = a[i] * b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) out[i] = a[i] * b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) { @@ -286,6 +385,12 @@ void vec_add(float *a, const float *b, int size) { vst1q_f32(a + i, vaddq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) a[i] += b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(a + i, _mm256_add_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) a[i] += b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) {
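
The AVX additions in this patch lean repeatedly on one pattern: accumulate in wide `__m256` registers, then reduce with `hsum_avx`. The sketch below is not part of the patch — it is a standalone example, assuming only an AVX-capable x86-64 host and `gcc -O2 -mavx`, that shows the same dual-accumulator dot product and horizontal-sum reduction next to a plain scalar reference:

```c
/* Standalone sketch (not from the PicoLM sources): the dual-accumulator AVX
 * dot product + horizontal-sum pattern used by the patch, checked against a
 * scalar loop.  Build: gcc -O2 -mavx sketch_dot.c */
#include <immintrin.h>
#include <stdio.h>

static float hsum_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s  = _mm_add_ps(lo, hi);              /* 8 lanes -> 4 */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));      /* 4 lanes -> 2 */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));  /* 2 lanes -> 1 */
    return _mm_cvtss_f32(s);
}

static float dot_avx(const float *w, const float *x, int n) {
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    int i = 0;
    /* two independent accumulators hide the add latency */
    for (; i + 15 < n; i += 16) {
        acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i),     _mm256_loadu_ps(x + i)));
        acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_loadu_ps(w + i + 8), _mm256_loadu_ps(x + i + 8)));
    }
    float sum = hsum_avx(_mm256_add_ps(acc0, acc1));
    for (; i < n; i++) sum += w[i] * x[i];       /* scalar tail */
    return sum;
}

int main(void) {
    enum { N = 1003 };                            /* odd length exercises the tail */
    float w[N], x[N], ref = 0.0f;
    for (int i = 0; i < N; i++) {
        w[i] = (float)(i % 7) * 0.25f;
        x[i] = (float)(i % 5) - 2.0f;
        ref += w[i] * x[i];
    }
    printf("avx = %f   scalar = %f\n", dot_avx(w, x, N), ref);
    return 0;
}
```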
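The `Q6K_CONV` macro depends on a pre-SSE4.1 sign-extension idiom: unpack each byte into the high half of a wider lane, then arithmetic-shift it back down so the sign bit propagates. The following sketch isolates just that idiom (standalone and hypothetical, not lifted from quant.c) and checks it against plain `(float)(int8_t)` casts; it needs only SSE2 (`gcc -O2 -msse2`):

```c
/* Standalone sketch (not from the PicoLM sources): the SSE2 sign-extension
 * idiom behind Q6K_CONV.  Build: gcc -O2 -msse2 sketch_sext.c */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int8_t q[8] = { -32, -1, 0, 1, 17, -20, 31, -32 };
    __m128i zero = _mm_setzero_si128();
    __m128i qi8  = _mm_loadl_epi64((const __m128i *)q);   /* 8 packed int8 */

    /* each byte lands in the HIGH byte of a 16-bit lane; srai pulls it down
     * with sign propagation -> sign-extended int16 without SSE4.1 */
    __m128i w16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, qi8), 8);
    /* same trick again: int16 into the high half of a 32-bit lane */
    __m128 fa = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, w16), 16));
    __m128 fb = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, w16), 16));

    float out[8];
    _mm_storeu_ps(out, fa);
    _mm_storeu_ps(out + 4, fb);
    for (int i = 0; i < 8; i++)
        printf("%d: simd=%6.1f  scalar=%6.1f\n", i, out[i], (float)q[i]);
    return 0;
}
```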
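The RoPE kernels (`rope_sse`, `rope_avx`) fold the complex rotation into a single `addsub`: even lanes compute `r*cos - i*sin`, odd lanes compute `i*cos + r*sin`. A minimal SSE3 sketch of one such step, with the scalar rotation printed alongside for comparison (again a standalone example, not taken from tensor.c; build with `gcc -O2 -msse3`):

```c
/* Standalone sketch (not from the PicoLM sources): addsub-based RoPE rotation
 * for two (real, imag) pairs.  Build: gcc -O2 -msse3 sketch_rope.c */
#include <pmmintrin.h>   /* SSE3: _mm_addsub_ps */
#include <stdio.h>

int main(void) {
    float h[4]    = { 1.0f, 2.0f, -0.5f, 0.25f };  /* r0 i0 r1 i1 */
    float cosv[2] = { 0.8f, 0.6f };
    float sinv[2] = { 0.6f, 0.8f };

    __m128 v  = _mm_loadu_ps(h);                                  /* r0 i0 r1 i1 */
    __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cosv), _mm_load_ss(cosv + 1));
    __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sinv), _mm_load_ss(sinv + 1));
    __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1, 1, 0, 0));  /* c0 c0 c1 c1 */
    __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1, 1, 0, 0));  /* s0 s0 s1 s1 */
    __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));    /* i0 r0 i1 r1 */
    /* even lanes: r*c - i*s, odd lanes: i*c + r*s */
    __m128 rot = _mm_addsub_ps(_mm_mul_ps(v, cv), _mm_mul_ps(sw, sv));

    float out[4];
    _mm_storeu_ps(out, rot);
    for (int p = 0; p < 2; p++) {
        float r = h[p * 2], im = h[p * 2 + 1];
        printf("pair %d: simd=(%f, %f)  scalar=(%f, %f)\n", p,
               out[p * 2], out[p * 2 + 1],
               r * cosv[p] - im * sinv[p], r * sinv[p] + im * cosv[p]);
    }
    return 0;
}
```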