diff --git a/.gitignore b/.gitignore index 223c5cd..691ef30 100644 --- a/.gitignore +++ b/.gitignore @@ -50,3 +50,4 @@ picoclaw/ # Internal dev docs picoclaw/PICOLM_INTEGRATION.md +.aider* diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index ee584b2..0bfb2b7 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -12,7 +12,7 @@ Thanks for your interest in PicoLLM! This project is intentionally small (~2,500 ## What We Need Help With ### High Impact -- **SIMD kernels** — AVX2/AVX-512 for x86, optimized NEON for ARM +- **SIMD kernels** — AVX-512 for x86 server CPUs, optimized NEON for ARM - **New quantization formats** — Q5_K fused dot product, IQ formats - **New model architectures** — Mistral, Phi, Gemma (LLaMA-compatible) - **Platform testing** — RISC-V boards, Pi Zero, exotic ARM SBCs @@ -106,12 +106,20 @@ If you're adding SIMD code: // ARM NEON path (Pi 3/4/5) float32x4_t v = vld1q_f32(ptr); ... +#elif defined(PICOLM_AVX2) + // x86 AVX2 path (Haswell+, Excavator+ — 256-bit integer + float) + __m256i v = _mm256_loadu_si256((const __m256i *)ptr); + ... +#elif defined(PICOLM_AVX) + // x86 AVX path (Sandy Bridge+, Bulldozer+ — 8-wide float) + __m256 v = _mm256_loadu_ps(ptr); + ... #elif defined(PICOLM_SSE2) - // x86 SSE2 path (Intel/AMD) + // x86 SSE2 path (any x86-64 — 4-wide float) __m128 v = _mm_loadu_ps(ptr); ... #endif - // Scalar fallback (always works) + // Scalar fallback (always works — also reachable via `make scalar`) for (int i = 0; i < n; i++) { ... } ``` diff --git a/README.md b/README.md index 1d9ad7b..fb5b81f 100644 --- a/README.md +++ b/README.md @@ -183,7 +183,7 @@ The model file (638MB) stays on disk. PicoLM **memory-maps** it and streams one | **FP16 KV Cache** | Halves KV cache memory (44MB vs 88MB for 2048 context) | | **Flash Attention** | Online softmax — no O(seq_len) attention buffer needed | | **Pre-computed RoPE** | cos/sin lookup tables eliminate transcendentals from hot loop | -| **SIMD Acceleration** | ARM NEON (Pi 3/4/5) and x86 SSE2 (Intel/AMD) auto-detected | +| **SIMD Acceleration** | ARM NEON (Pi 3/4/5), x86 SSE2/SSE3/AVX/AVX2 — auto-detected at compile time | | **Fused Dot Products** | Dequantize + dot-product in one pass — no intermediate buffer | | **Multi-threaded matmul** | Parallel matrix-vector multiply across CPU cores | | **Grammar-Constrained JSON** | `--json` flag forces valid JSON output (for tool calling) | @@ -234,7 +234,10 @@ make model ```cmd cd picolm -build.bat +build.bat :: SSE2 baseline (any x86-64) +build.bat avx2 :: AVX2 (Haswell+ / Excavator+, fastest) +build.bat avx :: AVX (Sandy Bridge+ / Bulldozer+) +build.bat scalar :: no SIMD (portable fallback) picolm.exe model.gguf -p "Hello world" -n 50 ``` @@ -242,6 +245,12 @@ picolm.exe model.gguf -p "Hello world" -n 50 ```bash make native # x86/ARM auto-detect (recommended for local machine) +make x86 # x86-64 safe default (SSE2 only — runs on any x86-64) +make sse2 # x86-64 SSE2 only (same as x86) +make sse3 # x86-64 SSE2+SSE3+SSSE3 (AMD Phenom/Athlon, older Intel) +make avx # x86-64 AVX (Sandy Bridge+, Bulldozer+ — wider SIMD, faster) +make avx2 # x86-64 AVX2 (Haswell+, Excavator+ — widest SIMD, fastest) +make scalar # No SIMD (portable scalar fallback, any architecture) make pi # Raspberry Pi 3/4/5 (64-bit ARM + NEON SIMD) make pi-arm32 # Pi Zero / Pi 1 (32-bit ARM) make cross-pi # Cross-compile for Pi from x86 (static binary) @@ -348,7 +357,7 @@ Measured on TinyLlama 1.1B Q4_K_M (638 MB model): + FP16 KV cache █████████████████░░░ (halve memory bandwidth) + Pre-computed RoPE 
██████████████████░░ (no sin/cos in hot loop) + Flash attention ██████████████████░░ (no O(n) attention alloc) - + NEON/SSE2 SIMD ███████████████████░ (4-wide vector ops) + + NEON/SSE2/AVX SIMD ███████████████████░ (4-wide to 8-wide vector ops) + KV cache persistence ████████████████████ (skip prefill entirely) ``` @@ -477,9 +486,14 @@ PicoLM implements 9 optimizations that brought generation speed from **1.6 tok/s 4-wide float vector operations for all hot paths. Example: dequantizing Q4_K nibbles with `vmovl_u8` → `vmovl_u16` → `vcvtq_f32_u32`, and RoPE with interleaved `vld2q_f32` / `vst2q_f32`. -### 2. x86 SSE2 SIMD +### 2. x86 SIMD (SSE2 / SSE3 / AVX / AVX2) -Auto-detected on Intel/AMD. 4-wide `__m128` operations for dot products, RMSNorm, and vector operations. +Four compile-time tiers for Intel/AMD: + +- **SSE2** (`make sse2` or `make x86`): 4-wide `__m128` operations for dot products, RMSNorm, softmax, RoPE, and element-wise ops. Safe baseline for all x86-64 CPUs. +- **SSE3** (`make sse3`): adds `_mm_addsub_ps` for a cleaner RoPE rotation kernel (no sign-mask workaround needed). +- **AVX** (`make avx`): 8-wide `__m256` float accumulators for all ops. Q4_K and Q6_K dot products widen the float accumulation stage while keeping integer nibble extraction at 128-bit (no AVX2 required). RoPE processes 4 complex pairs per iteration with `_mm256_addsub_ps`. +- **AVX2** (`make avx2`): adds 256-bit integer operations. Q4_K nibble extraction uses `_mm256_cvtepu8_epi32` (8 nibbles → 8 floats in 2 ops vs. 4-step unpack chain). Q6_K weight extraction uses `_mm256_cvtepi8_epi32` (8 int8 → 8 floats in 2 ops vs. 4-instruction macro chain). Targets Haswell+ Intel and Excavator+ AMD. ### 3. FP16 KV Cache @@ -636,7 +650,7 @@ A: llama.cpp is excellent but requires ~200MB+ for the runtime on small models, A: TinyLlama 1.1B is a small model — it handles simple tasks (Q&A, summarization, basic reasoning, JSON generation) well. It won't match GPT-4, but it runs on a $10 board with no internet. For structured output, the `--json` grammar mode guarantees valid JSON regardless of model quality. **Q: What about GPU acceleration?** -A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2) provides meaningful speedup. +A: PicoLM is CPU-only by design. The target hardware ($10-15 boards) doesn't have GPUs. On x86/ARM CPUs, SIMD (NEON/SSE2/AVX) provides meaningful speedup. **Q: Can I use a different model?** A: Any LLaMA-architecture GGUF model works.
Download from [HuggingFace](https://huggingface.co/models?search=gguf) and point PicoLM at it. Recommended quantizations: Q4_K_M (best quality/size balance) or Q2_K (smallest, lower quality). ## Roadmap -- [ ] AVX2/AVX-512 kernels for x86 (2-4x generation speed on modern CPUs) +- [x] AVX kernels for x86 (`make avx` — 8-wide float ops, ~2x vs SSE2) +- [x] AVX2 kernels for x86 (`make avx2` — 256-bit integer ops for Q4_K and Q6_K quantized paths) +- [ ] AVX-512 kernels for x86 (512-bit ops for server CPUs) - [ ] Speculative decoding with a draft model - [ ] Context sliding window (infinite generation beyond max_seq_len) - [ ] Weight pruning for further memory reduction diff --git a/picolm/Makefile b/picolm/Makefile index 4fd3c7a..6b1cd43 100644 --- a/picolm/Makefile +++ b/picolm/Makefile @@ -1,5 +1,5 @@ CC = gcc -CFLAGS = -O2 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic +CFLAGS = -O3 -std=c11 -D_GNU_SOURCE -Wall -Wextra -Wpedantic LDFLAGS = -lm -lpthread SRCS = picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c TARGET = picolm @@ -11,11 +11,35 @@ MODEL_DIR ?= /opt/picolm/models native: CFLAGS += -march=native native: $(TARGET) +# --- x86-64 default (SSE2 only, safe for all x86-64) --- +x86: sse2 + +# --- No SIMD (scalar fallback, portable to any architecture) --- +scalar: CFLAGS += -mno-sse2 -mno-avx +scalar: $(TARGET) + +# --- x86-64 with SSE2 only --- +sse2: CFLAGS += -msse2 +sse2: $(TARGET) + +# --- x86-64 with SSE2+SSE3+SSSE3 (covers AMD Phenom/Athlon and similar without AVX) --- +sse3: CFLAGS += -msse2 -msse3 -mssse3 -mpopcnt +sse3: $(TARGET) + +# --- x86-64 with AVX (Sandy Bridge and newer Intel; Bulldozer and newer AMD — these lack FMA3, so no -mfma here) --- +avx: CFLAGS += -mavx -mpopcnt +avx: $(TARGET) + +# --- x86-64 with AVX2 (Haswell and newer Intel; Excavator and newer AMD) --- +avx2: CFLAGS += -mavx2 -mfma -mpopcnt +avx2: $(TARGET) + $(TARGET): $(SRCS) $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) # --- Static build for single-binary deployment --- -static: CFLAGS += -march=native +# Uses SSE2 (not -march=native) so the binary runs on any x86-64, not just the build machine. +static: CFLAGS += -msse2 static: LDFLAGS += -static static: $(TARGET) @@ -70,4 +94,4 @@ model: clean: rm -f $(TARGET) $(TARGET).exe *.obj *.o -.PHONY: native static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean +.PHONY: native x86 scalar sse2 sse3 avx avx2 static pi pi-arm32 cross-pi riscv cross-riscv debug install model clean diff --git a/picolm/build.bat b/picolm/build.bat index 3f65e6c..2df2026 100644 --- a/picolm/build.bat +++ b/picolm/build.bat @@ -1,7 +1,26 @@ @echo off +REM PicoLM Windows build script (MSVC) +REM +REM SIMD targets: +REM build.bat -- SSE2 baseline (safe for any x86-64) +REM build.bat avx2 -- AVX2 (Haswell+ / Excavator+, fastest) +REM build.bat avx -- AVX (Sandy Bridge+ / Bulldozer+) +REM build.bat scalar -- no SIMD (portable scalar fallback) + call "C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\VC\Auxiliary\Build\vcvars64.bat" >nul 2>&1 -echo Compiling...
-cl /O2 /W3 /Fe:picolm.exe picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c + +set SIMD_FLAG= +if /I "%1"=="avx2" set SIMD_FLAG=/arch:AVX2 +if /I "%1"=="avx" set SIMD_FLAG=/arch:AVX +if /I "%1"=="scalar" set SIMD_FLAG=/d2archSSE42- + +if "%SIMD_FLAG%"=="" ( + echo Building: SSE2 baseline +) else ( + echo Building: %1 ^(%SIMD_FLAG%^) +) + +cl /O2 /W3 %SIMD_FLAG% /Fe:picolm.exe picolm.c model.c tensor.c quant.c tokenizer.c sampler.c grammar.c if %ERRORLEVEL% neq 0 ( echo BUILD FAILED ) else ( diff --git a/picolm/model.c b/picolm/model.c index 4b4040d..51bfbc5 100644 --- a/picolm/model.c +++ b/picolm/model.c @@ -406,7 +406,6 @@ static int parse_gguf(model_t *m, int max_seq_len) { fprintf(stderr, " n_layers=%d, vocab_size=%d, max_seq=%d\n", cfg->n_layers, cfg->vocab_size, cfg->max_seq_len); fprintf(stderr, " head_dim=%d, rope_base=%.1f\n", cfg->head_dim, cfg->rope_freq_base); - free(tinfos); return 0; } diff --git a/picolm/picolm.c b/picolm/picolm.c index c2f1624..f9c713d 100644 --- a/picolm/picolm.c +++ b/picolm/picolm.c @@ -131,6 +131,21 @@ int main(int argc, char **argv) { /* Load model */ fprintf(stderr, "Loading model: %s\n", model_path); + fprintf(stderr, "SIMD: %s\n", +#if defined(PICOLM_AVX2) + "AVX2" +#elif defined(PICOLM_AVX) + "AVX" +#elif defined(PICOLM_SSE3) + "SSE3" +#elif defined(PICOLM_SSE2) + "SSE2" +#elif defined(PICOLM_NEON) + "NEON" +#else + "scalar" +#endif + ); model_t model; if (model_load(&model, model_path, context_override) != 0) { fprintf(stderr, "Failed to load model\n"); diff --git a/picolm/quant.c b/picolm/quant.c index 5a92998..cdea2c6 100644 --- a/picolm/quant.c +++ b/picolm/quant.c @@ -352,6 +352,21 @@ float vec_dot_f32_f32(const void *src, const float *x, int n) { for (; i < n; i++) sum += w[i] * x[i]; return sum; +#elif defined(PICOLM_AVX) + __m256 acc0 = _mm256_setzero_ps(); + __m256 acc1 = _mm256_setzero_ps(); + int i = 0; + for (; i + 15 < n; i += 16) { + acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i), _mm256_loadu_ps(x + i))); + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_loadu_ps(w + i + 8), _mm256_loadu_ps(x + i + 8))); + } + /* pick up a trailing group of 8 (common: hidden sizes are multiples of 8 not 16) */ + for (; i + 7 < n; i += 8) + acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i), _mm256_loadu_ps(x + i))); + float sum = hsum_avx(_mm256_add_ps(acc0, acc1)); + for (; i < n; i++) sum += w[i] * x[i]; + return sum; + #elif defined(PICOLM_SSE2) __m128 acc0 = _mm_setzero_ps(); __m128 acc1 = _mm_setzero_ps(); @@ -440,6 +455,113 @@ float vec_dot_q4_K_f32(const void *src, const float *x, int n) { float sum_x1 = vaddvq_f32_compat(sum_x1_v); float sum_qx2 = vaddvq_f32_compat(sum_qx2_v); float sum_x2 = vaddvq_f32_compat(sum_x2_v); +#elif defined(PICOLM_AVX2) + /* AVX2: 256-bit integer ops allow zero-extending 8 uint8 nibbles + * to 8 int32 in one _mm256_cvtepu8_epi32 instruction, then a + * single _mm256_cvtepi32_ps — no multi-step unpack chain needed. 
*/ + __m256 sum_qx1_v = _mm256_setzero_ps(); + __m256 sum_x1_v = _mm256_setzero_ps(); + __m256 sum_qx2_v = _mm256_setzero_ps(); + __m256 sum_x2_v = _mm256_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + + for (int l = 0; l < 32; l += 8) { + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + /* AVX2: zero-extend 8 uint8 → 8 int32 → 8 float in 2 ops */ + __m256 qf_lo = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(lo8)); + __m256 qf_hi = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(hi8)); + + __m256 xv_lo = _mm256_loadu_ps(xp + l); + __m256 xv_hi = _mm256_loadu_ps(xp + l + 32); + + sum_qx1_v = _mm256_add_ps(sum_qx1_v, _mm256_mul_ps(qf_lo, xv_lo)); + sum_x1_v = _mm256_add_ps(sum_x1_v, xv_lo); + sum_qx2_v = _mm256_add_ps(sum_qx2_v, _mm256_mul_ps(qf_hi, xv_hi)); + sum_x2_v = _mm256_add_ps(sum_x2_v, xv_hi); + } + float sum_qx1 = hsum_avx(sum_qx1_v); + float sum_x1 = hsum_avx(sum_x1_v); + float sum_qx2 = hsum_avx(sum_qx2_v); + float sum_x2 = hsum_avx(sum_x2_v); +#elif defined(PICOLM_AVX) + /* AVX: 128-bit nibble extraction (no AVX2 int needed), 256-bit float accumulators */ + __m256 sum_qx1_v = _mm256_setzero_ps(); + __m256 sum_x1_v = _mm256_setzero_ps(); + __m256 sum_qx2_v = _mm256_setzero_ps(); + __m256 sum_x2_v = _mm256_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + + /* Combine two __m128 → one __m256 of 8 floats */ + __m256 qf_lo = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i))); + __m256 qf_hi = _mm256_set_m128( + _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)), + _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i))); + + __m256 xv_lo = _mm256_loadu_ps(xp + l); + __m256 xv_hi = _mm256_loadu_ps(xp + l + 32); + + sum_qx1_v = _mm256_add_ps(sum_qx1_v, _mm256_mul_ps(qf_lo, xv_lo)); + sum_x1_v = _mm256_add_ps(sum_x1_v, xv_lo); + sum_qx2_v = _mm256_add_ps(sum_qx2_v, _mm256_mul_ps(qf_hi, xv_hi)); + sum_x2_v = _mm256_add_ps(sum_x2_v, xv_hi); + } + float sum_qx1 = hsum_avx(sum_qx1_v); + float sum_x1 = hsum_avx(sum_x1_v); + float sum_qx2 = hsum_avx(sum_qx2_v); + float sum_x2 = hsum_avx(sum_x2_v); +#elif defined(PICOLM_SSE2) + /* SSE2: lo nibble → group1 (xp+l), hi nibble → group2 (xp+l+32) */ + __m128 sum_qx1_v = _mm_setzero_ps(); + __m128 sum_x1_v = _mm_setzero_ps(); + __m128 sum_qx2_v = _mm_setzero_ps(); + __m128 sum_x2_v = _mm_setzero_ps(); + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i zero_i = _mm_setzero_si128(); + + for (int l = 0; l < 32; l += 8) { + /* Load 8 quantized bytes -> 8 lo + 8 hi nibbles */ + __m128i qb = _mm_loadl_epi64((const __m128i *)(q + l)); + __m128i lo8 = _mm_and_si128(qb, mask4); + __m128i hi8 = _mm_and_si128(_mm_srli_epi16(qb, 4), mask4); + + /* Widen uint8 nibbles -> int32 -> float (8 values each) */ + __m128i lo16 = _mm_unpacklo_epi8(lo8, zero_i); + __m128i hi16 = _mm_unpacklo_epi8(hi8, zero_i); + __m128 qf_lo0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero_i)); + __m128 qf_lo1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero_i)); + __m128 qf_hi0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero_i)); + __m128 qf_hi1 = 
_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero_i)); + + __m128 xv_lo0 = _mm_loadu_ps(xp + l); + __m128 xv_lo1 = _mm_loadu_ps(xp + l + 4); + __m128 xv_hi0 = _mm_loadu_ps(xp + l + 32); + __m128 xv_hi1 = _mm_loadu_ps(xp + l + 36); + + sum_qx1_v = _mm_add_ps(sum_qx1_v, + _mm_add_ps(_mm_mul_ps(qf_lo0, xv_lo0), _mm_mul_ps(qf_lo1, xv_lo1))); + sum_x1_v = _mm_add_ps(sum_x1_v, _mm_add_ps(xv_lo0, xv_lo1)); + sum_qx2_v = _mm_add_ps(sum_qx2_v, + _mm_add_ps(_mm_mul_ps(qf_hi0, xv_hi0), _mm_mul_ps(qf_hi1, xv_hi1))); + sum_x2_v = _mm_add_ps(sum_x2_v, _mm_add_ps(xv_hi0, xv_hi1)); + } + float sum_qx1 = hsum_sse(sum_qx1_v); + float sum_x1 = hsum_sse(sum_x1_v); + float sum_qx2 = hsum_sse(sum_qx2_v); + float sum_x2 = hsum_sse(sum_x2_v); #else float sum_qx1 = 0.0f, sum_x1 = 0.0f; float sum_qx2 = 0.0f, sum_x2 = 0.0f; @@ -476,9 +598,199 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { const int8_t *sc = blocks[i].scales; const float *xp = x + i * 256; - /* Accumulate per-scale-group sums: 16 groups of 16 elements each */ float sums[16] = {0}; +/* sign-extend packed int8 → two __m128 floats; used by AVX and SSE2 paths. + * Idiom: unpacklo_epi8(zero, x) places each byte in the HIGH byte of a 16-bit + * lane; srai_epi16(..., 8) then arithmetic-shifts it down, propagating the sign + * bit — equivalent to a sign-extending byte→int16 widening without SSE4.1. */ +#if defined(PICOLM_AVX) || defined(PICOLM_SSE2) +#define Q6K_CONV(qi8, fa, fb) do { \ + __m128i w16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero_i, qi8), 8); \ + fa = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero_i, w16), 16)); \ + fb = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero_i, w16), 16)); \ +} while (0) +#endif + +#ifdef PICOLM_AVX2 + /* AVX2: _mm256_cvtepi8_epi32 replaces the 4-op Q6K_CONV sign-extension chain */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + /* AVX2: sign-extend 8 int8 → 8 int32 → 8 float in 2 ops */ + __m256 qf1 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q1_i8)); + __m256 qf2 = 
_mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q2_i8)); + __m256 qf3 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q3_i8)); + __m256 qf4 = _mm256_cvtepi32_ps(_mm256_cvtepi8_epi32(q4_i8)); + + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(qf1, _mm256_loadu_ps(xp_c + l))); + acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(qf2, _mm256_loadu_ps(xp_c + l + 32))); + acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(qf3, _mm256_loadu_ps(xp_c + l + 64))); + acc4 = _mm256_add_ps(acc4, _mm256_mul_ps(qf4, _mm256_loadu_ps(xp_c + l + 96))); + } + sums[sidx + 0] += hsum_avx(acc1); + sums[sidx + 2] += hsum_avx(acc2); + sums[sidx + 4] += hsum_avx(acc3); + sums[sidx + 6] += hsum_avx(acc4); + } + } +#elif defined(PICOLM_AVX) + /* AVX: 128-bit integer extraction, 256-bit float accumulators */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { + int l0 = half * 16; + int sidx = is + half; + __m256 acc1 = _mm256_setzero_ps(); + __m256 acc2 = _mm256_setzero_ps(); + __m256 acc3 = _mm256_setzero_ps(); + __m256 acc4 = _mm256_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_set_m128(qf1b, qf1a), _mm256_loadu_ps(xp_c + l))); + acc2 = _mm256_add_ps(acc2, _mm256_mul_ps(_mm256_set_m128(qf2b, qf2a), _mm256_loadu_ps(xp_c + l + 32))); + acc3 = _mm256_add_ps(acc3, _mm256_mul_ps(_mm256_set_m128(qf3b, qf3a), _mm256_loadu_ps(xp_c + l + 64))); + acc4 = _mm256_add_ps(acc4, _mm256_mul_ps(_mm256_set_m128(qf4b, qf4a), _mm256_loadu_ps(xp_c + l + 96))); + } + sums[sidx + 0] += hsum_avx(acc1); + sums[sidx + 2] += hsum_avx(acc2); + sums[sidx + 4] += hsum_avx(acc3); + sums[sidx + 6] += hsum_avx(acc4); + } + } +#elif defined(PICOLM_SSE2) + /* SSE2: 6-bit values = lo4(ql) | hi2(qh)<<4, biased by 32 */ + const __m128i mask4 = _mm_set1_epi8(0x0F); + const __m128i mask3 = _mm_set1_epi8(0x03); + const __m128i sub32 = _mm_set1_epi8(32); + const __m128i zero_i = _mm_setzero_si128(); + + for (int chunk = 0; chunk < 2; chunk++) { + int is = chunk * 8; + const uint8_t *ql_c = ql + chunk * 64; + const uint8_t *qh_c = qh + chunk * 
32; + const float *xp_c = xp + chunk * 128; + + for (int half = 0; half < 2; half++) { /* half=0 -> sums[+0,2,4,6], half=1 -> [+1,3,5,7] */ + int l0 = half * 16; + int sidx = is + half; + __m128 acc1a = _mm_setzero_ps(), acc1b = _mm_setzero_ps(); + __m128 acc2a = _mm_setzero_ps(), acc2b = _mm_setzero_ps(); + __m128 acc3a = _mm_setzero_ps(), acc3b = _mm_setzero_ps(); + __m128 acc4a = _mm_setzero_ps(), acc4b = _mm_setzero_ps(); + + for (int l = l0; l < l0 + 16; l += 8) { + __m128i qla = _mm_loadl_epi64((const __m128i *)(ql_c + l)); + __m128i qlb = _mm_loadl_epi64((const __m128i *)(ql_c + l + 32)); + __m128i qhv = _mm_loadl_epi64((const __m128i *)(qh_c + l)); + + __m128i lo_a = _mm_and_si128(qla, mask4); + __m128i hi_a = _mm_and_si128(_mm_srli_epi16(qla, 4), mask4); + __m128i lo_b = _mm_and_si128(qlb, mask4); + __m128i hi_b = _mm_and_si128(_mm_srli_epi16(qlb, 4), mask4); + + /* epi16 shifts on qh: avoids byte-lane bleed from epi8 shifts */ + __m128i h01 = _mm_and_si128(qhv, mask3); + __m128i h23 = _mm_and_si128(_mm_srli_epi16(qhv, 2), mask3); + __m128i h45 = _mm_and_si128(_mm_srli_epi16(qhv, 4), mask3); + __m128i h67 = _mm_and_si128(_mm_srli_epi16(qhv, 6), mask3); + + __m128i q1_i8 = _mm_sub_epi8(_mm_or_si128(lo_a, _mm_slli_epi16(h01, 4)), sub32); + __m128i q2_i8 = _mm_sub_epi8(_mm_or_si128(lo_b, _mm_slli_epi16(h23, 4)), sub32); + __m128i q3_i8 = _mm_sub_epi8(_mm_or_si128(hi_a, _mm_slli_epi16(h45, 4)), sub32); + __m128i q4_i8 = _mm_sub_epi8(_mm_or_si128(hi_b, _mm_slli_epi16(h67, 4)), sub32); + + __m128 qf1a, qf1b, qf2a, qf2b, qf3a, qf3b, qf4a, qf4b; + Q6K_CONV(q1_i8, qf1a, qf1b); + Q6K_CONV(q2_i8, qf2a, qf2b); + Q6K_CONV(q3_i8, qf3a, qf3b); + Q6K_CONV(q4_i8, qf4a, qf4b); + + acc1a = _mm_add_ps(acc1a, _mm_mul_ps(qf1a, _mm_loadu_ps(xp_c + l))); + acc1b = _mm_add_ps(acc1b, _mm_mul_ps(qf1b, _mm_loadu_ps(xp_c + l + 4))); + acc2a = _mm_add_ps(acc2a, _mm_mul_ps(qf2a, _mm_loadu_ps(xp_c + l + 32))); + acc2b = _mm_add_ps(acc2b, _mm_mul_ps(qf2b, _mm_loadu_ps(xp_c + l + 36))); + acc3a = _mm_add_ps(acc3a, _mm_mul_ps(qf3a, _mm_loadu_ps(xp_c + l + 64))); + acc3b = _mm_add_ps(acc3b, _mm_mul_ps(qf3b, _mm_loadu_ps(xp_c + l + 68))); + acc4a = _mm_add_ps(acc4a, _mm_mul_ps(qf4a, _mm_loadu_ps(xp_c + l + 96))); + acc4b = _mm_add_ps(acc4b, _mm_mul_ps(qf4b, _mm_loadu_ps(xp_c + l + 100))); + } + sums[sidx + 0] += hsum_sse(_mm_add_ps(acc1a, acc1b)); + sums[sidx + 2] += hsum_sse(_mm_add_ps(acc2a, acc2b)); + sums[sidx + 4] += hsum_sse(_mm_add_ps(acc3a, acc3b)); + sums[sidx + 6] += hsum_sse(_mm_add_ps(acc4a, acc4b)); + } + } +#else for (int chunk = 0; chunk < 2; chunk++) { int is = chunk * 8; const uint8_t *ql_c = ql + chunk * 64; @@ -506,6 +818,9 @@ float vec_dot_q6_K_f32(const void *src, const float *x, int n) { sums[is + 7] += (float)q4 * xp_c[l + 96]; } } +#endif + +#undef Q6K_CONV for (int j = 0; j < 16; j++) { sumf += d * (float)sc[j] * sums[j]; diff --git a/picolm/quant.h b/picolm/quant.h index e35095c..3e8118a 100644 --- a/picolm/quant.h +++ b/picolm/quant.h @@ -4,23 +4,61 @@ #include #include -/* ---- SIMD detection ---- */ +/* ---- SIMD detection ---- + * + * Each level explicitly implies all lower levels so that code only needs to + * check a single flag. The order of checks (highest first) lets each block + * upgrade flags that a lower-level check would otherwise miss if the compiler + * only predefines the highest applicable macro. 
+ * + * Hierarchy (x86): + * PICOLM_SSE2 ⊂ PICOLM_SSE3 ⊂ PICOLM_AVX ⊂ PICOLM_AVX2 + * + * ARM: + * PICOLM_NEON (independent) + */ + +/* --- ARM NEON --- */ #if defined(__ARM_NEON) || defined(__ARM_NEON__) -#define PICOLM_NEON 1 -#include <arm_neon.h> +# define PICOLM_NEON 1 +# include <arm_neon.h> static inline float vaddvq_f32_compat(float32x4_t v) { -#if defined(__aarch64__) +# if defined(__aarch64__) return vaddvq_f32(v); -#else +# else float32x2_t r = vadd_f32(vget_low_f32(v), vget_high_f32(v)); return vget_lane_f32(vpadd_f32(r, r), 0); -#endif +# endif } #endif -#if defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64))) -#define PICOLM_SSE2 1 -#include +/* --- x86 SIMD: detect highest level, then propagate downward --- */ + +/* AVX2 implies AVX + SSE3 + SSE2 */ +#if defined(__AVX2__) +# define PICOLM_AVX2 1 +# define PICOLM_AVX 1 +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* AVX implies SSE3 + SSE2 */ +#elif defined(__AVX__) +# define PICOLM_AVX 1 +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* SSE3 implies SSE2 */ +#elif defined(__SSE3__) +# define PICOLM_SSE3 1 +# define PICOLM_SSE2 1 +/* SSE2 baseline (also the default for all x86-64 targets) */ +#elif defined(__SSE2__) || (defined(_MSC_VER) && (defined(_M_X64) || defined(_M_AMD64))) +# define PICOLM_SSE2 1 +#endif + +/* Include x86 SIMD header once for any x86 SIMD level. + * <immintrin.h> is an umbrella header that exposes all intrinsics + * available for the current -march target. */ +#ifdef PICOLM_SSE2 +# include <immintrin.h> static inline float hsum_sse(__m128 v) { __m128 shuf = _mm_movehl_ps(v, v); __m128 sum = _mm_add_ps(v, shuf); @@ -30,6 +68,14 @@ static inline float hsum_sse(__m128 v) { } #endif +#ifdef PICOLM_AVX +static inline float hsum_avx(__m256 v) { + __m128 lo = _mm256_castps256_ps128(v); + __m128 hi = _mm256_extractf128_ps(v, 1); + return hsum_sse(_mm_add_ps(lo, hi)); +} +#endif + /* GGUF tensor data types */ typedef enum { GGUF_TYPE_F32 = 0, diff --git a/picolm/tensor.c b/picolm/tensor.c index 59a68e6..a12db9d 100644 --- a/picolm/tensor.c +++ b/picolm/tensor.c @@ -137,6 +137,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { } ss = vaddvq_f32_compat(acc); for (; i < size; i++) ss += x[i] * x[i]; +#elif defined(PICOLM_AVX) + __m256 acc = _mm256_setzero_ps(); + int i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + acc = _mm256_add_ps(acc, _mm256_mul_ps(v, v)); + } + ss = hsum_avx(acc); + for (; i < size; i++) ss += x[i] * x[i]; #elif defined(PICOLM_SSE2) __m128 acc = _mm_setzero_ps(); int i = 0; @@ -161,6 +170,15 @@ void rmsnorm(float *out, const float *x, const float *weight, int size) { vst1q_f32(out + i, vmulq_f32(vmulq_f32(v, scale), w)); } for (; i < size; i++) out[i] = x[i] * ss * weight[i]; +#elif defined(PICOLM_AVX) + __m256 scale = _mm256_set1_ps(ss); + i = 0; + for (; i + 7 < size; i += 8) { + __m256 v = _mm256_loadu_ps(x + i); + __m256 w = _mm256_loadu_ps(weight + i); + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_mul_ps(v, scale), w)); + } + for (; i < size; i++) out[i] = x[i] * ss * weight[i]; #elif defined(PICOLM_SSE2) __m128 scale = _mm_set1_ps(ss); i = 0; @@ -194,6 +212,13 @@ void softmax(float *x, int size) { vst1q_f32(x + i, vmulq_f32(vld1q_f32(x + i), inv_v)); } for (; i < size; i++) x[i] *= inv; +#elif defined(PICOLM_AVX) + __m256 inv_v = _mm256_set1_ps(inv); + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(x + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), inv_v)); + } + for (; i < size; i++) x[i] *= inv; #elif defined(PICOLM_SSE2) __m128
inv_v = _mm_set1_ps(inv); int i = 0; @@ -206,6 +231,64 @@ void softmax(float *x, int size) { #endif } +/* AVX RoPE: 4 complex pairs/iter; addsub handles r*cos-i*sin / r*sin+i*cos in one op */ +#ifdef PICOLM_AVX +static void rope_avx(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; + for (; i + 3 < half; i += 4) { + __m256 v = _mm256_loadu_ps(h + i * 2); + __m128 c4 = _mm_loadu_ps(cos_pos + i); + __m128 s4 = _mm_loadu_ps(sin_pos + i); + __m256 cv = _mm256_set_m128(_mm_unpackhi_ps(c4, c4), _mm_unpacklo_ps(c4, c4)); + __m256 sv = _mm256_set_m128(_mm_unpackhi_ps(s4, s4), _mm_unpacklo_ps(s4, s4)); + __m256 sw = _mm256_permute_ps(v, 0xB1); /* swap r,i within each pair */ + _mm256_storeu_ps(h + i * 2, + _mm256_addsub_ps(_mm256_mul_ps(v, cv), _mm256_mul_ps(sw, sv))); + } + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + +/* SSE2/SSE3 RoPE: 2 pairs/iter; SSE3 uses addsub, SSE2 uses sign-mask to negate even lanes */ +#if defined(PICOLM_SSE2) && !defined(PICOLM_AVX) +static void rope_sse(float *h, int half, const float *cos_pos, const float *sin_pos) { + int i = 0; +#ifdef PICOLM_SSE3 + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + _mm_storeu_ps(h + i * 2, _mm_addsub_ps(_mm_mul_ps(v, cv), _mm_mul_ps(sw, sv))); + } +#else + const __m128 sign = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f); + for (; i + 1 < half; i += 2) { + __m128 v = _mm_loadu_ps(h + i * 2); + __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cos_pos + i), _mm_load_ss(cos_pos + i + 1)); + __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sin_pos + i), _mm_load_ss(sin_pos + i + 1)); + __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1,1,0,0)); + __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1,1,0,0)); + __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2,3,0,1)); + __m128 a = _mm_mul_ps(v, cv); + __m128 b = _mm_mul_ps(_mm_mul_ps(sign, sw), sv); + _mm_storeu_ps(h + i * 2, _mm_add_ps(a, b)); + } +#endif + for (; i < half; i++) { + float r = h[i * 2], im = h[i * 2 + 1]; + h[i * 2] = r * cos_pos[i] - im * sin_pos[i]; + h[i * 2 + 1] = r * sin_pos[i] + im * cos_pos[i]; + } +} +#endif + /* Rotary position encoding using pre-computed cos/sin tables */ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, const float *cos_pos, const float *sin_pos) { @@ -233,6 +316,10 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, qh[i * 2] = q0 * cos_pos[i] - q1 * sin_pos[i]; qh[i * 2 + 1] = q0 * sin_pos[i] + q1 * cos_pos[i]; } +#elif defined(PICOLM_AVX) + rope_avx(qh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(qh, half, cos_pos, sin_pos); #else for (int i = 0; i < half; i++) { float q0 = qh[i * 2]; @@ -246,12 +333,18 @@ void rope(float *q, float *k, int head_dim, int n_heads, int n_kv_heads, /* Apply RoPE to all KV heads */ for (int h = 0; h < n_kv_heads; h++) { float *kh = k + h * head_dim; +#ifdef PICOLM_AVX + rope_avx(kh, half, cos_pos, sin_pos); +#elif defined(PICOLM_SSE2) + rope_sse(kh, half, cos_pos, sin_pos); +#else for (int i = 0; i < half; i++) { float k0 = kh[i * 2]; float k1 = kh[i * 2 
+ 1]; kh[i * 2] = k0 * cos_pos[i] - k1 * sin_pos[i]; kh[i * 2 + 1] = k0 * sin_pos[i] + k1 * cos_pos[i]; } +#endif } } @@ -268,6 +361,12 @@ void elemwise_mul(float *out, const float *a, const float *b, int size) { vst1q_f32(out + i, vmulq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) out[i] = a[i] * b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(out + i, _mm256_mul_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) out[i] = a[i] * b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) { @@ -286,6 +385,12 @@ void vec_add(float *a, const float *b, int size) { vst1q_f32(a + i, vaddq_f32(vld1q_f32(a + i), vld1q_f32(b + i))); } for (; i < size; i++) a[i] += b[i]; +#elif defined(PICOLM_AVX) + int i = 0; + for (; i + 7 < size; i += 8) { + _mm256_storeu_ps(a + i, _mm256_add_ps(_mm256_loadu_ps(a + i), _mm256_loadu_ps(b + i))); + } + for (; i < size; i++) a[i] += b[i]; #elif defined(PICOLM_SSE2) int i = 0; for (; i + 3 < size; i += 4) {
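
The AVX additions in this patch lean repeatedly on one pattern: accumulate in wide `__m256` registers, then reduce with `hsum_avx`. The sketch below is not part of the patch — it is a standalone example, assuming only an AVX-capable x86-64 host and `gcc -O2 -mavx`, that shows the same dual-accumulator dot product and horizontal-sum reduction next to a plain scalar reference:

```c
/* Standalone sketch (not from the PicoLM sources): the dual-accumulator AVX
 * dot product + horizontal-sum pattern used by the patch, checked against a
 * scalar loop.  Build: gcc -O2 -mavx sketch_dot.c */
#include <immintrin.h>
#include <stdio.h>

static float hsum_avx(__m256 v) {
    __m128 lo = _mm256_castps256_ps128(v);
    __m128 hi = _mm256_extractf128_ps(v, 1);
    __m128 s  = _mm_add_ps(lo, hi);              /* 8 lanes -> 4 */
    s = _mm_add_ps(s, _mm_movehl_ps(s, s));      /* 4 lanes -> 2 */
    s = _mm_add_ss(s, _mm_shuffle_ps(s, s, 1));  /* 2 lanes -> 1 */
    return _mm_cvtss_f32(s);
}

static float dot_avx(const float *w, const float *x, int n) {
    __m256 acc0 = _mm256_setzero_ps();
    __m256 acc1 = _mm256_setzero_ps();
    int i = 0;
    /* two independent accumulators hide the add latency */
    for (; i + 15 < n; i += 16) {
        acc0 = _mm256_add_ps(acc0, _mm256_mul_ps(_mm256_loadu_ps(w + i),     _mm256_loadu_ps(x + i)));
        acc1 = _mm256_add_ps(acc1, _mm256_mul_ps(_mm256_loadu_ps(w + i + 8), _mm256_loadu_ps(x + i + 8)));
    }
    float sum = hsum_avx(_mm256_add_ps(acc0, acc1));
    for (; i < n; i++) sum += w[i] * x[i];       /* scalar tail */
    return sum;
}

int main(void) {
    enum { N = 1003 };                            /* odd length exercises the tail */
    float w[N], x[N], ref = 0.0f;
    for (int i = 0; i < N; i++) {
        w[i] = (float)(i % 7) * 0.25f;
        x[i] = (float)(i % 5) - 2.0f;
        ref += w[i] * x[i];
    }
    printf("avx = %f   scalar = %f\n", dot_avx(w, x, N), ref);
    return 0;
}
```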
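The `Q6K_CONV` macro depends on a pre-SSE4.1 sign-extension idiom: unpack each byte into the high half of a wider lane, then arithmetic-shift it back down so the sign bit propagates. The following sketch isolates just that idiom (standalone and hypothetical, not lifted from quant.c) and checks it against plain `(float)(int8_t)` casts; it needs only SSE2 (`gcc -O2 -msse2`):

```c
/* Standalone sketch (not from the PicoLM sources): the SSE2 sign-extension
 * idiom behind Q6K_CONV.  Build: gcc -O2 -msse2 sketch_sext.c */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
    int8_t q[8] = { -32, -1, 0, 1, 17, -20, 31, -32 };
    __m128i zero = _mm_setzero_si128();
    __m128i qi8  = _mm_loadl_epi64((const __m128i *)q);   /* 8 packed int8 */

    /* each byte lands in the HIGH byte of a 16-bit lane; srai pulls it down
     * with sign propagation -> sign-extended int16 without SSE4.1 */
    __m128i w16 = _mm_srai_epi16(_mm_unpacklo_epi8(zero, qi8), 8);
    /* same trick again: int16 into the high half of a 32-bit lane */
    __m128 fa = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, w16), 16));
    __m128 fb = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, w16), 16));

    float out[8];
    _mm_storeu_ps(out, fa);
    _mm_storeu_ps(out + 4, fb);
    for (int i = 0; i < 8; i++)
        printf("%d: simd=%6.1f  scalar=%6.1f\n", i, out[i], (float)q[i]);
    return 0;
}
```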
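The RoPE kernels (`rope_sse`, `rope_avx`) fold the complex rotation into a single `addsub`: even lanes compute `r*cos - i*sin`, odd lanes compute `i*cos + r*sin`. A minimal SSE3 sketch of one such step, with the scalar rotation printed alongside for comparison (again a standalone example, not taken from tensor.c; build with `gcc -O2 -msse3`):

```c
/* Standalone sketch (not from the PicoLM sources): addsub-based RoPE rotation
 * for two (real, imag) pairs.  Build: gcc -O2 -msse3 sketch_rope.c */
#include <pmmintrin.h>   /* SSE3: _mm_addsub_ps */
#include <stdio.h>

int main(void) {
    float h[4]    = { 1.0f, 2.0f, -0.5f, 0.25f };  /* r0 i0 r1 i1 */
    float cosv[2] = { 0.8f, 0.6f };
    float sinv[2] = { 0.6f, 0.8f };

    __m128 v  = _mm_loadu_ps(h);                                  /* r0 i0 r1 i1 */
    __m128 c2 = _mm_unpacklo_ps(_mm_load_ss(cosv), _mm_load_ss(cosv + 1));
    __m128 s2 = _mm_unpacklo_ps(_mm_load_ss(sinv), _mm_load_ss(sinv + 1));
    __m128 cv = _mm_shuffle_ps(c2, c2, _MM_SHUFFLE(1, 1, 0, 0));  /* c0 c0 c1 c1 */
    __m128 sv = _mm_shuffle_ps(s2, s2, _MM_SHUFFLE(1, 1, 0, 0));  /* s0 s0 s1 s1 */
    __m128 sw = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));    /* i0 r0 i1 r1 */
    /* even lanes: r*c - i*s, odd lanes: i*c + r*s */
    __m128 rot = _mm_addsub_ps(_mm_mul_ps(v, cv), _mm_mul_ps(sw, sv));

    float out[4];
    _mm_storeu_ps(out, rot);
    for (int p = 0; p < 2; p++) {
        float r = h[p * 2], im = h[p * 2 + 1];
        printf("pair %d: simd=(%f, %f)  scalar=(%f, %f)\n", p,
               out[p * 2], out[p * 2 + 1],
               r * cosv[p] - im * sinv[p], r * sinv[p] + im * cosv[p]);
    }
    return 0;
}
```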