Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
62 changes: 59 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,11 +1,16 @@
# Toolchain and host detection. CC and the flag variables below are assigned
# with ?=, so a value that is already set takes precedence over these defaults.
CC ?= cc
UNAME_S := $(shell uname -s)
UNAME_M := $(shell uname -m)

# Darwin on arm64 gets -mcpu=native; every other OS/machine pair gets
# -march=native.
ifeq ($(UNAME_S)-$(UNAME_M),Darwin-arm64)
NATIVE_CPU_FLAG ?= -mcpu=native
else
NATIVE_CPU_FLAG ?= -march=native
endif

# CFLAGS stays recursively expanded, so a later override of DEBUG_FLAGS or
# NATIVE_CPU_FLAG is still reflected when CFLAGS is used.
DEBUG_FLAGS ?= -g
CFLAGS ?= -O3 -ffast-math $(DEBUG_FLAGS) $(NATIVE_CPU_FLAG) -Wall -Wextra -std=c99
Expand Down Expand Up @@ -33,7 +38,7 @@ CUDA_LDLIBS ?= -lm -Xcompiler -pthread -L$(CUDA_HOME)/targets/sbsa-linux/lib -L$
METAL_LDLIBS := $(LDLIBS)
endif

# Command-style targets that never produce a file of their own name.
# quant-dot-test and q4k-dot-test are included because their recipes write
# tests/test_quant_dot and tests/test_q4k_dot, not a file named after the target.
.PHONY: all help clean test cpu cpu-avx2 cpu-avx512 cpu-avx512-vnni cpu-simd-build cuda cuda-spark cuda-generic cuda-regression quant-dot-test q4k-dot-test

ifeq ($(UNAME_S),Darwin)
all: ds4 ds4-server ds4-bench ds4-eval ds4-agent
Expand All @@ -42,6 +47,9 @@ help:
@echo "DS4 build targets:"
@echo " make Build Metal ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make cpu-avx2 Build CPU-only with AVX2 (x86_64 only)"
@echo " make cpu-avx512 Build CPU-only with AVX512BW (x86_64 only)"
@echo " make cpu-avx512-vnni Build CPU-only with AVX512BW+VNNI (x86_64 only)"
@echo " make test Build and run tests"
@echo " make clean Remove build outputs"

Expand Down Expand Up @@ -78,6 +86,9 @@ help:
@echo " make cuda-generic Build CUDA for a generic local CUDA GPU"
@echo " make cuda CUDA_ARCH=sm_N Build CUDA with an explicit nvcc -arch value"
@echo " make cpu Build CPU-only ./ds4, ./ds4-server, ./ds4-bench, ./ds4-eval, and ./ds4-agent"
@echo " make cpu-avx2 Build CPU-only with AVX2"
@echo " make cpu-avx512 Build CPU-only with AVX512BW"
@echo " make cpu-avx512-vnni Build CPU-only with AVX512BW+VNNI"
@echo " make test Build and run tests"
@echo " make clean Remove build outputs"

Expand Down Expand Up @@ -121,6 +132,46 @@ cuda-regression: tests/cuda_long_context_smoke
./tests/cuda_long_context_smoke
endif

# --- SIMD-specific CPU builds (x86_64 only, shared across Darwin/Linux) ---
# X86_64_HOST is non-empty only when `uname -m` reported x86_64 or amd64.
X86_64_HOST := $(filter x86_64 amd64,$(UNAME_M))

ifneq ($(X86_64_HOST),)
# Each target re-invokes make on cpu-simd-build with three overrides:
#   NATIVE_CPU_FLAG=  emptied, so the default CFLAGS carries no native-CPU flag
#   SUFFIX            appended to the build directory and the binary names
#   SIMDFLAGS         the explicit instruction-set flags for this variant
cpu-avx2:
	$(MAKE) cpu-simd-build NATIVE_CPU_FLAG= SUFFIX=-avx2 SIMDFLAGS="-mavx2"

cpu-avx512:
	$(MAKE) cpu-simd-build NATIVE_CPU_FLAG= SUFFIX=-avx512 SIMDFLAGS="-mavx2 -mavx512f -mavx512bw"

cpu-avx512-vnni:
	$(MAKE) cpu-simd-build NATIVE_CPU_FLAG= SUFFIX=-avx512-vnni SIMDFLAGS="-mavx2 -mavx512f -mavx512bw -mavx512vnni"
else
# On any other machine the targets still exist but fail with a message.
# $@ (single dollar) is expanded by make to the requested target name; a
# doubled $$@ would reach the shell as its own, empty, "$@".
cpu-avx2 cpu-avx512 cpu-avx512-vnni:
	@echo "error: $@ requires an x86_64 host (detected: $(UNAME_M))"
	@false
endif

# Object directory for the SIMD builds. Recursively expanded, so it follows
# whatever SUFFIX the invoking make command line supplies (for example
# build/cpu-avx2); with SUFFIX unset it is build/cpu.
BDIR = build/cpu$(SUFFIX)

# Compile and link the CPU-only binaries with $(SIMDFLAGS) added after $(CFLAGS).
# The cpu-avx2 / cpu-avx512 / cpu-avx512-vnni targets reach this target through
# $(MAKE) with SUFFIX and SIMDFLAGS set. The target has no prerequisites, so
# every invocation reruns the whole recipe from the top.
cpu-simd-build:
	@mkdir -p $(BDIR)
# Core and front-end sources, compiled with -DDS4_NO_GPU.
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_cpu.o ds4.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_cli_cpu.o ds4_cli.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_server_cpu.o ds4_server.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_bench_cpu.o ds4_bench.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_eval_cpu.o ds4_eval.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -DDS4_NO_GPU -c -o $(BDIR)/ds4_agent_cpu.o ds4_agent.c
# Remaining sources, compiled without -DDS4_NO_GPU.
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/ds4_distributed.o ds4_distributed.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/ds4_help.o ds4_help.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/ds4_web.o ds4_web.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/ds4_kvstore.o ds4_kvstore.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/linenoise.o linenoise.c
	$(CC) $(CFLAGS) $(SIMDFLAGS) -c -o $(BDIR)/rax.o rax.c
# Link the five binaries into the current directory; each name carries $(SUFFIX).
	$(CC) $(CFLAGS) $(SIMDFLAGS) -o ds4$(SUFFIX) $(BDIR)/ds4_cli_cpu.o $(BDIR)/ds4_help.o $(BDIR)/linenoise.o $(BDIR)/ds4_cpu.o $(BDIR)/ds4_distributed.o $(LDLIBS)
	$(CC) $(CFLAGS) $(SIMDFLAGS) -o ds4-server$(SUFFIX) $(BDIR)/ds4_server_cpu.o $(BDIR)/ds4_help.o $(BDIR)/ds4_kvstore.o $(BDIR)/rax.o $(BDIR)/ds4_cpu.o $(BDIR)/ds4_distributed.o $(LDLIBS)
	$(CC) $(CFLAGS) $(SIMDFLAGS) -o ds4-bench$(SUFFIX) $(BDIR)/ds4_bench_cpu.o $(BDIR)/ds4_help.o $(BDIR)/ds4_cpu.o $(BDIR)/ds4_distributed.o $(LDLIBS)
	$(CC) $(CFLAGS) $(SIMDFLAGS) -o ds4-eval$(SUFFIX) $(BDIR)/ds4_eval_cpu.o $(BDIR)/ds4_help.o $(BDIR)/ds4_cpu.o $(BDIR)/ds4_distributed.o $(LDLIBS)
	$(CC) $(CFLAGS) $(SIMDFLAGS) -o ds4-agent$(SUFFIX) $(BDIR)/ds4_agent_cpu.o $(BDIR)/ds4_help.o $(BDIR)/ds4_web.o $(BDIR)/ds4_kvstore.o $(BDIR)/linenoise.o $(BDIR)/ds4_cpu.o $(BDIR)/ds4_distributed.o $(LDLIBS)

# Core object file; rebuilt when ds4.c or any of the listed headers changes.
# $< is the first prerequisite (ds4.c); the headers only trigger the rebuild.
ds4.o: ds4.c ds4.h ds4_distributed.h ds4_gpu.h
	$(CC) $(CFLAGS) -c -o $@ $<

Expand Down Expand Up @@ -197,13 +248,18 @@ else
$(NVCC) $(NVCCFLAGS) -o $@ ds4_test.o ds4_help.o ds4_kvstore.o rax.o $(CORE_OBJS) $(CUDA_LDLIBS)
endif

# Build ds4_test and ds4-eval, run the quant-dot-test and q4k-dot-test targets
# as prerequisites, then run the extractor self-test and the main test binary.
# A failing command stops the recipe, so `make test` exits non-zero.
test: ds4_test ds4-eval quant-dot-test q4k-dot-test
	./ds4-eval --self-test-extractors
	./ds4_test

# Compile tests/test_quant_dot.c with the project CFLAGS and run the result.
# $(basename $<) strips the .c suffix, giving the binary path tests/test_quant_dot.
quant-dot-test: tests/test_quant_dot.c
	$(CC) $(CFLAGS) -o $(basename $<) $< -lm -pthread
	./$(basename $<)

# Compile tests/test_q4k_dot.c with a fixed flag set (not $(CFLAGS)) and run it.
# $(basename $<) strips the .c suffix, giving the binary path tests/test_q4k_dot.
q4k-dot-test: tests/test_q4k_dot.c
	$(CC) -O2 -Wall -Wextra -std=c99 -o $(basename $<) $< -lm -pthread
	./$(basename $<)

# Remove build outputs: the five binaries plus their -avx2 / -avx512 /
# -avx512-vnni variants (generated from the base names so the two lists cannot
# drift apart), the helper and test binaries, loose object files, and the
# build/ tree used by the SIMD builds.
clean:
	rm -f $(foreach bin,ds4 ds4-server ds4-bench ds4-eval ds4-agent,$(bin) $(bin)-avx2 $(bin)-avx512 $(bin)-avx512-vnni) \
		ds4_cpu ds4_native ds4_server_test ds4_test \
		tests/test_quant_dot tests/test_q4k_dot \
		tests/cuda_long_context_smoke tests/cuda_long_context_smoke.o \
		*.o
	rm -rf build/
Loading