-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathMakefile
More file actions
221 lines (197 loc) · 11.2 KB
/
Copy pathMakefile
File metadata and controls
221 lines (197 loc) · 11.2 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
# FFAI — Makefile
#
# Common dev-loop targets. See planning/plan.md for the phased
# build-out and scripts/ for the longer-form scripts.
.DEFAULT_GOAL := help
# ─── Paths ────────────────────────────────────────────────────────────
# PROJECT_ROOT is the directory make is running in. $(CURDIR) is set by
# make itself, so no shell is forked at parse time, and it is always the
# physical working directory (it follows `make -C <dir>` as well).
# METALTILE_DIR is the sibling metaltile checkout; KERNEL_OUT is the
# directory `make regenerate-kernels` passes to `tile build --out`.
PROJECT_ROOT := $(CURDIR)
METALTILE_DIR := $(PROJECT_ROOT)/../metaltile
KERNEL_OUT := $(PROJECT_ROOT)/Sources/MetalTileSwift
# ─── Help ─────────────────────────────────────────────────────────────
# Self-documenting help: prints every rule line of the form
#   target: [prerequisites] ## description
# sorted, with the target name in cyan. Rules without a `## ` description
# are not listed.
#   - `grep -h` keeps file-name prefixes out of the output if
#     MAKEFILE_LIST ever holds more than one makefile.
#   - The pattern is plain ERE (no lazy `.*?`, which POSIX ERE does not
#     define) and also accepts digits in target names.
.PHONY: help
help: ## show this help
	@grep -hE '^[a-zA-Z0-9_-]+:.*## ' $(MAKEFILE_LIST) | sort | \
		awk 'BEGIN {FS = ":.*## "}; {printf " \033[36m%-22s\033[0m %s\n", $$1, $$2}'
# ─── Setup ────────────────────────────────────────────────────────────
# One-time environment bootstrap; the actual work lives in
# scripts/setup-dev.sh. `setup` is the older spelling, kept as a bare
# alias for muscle memory: it carries no `##` text, so `make help` does
# not list it.
.PHONY: setup-dev setup
setup-dev: ## one-time dev environment setup (toolchains, deps, first build)
	./scripts/setup-dev.sh
setup: setup-dev
# ─── Git hooks ────────────────────────────────────────────────────────
# `core.hooksPath = scripts/hooks` is per-clone — every contributor runs
# `make install-hooks` once after cloning. The hooks themselves are
# version-controlled under scripts/hooks/ so updates propagate via pull.
.PHONY: install-hooks uninstall-hooks
# Delegates to the install script, which is expected to point
# core.hooksPath at scripts/hooks.
install-hooks: ## set core.hooksPath -> scripts/hooks (pre-commit / commit-msg / pre-push)
	./scripts/install-hooks.sh
# Clears the repo-local core.hooksPath. `git config --unset` exits
# non-zero when the key is absent, so probe first: running this twice, or
# before `make install-hooks`, reports the state instead of failing.
# `--get` exits 1 for "key not found"; any other non-zero status (e.g.
# not inside a git repository) is a real error and is propagated.
uninstall-hooks: ## clear core.hooksPath (disables the in-tree hooks)
	@git config --local --get core.hooksPath >/dev/null; rc=$$?; \
	if [ "$$rc" -eq 0 ]; then \
		git config --local --unset core.hooksPath && echo "✓ Uninstalled hooks"; \
	elif [ "$$rc" -eq 1 ]; then \
		echo "✓ Hooks already uninstalled (core.hooksPath is not set in this clone)"; \
	else \
		exit "$$rc"; \
	fi
# ─── Build ────────────────────────────────────────────────────────────
# Debug and release builds of the Swift package. Both run
# `regenerate-kernels` first.
.PHONY: build build-release
build: regenerate-kernels ## swift build (debug)
	swift build
build-release: regenerate-kernels ## swift build (release)
	swift build --configuration release
# Regenerates the kernel artifacts by running metaltile's `tile` binary
# from the sibling checkout, then patches the generated Swift file so
# swift-format ignores it. Every use of METALTILE_DIR / KERNEL_OUT in a
# shell command is quoted so a checkout path containing spaces works.
.PHONY: regenerate-kernels
regenerate-kernels: ## run `tile build --emit` to regenerate metallib + Swift wrappers
	@# Fail early, with a hint, when the sibling checkout is missing.
	@if [ ! -d "$(METALTILE_DIR)" ]; then \
		echo "Error: metaltile not found at $(METALTILE_DIR)"; \
		echo "Clone the sibling metaltile repo at ../metaltile."; \
		exit 1; \
	fi
	@# Run cargo from the metaltile dir so its rust-toolchain.toml (nightly,
	@# 2024 edition) is honored. Running cargo from FFAI/ would use the
	@# system default toolchain, which lacks edition=2024 support.
	@#
	@# `tile build --emit` writes:
	@#   $(KERNEL_OUT)/Resources/kernels/NAME.metal        per-kernel MSL
	@#   $(KERNEL_OUT)/Resources/kernels.metallib          compiled metallib
	@#   $(KERNEL_OUT)/Resources/manifest.json             IR descriptor
	@#   $(KERNEL_OUT)/Generated/MetalTileKernels.swift    dispatch wrappers
	@#
	cd "$(METALTILE_DIR)" && cargo run --release \
		--bin tile -- build --emit all --out "$(KERNEL_OUT)"
	@# Prepend the `// swift-format-ignore-file` directive so swift-format
	@# skips the generated wrappers. The directive needs to live on line 1
	@# for swift-format to honor it. Until metaltile's codegen emits it
	@# directly (see planning/session-plan.md), prepend it post-emit so
	@# `make format-check` stays clean across regenerations.
	@# The rewrite goes through a temp file + mv so the generated file is
	@# never left half-written; it is skipped when line 1 already has the
	@# directive.
	@gen="$(KERNEL_OUT)/Generated/MetalTileKernels.swift"; \
	if [ -f "$$gen" ] && ! head -n 1 "$$gen" | grep -q swift-format-ignore-file; then \
		tmp="$$gen.tmp"; \
		{ printf '// swift-format-ignore-file\n//\n'; cat "$$gen"; } > "$$tmp" && mv "$$tmp" "$$gen"; \
	fi
# ─── Test ─────────────────────────────────────────────────────────────
#
# Production-parity defaults. The 2026-05-19 GPU-pin root cause —
# wrong-dispatch-shape Ops wrappers — is fixed at the source (see the
# post-mortem in papers/ and OpsValidation in Sources/FFAI/). Test
# runs now use the same FFAI_MAX_COMMAND_BUFFERS=16 cap as production,
# so anything that passes in CI is proven safe under production load.
#
# Defense in depth still in place:
#
# 1. **OpsValidation** preconditions on every reduction-mode wrapper.
# Catches degenerate dispatch shapes (wrong head_dim, wrong n,
# etc.) before the kernel ever launches.
#
# 2. **Thread-safe shared state.** PSOCache uses single-flight
# compilation (compileLock) so parallel suites can't both compile
# the same PSO. BufferPool uses NSLock. See PSOCache.swift +
# BufferPool.swift.
#
# 3. **ModelLoadLock** (Tests/ModelIntegrationTests/ModelLoadLock.swift) — global
# async mutex around `Model.load(...)`. Different concern from GPU
# access: model load is heavy on RAM + disk-IO + GPU memory
# allocation BEFORE any cmdbuf exists. The lock makes Model.load()
# a global critical section so only one multi-GB checkpoint is
# loading at a time.
#
# Targets:
# - `make test-unit` — FFAITests + MetalTileSwiftTests at the
# production cap (FFAI_MAX_COMMAND_BUFFERS=16).
# - `make test-integration` — ModelIntegrationTests at production cap + ModelLoadLock
# + `--parallel --num-workers 1` (memory
# pressure, not GPU). Matches release.yml.
# - `make test` — both in sequence.
# - `make test-stress` — canary; both suites at production cap with
# integration parallelism uncapped. Run after
# touching anything dispatch-related to
# confirm production safety holds under
# maximal parallel load.
# `test` has no recipe of its own; it only pulls in the unit and
# integration targets as prerequisites. They run left to right under a
# serial make (NOTE(review): `make -j` may overlap them).
# `test-unit` runs the FFAITests and MetalTileSwiftTests filters with
# FFAI_MAX_COMMAND_BUFFERS=16 in the environment of `swift test`.
.PHONY: test test-unit
test: regenerate-kernels test-unit test-integration ## run unit then integration test suites
test-unit: regenerate-kernels ## unit + Metal tests at production cap (FFAI_MAX_COMMAND_BUFFERS=16)
	FFAI_MAX_COMMAND_BUFFERS=16 swift test \
		--filter "FFAITests|MetalTileSwiftTests"
# ModelLoadLock (Tests/ModelIntegrationTests/ModelLoadLock.swift)
# serializes Model.load() across suites so multi-GB checkpoints load one
# at a time. --num-workers 1 caps Swift Testing's cross-suite parallelism
# to one model resident at a time (memory pressure, not GPU).
.PHONY: test-integration
test-integration: regenerate-kernels ## end-to-end model tests; production cap + ModelLoadLock; matches release.yml
	FFAI_MAX_COMMAND_BUFFERS=16 swift test \
		--parallel --num-workers 1 \
		--filter "ModelIntegrationTests"
# Canary run: prints a short banner, then runs the unit filter followed
# by the integration filter, both with FFAI_MAX_COMMAND_BUFFERS=16 and
# with no --num-workers cap on the integration run. The banner is one
# printf (one argument per output line, the last one empty for the
# trailing blank line).
.PHONY: test-stress
test-stress: regenerate-kernels ## canary; production cap with uncapped parallelism — run after touching dispatch code
	@printf '%s\n' \
		"Stress mode. Running unit + integration at FFAI_MAX_COMMAND_BUFFERS=16" \
		"with no --num-workers cap on integration. If anything regresses our" \
		"wrapper-precondition / PSOCache / ModelLoadLock defenses, this is" \
		"where it surfaces." \
		""
	FFAI_MAX_COMMAND_BUFFERS=16 swift test \
		--filter "FFAITests|MetalTileSwiftTests"
	FFAI_MAX_COMMAND_BUFFERS=16 swift test \
		--filter "ModelIntegrationTests"
# Runs scripts/coverage.sh with FFAI_MAX_COMMAND_BUFFERS=16 in its
# environment.
# NOTE(review): unlike the test-* targets this one does not depend on
# regenerate-kernels — confirm scripts/coverage.sh does not need a fresh
# regeneration.
.PHONY: coverage
coverage: ## swift test with coverage report (unit suite only, matches ci.yml)
	env FFAI_MAX_COMMAND_BUFFERS=16 ./scripts/coverage.sh
# Runs every integration suite in its OWN swift-test process, captures
# pass/fail/timeout + GPU active-residency 3 s after exit. Any suite
# that leaves the GPU at ≥ 50% after the test ends is flagged "PINNED"
# and earmarked for xctrace profiling. GPU sampling requires sudo
# (powermetrics) — without it, the table still shows pass/fail but
# the GPU column degrades to "?".
#
# Pass suite names to run only a subset:
#   make integration-bisect SUITES="Whisper Llama"
#
# Or use the script directly for per-suite timeouts:
#   PER_TEST_TIMEOUT=600 ./scripts/integration-bisect.sh Whisper
#
# SUITES is forwarded verbatim as the script's arguments; it defaults to
# empty, which passes no arguments at all.
SUITES ?=
.PHONY: integration-bisect
integration-bisect: regenerate-kernels ## run each ModelIntegrationTests/*IntegrationTests suite alone; tag GPU-pinned exits
	./scripts/integration-bisect.sh $(SUITES)
# ─── Lint / format ────────────────────────────────────────────────────
# Invoke via `xcrun swift-format` so the call works both when
# swift-format is on $PATH (e.g. brew install swift-format) AND when
# it's only shipped inside the Xcode toolchain (xcrun resolves it via
# DEVELOPER_DIR). `xcrun` falls back to a PATH lookup when the toolchain
# lookup finds nothing, so the behavior is identical for Homebrew installs.
#
# Files that should be skipped opt out at the source-file level via
# `// swift-format-ignore-file` on line 1. See
# Sources/MetalTileSwift/Generated/MetalTileKernels.swift for the
# canonical example: the file is auto-regenerated by metaltile's
# `tile build --emit swift`, and `make regenerate-kernels` prepends the
# directive after every emit until the codegen writes it itself
# (see planning/session-plan.md).
# Both targets share the same tool invocation and arguments; only the
# subcommand differs (`format --in-place` rewrites files, `lint` only
# reports). format-check prints "format OK" when lint exits successfully.
SWIFT_FORMAT := xcrun swift-format
SWIFT_FORMAT_ARGS := --configuration .swift-format --recursive .
.PHONY: format format-check
format: ## run swift-format on all .swift files
	$(SWIFT_FORMAT) format --in-place $(SWIFT_FORMAT_ARGS)
format-check: ## swift-format lint (no writes)
	$(SWIFT_FORMAT) lint $(SWIFT_FORMAT_ARGS) && echo "format OK"
# ─── Docs ─────────────────────────────────────────────────────────────
# User-facing documentation lives at https://ffai.dev
# (source: github.com/thewafflehaus/ffai-website).
#
# The website fetches markdown from this repo at build time, so committing
# changes to documentation/, README.md, planning/architecture.md, or
# planning/roadmap.md on `main` triggers a rebuild via a GitHub Action.
# Expected location of the sibling docs-site checkout.
WEBSITE_DIR := $(PROJECT_ROOT)/../ffai-website
.PHONY: docs docs-verify
# `docs` runs the verification script, then prints either preview
# commands (when WEBSITE_DIR exists) or a hint for cloning the site. It
# only prints the commands; it does not start a preview itself.
docs: ## verify markdown + preview the docs site locally (if ../ffai-website is checked out)
	./scripts/verify-docs.sh
	@echo ""; \
	if [ ! -d "$(WEBSITE_DIR)" ]; then \
		echo "Tip: clone the docs site to preview locally:"; \
		echo " git clone https://github.com/thewafflehaus/ffai-website $(WEBSITE_DIR)"; \
	else \
		echo "Preview the docs site (Ctrl+C to stop):"; \
		echo " cd $(WEBSITE_DIR) && pnpm dev"; \
		echo ""; \
		echo "Or to build a one-shot static preview:"; \
		echo " cd $(WEBSITE_DIR) && pnpm build && pnpm dlx serve dist"; \
	fi
# `docs-verify` runs the same verification script without the hints.
docs-verify: ## swift-docc target-by-target verification only (no website preview)
	./scripts/verify-docs.sh
# ─── Clean ────────────────────────────────────────────────────────────
# Removes the SwiftPM build directory and the artifacts that
# `make regenerate-kernels` writes under $(KERNEL_OUT). The paths are
# derived from KERNEL_OUT (not hard-coded) so clean and regenerate stay in
# sync when KERNEL_OUT is overridden. The first recipe line expands to
# nothing unless KERNEL_OUT is empty, in which case make aborts before any
# `rm` runs against a path rooted at "/".
.PHONY: clean
clean: ## remove .build and generated kernel artifacts
	$(if $(strip $(KERNEL_OUT)),,$(error KERNEL_OUT is empty; refusing to clean))
	rm -rf .build
	rm -f "$(KERNEL_OUT)/Resources/kernels.metallib" "$(KERNEL_OUT)/Resources/manifest.json"
	rm -rf "$(KERNEL_OUT)/Resources/kernels" "$(KERNEL_OUT)/Generated/MetalTileKernels.swift"