Skip to content

Commit 36b2f75

Browse files
committed
fix: smart exclusions -- expand SKIP_DIRS, .saarignore support (OPE-128)
2 parents 74d6a34 + 827e2de commit 36b2f75

2 files changed

Lines changed: 103 additions & 26 deletions

File tree

saar/extractor.py

Lines changed: 46 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -50,8 +50,21 @@ class DNAExtractor:
5050
"""
5151

5252
SKIP_DIRS = {
53-
"node_modules", ".git", "__pycache__", "venv", "env", "dist",
54-
"build", ".next", "coverage", ".venv", "site-packages",
53+
# version control
54+
".git",
55+
# python
56+
"__pycache__", "venv", "env", ".venv", "site-packages",
57+
"*.egg-info", ".eggs",
58+
# js/ts
59+
"node_modules", ".next", ".nuxt", ".svelte-kit",
60+
# build outputs
61+
"dist", "build", "out", "target",
62+
# test/coverage artifacts
63+
"coverage", ".pytest_cache", "htmlcov", ".nyc_output",
64+
# common data / cloned repo dirs that aren't project code
65+
"repos", "data", "datasets", "tmp", "temp", "cache",
66+
# ide
67+
".idea", ".vscode",
5568
}
5669
MAX_FILE_SIZE = 1024 * 1024 # 1MB
5770
MAX_FILES = 5000
@@ -168,33 +181,35 @@ def _detect_language(self, file_path: str) -> str:
168181
".ts": "typescript", ".tsx": "typescript",
169182
}.get(ext, "unknown")
170183

171-
def _read_gitignore_dirs(self, repo_path: Path) -> set:
172-
"""Parse .gitignore for directory patterns to skip.
184+
def _read_ignore_dirs(self, repo_path: Path) -> set:
185+
"""Parse .gitignore and .saarignore for directory patterns to skip.
173186
174-
Only extracts simple directory names (like 'repos/' or 'data/').
175-
Does not handle globs or negation -- those need a full gitignore parser.
187+
Reads both files and merges results. Only handles simple directory
188+
names and trailing-slash patterns -- no glob negation. A full
189+
gitignore-spec parser is overkill for our use case.
176190
"""
177191
dirs: set = set()
178-
gitignore = repo_path / ".gitignore"
179-
if not gitignore.exists():
180-
return dirs
181-
try:
182-
for line in gitignore.read_text(encoding="utf-8").splitlines():
183-
line = line.strip()
184-
if not line or line.startswith("#"):
185-
continue
186-
# lines ending with / are directories
187-
if line.endswith("/"):
188-
dirs.add(line.rstrip("/"))
189-
# bare names that exist as dirs
190-
elif "/" not in line and "*" not in line and "!" not in line:
191-
candidate = repo_path / line
192-
if candidate.is_dir():
193-
dirs.add(line)
194-
except Exception as e:
195-
logger.debug("Error reading .gitignore: %s", e)
192+
# check both ignore files -- .saarignore takes same syntax as .gitignore
193+
for ignore_file in [repo_path / ".gitignore", repo_path / ".saarignore"]:
194+
if not ignore_file.exists():
195+
continue
196+
try:
197+
for line in ignore_file.read_text(encoding="utf-8").splitlines():
198+
line = line.strip()
199+
if not line or line.startswith("#"):
200+
continue
201+
# lines ending with / are explicitly directories
202+
if line.endswith("/"):
203+
dirs.add(line.rstrip("/"))
204+
# bare names without glob chars that exist as dirs in the repo
205+
elif "/" not in line and "*" not in line and "!" not in line:
206+
candidate = repo_path / line
207+
if candidate.is_dir():
208+
dirs.add(line)
209+
except Exception as e:
210+
logger.debug("Error reading %s: %s", ignore_file.name, e)
196211
if dirs:
197-
logger.info("Gitignore dirs to skip: %s", dirs)
212+
logger.debug("Ignore file dirs to skip: %s", dirs)
198213
return dirs
199214

200215
# -- team rules -------------------------------------------------------
@@ -652,9 +667,14 @@ def extract(
652667
skip = set(self.SKIP_DIRS)
653668
if exclude_dirs:
654669
skip.update(exclude_dirs)
655-
skip.update(self._read_gitignore_dirs(path))
670+
skip.update(self._read_ignore_dirs(path))
656671
self._active_skip_dirs = skip
657672

673+
# show user-added skips at info level so --verbose surfaces them
674+
extra_skips = skip - self.SKIP_DIRS
675+
if extra_skips:
676+
logger.info("Extra dirs excluded: %s", sorted(extra_skips))
677+
658678
self._reset_cache()
659679
repo_name = path.name
660680
logger.info("Extracting DNA from %s", repo_name)

tests/test_extractor.py

Lines changed: 57 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -195,3 +195,60 @@ def test_respects_max_file_size(self, tmp_path: Path):
195195
extractor = DNAExtractor()
196196
dna = extractor.extract(str(tmp_path))
197197
assert dna is not None
198+
199+
def test_skips_repos_dir(self, tmp_path: Path):
200+
"""repos/ should be skipped -- it contains user-cloned repos, not project code."""
201+
repos_dir = tmp_path / "repos" / "some-user-repo"
202+
repos_dir.mkdir(parents=True)
203+
# put a python file inside repos/ -- should NOT be counted
204+
(repos_dir / "main.py").write_text("def user_code(): pass\n" * 50)
205+
# real project code
206+
(tmp_path / "app.py").write_text("def real_function(): pass\n")
207+
208+
extractor = DNAExtractor()
209+
dna = extractor.extract(str(tmp_path))
210+
assert dna is not None
211+
# only app.py should be counted, not repos/
212+
assert dna.language_distribution.get("python", 0) == 1
213+
214+
def test_skips_dist_and_build(self, tmp_path: Path):
215+
"""dist/ and build/ are generated artifacts, not source code."""
216+
for junk_dir in ["dist", "build", "out"]:
217+
d = tmp_path / junk_dir
218+
d.mkdir()
219+
(d / "bundle.py").write_text("# compiled output\n" * 100)
220+
(tmp_path / "src.py").write_text("def real(): pass\n")
221+
222+
extractor = DNAExtractor()
223+
dna = extractor.extract(str(tmp_path))
224+
assert dna is not None
225+
assert dna.language_distribution.get("python", 0) == 1
226+
227+
def test_respects_saarignore(self, tmp_path: Path):
228+
""".saarignore uses same syntax as .gitignore and is merged into skip dirs."""
229+
custom_dir = tmp_path / "vendor"
230+
custom_dir.mkdir()
231+
(custom_dir / "lib.py").write_text("def vendor_code(): pass\n" * 20)
232+
(tmp_path / "app.py").write_text("def real(): pass\n")
233+
# tell saar to skip vendor/ via .saarignore
234+
(tmp_path / ".saarignore").write_text("vendor/\n")
235+
236+
extractor = DNAExtractor()
237+
dna = extractor.extract(str(tmp_path))
238+
assert dna is not None
239+
assert dna.language_distribution.get("python", 0) == 1
240+
241+
def test_saarignore_stacks_with_gitignore(self, tmp_path: Path):
242+
"""Both .gitignore and .saarignore dirs are skipped -- they merge, not replace."""
243+
(tmp_path / "gitignored").mkdir()
244+
(tmp_path / "gitignored" / "a.py").write_text("x = 1\n" * 10)
245+
(tmp_path / "saarignored").mkdir()
246+
(tmp_path / "saarignored" / "b.py").write_text("y = 2\n" * 10)
247+
(tmp_path / "real.py").write_text("def real(): pass\n")
248+
(tmp_path / ".gitignore").write_text("gitignored/\n")
249+
(tmp_path / ".saarignore").write_text("saarignored/\n")
250+
251+
extractor = DNAExtractor()
252+
dna = extractor.extract(str(tmp_path))
253+
assert dna is not None
254+
assert dna.language_distribution.get("python", 0) == 1

0 commit comments

Comments
 (0)