@@ -50,8 +50,21 @@ class DNAExtractor:
5050 """
5151
5252 SKIP_DIRS = {
53- "node_modules" , ".git" , "__pycache__" , "venv" , "env" , "dist" ,
54- "build" , ".next" , "coverage" , ".venv" , "site-packages" ,
53+ # version control
54+ ".git" ,
55+ # python
56+ "__pycache__" , "venv" , "env" , ".venv" , "site-packages" ,
57+ "*.egg-info" , ".eggs" ,
58+ # js/ts
59+ "node_modules" , ".next" , ".nuxt" , ".svelte-kit" ,
60+ # build outputs
61+ "dist" , "build" , "out" , "target" ,
62+ # test/coverage artifacts
63+ "coverage" , ".pytest_cache" , "htmlcov" , ".nyc_output" ,
64+ # common data / cloned repo dirs that aren't project code
65+ "repos" , "data" , "datasets" , "tmp" , "temp" , "cache" ,
66+ # ide
67+ ".idea" , ".vscode" ,
5568 }
5669 MAX_FILE_SIZE = 1024 * 1024 # 1MB
5770 MAX_FILES = 5000
@@ -168,33 +181,35 @@ def _detect_language(self, file_path: str) -> str:
168181 ".ts" : "typescript" , ".tsx" : "typescript" ,
169182 }.get (ext , "unknown" )
170183
171- def _read_gitignore_dirs (self , repo_path : Path ) -> set :
172- """Parse .gitignore for directory patterns to skip.
184+ def _read_ignore_dirs (self , repo_path : Path ) -> set :
185+ """Parse .gitignore and .saarignore for directory patterns to skip.
173186
174- Only extracts simple directory names (like 'repos/' or 'data/').
175- Does not handle globs or negation -- those need a full gitignore parser.
187+ Reads both files and merges results. Only handles simple directory
188+ names and trailing-slash patterns -- no glob negation. A full
189+ gitignore-spec parser is overkill for our use case.
176190 """
177191 dirs : set = set ()
178- gitignore = repo_path / ".gitignore"
179- if not gitignore .exists ():
180- return dirs
181- try :
182- for line in gitignore .read_text (encoding = "utf-8" ).splitlines ():
183- line = line .strip ()
184- if not line or line .startswith ("#" ):
185- continue
186- # lines ending with / are directories
187- if line .endswith ("/" ):
188- dirs .add (line .rstrip ("/" ))
189- # bare names that exist as dirs
190- elif "/" not in line and "*" not in line and "!" not in line :
191- candidate = repo_path / line
192- if candidate .is_dir ():
193- dirs .add (line )
194- except Exception as e :
195- logger .debug ("Error reading .gitignore: %s" , e )
192+ # check both ignore files -- .saarignore uses the same syntax as .gitignore
193+ for ignore_file in [repo_path / ".gitignore" , repo_path / ".saarignore" ]:
194+ if not ignore_file .exists ():
195+ continue
196+ try :
197+ for line in ignore_file .read_text (encoding = "utf-8" ).splitlines ():
198+ line = line .strip ()
199+ if not line or line .startswith ("#" ):
200+ continue
201+ # lines ending with / are explicitly directories
202+ if line .endswith ("/" ):
203+ dirs .add (line .rstrip ("/" ))
204+ # bare names without glob chars that exist as dirs in the repo
205+ elif "/" not in line and "*" not in line and "!" not in line :
206+ candidate = repo_path / line
207+ if candidate .is_dir ():
208+ dirs .add (line )
209+ except Exception as e :
210+ logger .debug ("Error reading %s: %s" , ignore_file .name , e )
196211 if dirs :
197- logger .info ( "Gitignore dirs to skip: %s" , dirs )
212+ logger .debug ( "Ignore file dirs to skip: %s" , dirs )
198213 return dirs
199214
200215 # -- team rules -------------------------------------------------------
@@ -652,9 +667,14 @@ def extract(
652667 skip = set (self .SKIP_DIRS )
653668 if exclude_dirs :
654669 skip .update (exclude_dirs )
655- skip .update (self ._read_gitignore_dirs (path ))
670+ skip .update (self ._read_ignore_dirs (path ))
656671 self ._active_skip_dirs = skip
657672
673+ # log skips beyond the built-in SKIP_DIRS (exclude_dirs and ignore files) at info level so --verbose surfaces them
674+ extra_skips = skip - self .SKIP_DIRS
675+ if extra_skips :
676+ logger .info ("Extra dirs excluded: %s" , sorted (extra_skips ))
677+
658678 self ._reset_cache ()
659679 repo_name = path .name
660680 logger .info ("Extracting DNA from %s" , repo_name )
0 commit comments