Skip to content

Commit 1e9de19

Browse files
committed
perf optimization; code refactor
1 parent 8438f2f commit 1e9de19

File tree

3 files changed

+95
-83
lines changed

3 files changed

+95
-83
lines changed

cycode/cli/files_collector/path_documents.py

Lines changed: 14 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,5 @@
11
import os
2-
from collections import defaultdict
3-
from typing import TYPE_CHECKING, Iterable, List, Set, Tuple
2+
from typing import TYPE_CHECKING, Iterable, List, Optional, Tuple
43

54
import pathspec
65

@@ -11,6 +10,7 @@
1110
is_iac,
1211
is_tfplan_file,
1312
)
13+
from cycode.cli.files_collector.walk_ignore import walk_ignore
1414
from cycode.cli.models import Document
1515
from cycode.cli.utils.path_utils import get_absolute_path, get_file_content
1616
from cycode.cyclient import logger
@@ -19,82 +19,18 @@
1919
from cycode.cli.utils.progress_bar import BaseProgressBar, ProgressBarSection
2020

2121

22-
def _walk_to_top(path: str) -> Iterable[str]:
23-
while os.path.dirname(path) != path:
24-
yield path
25-
path = os.path.dirname(path)
22+
def _get_all_existing_files_in_directory(path: str, *, walk_with_ignore_patterns: bool = True) -> List[str]:
23+
files: List[str] = []
2624

27-
if path:
28-
yield path # Include the top-level directory
29-
30-
31-
_SUPPORTED_IGNORE_PATTERN_FILES = {'.gitignore', '.cycodeignore'}
32-
33-
34-
def _collect_top_level_ignore_files(path: str) -> List[str]:
35-
ignore_files = []
36-
for dir_path in _walk_to_top(path):
37-
for ignore_file in _SUPPORTED_IGNORE_PATTERN_FILES:
38-
ignore_file_path = os.path.join(dir_path, ignore_file)
39-
if os.path.exists(ignore_file_path):
40-
logger.debug('Found top level ignore file: %s', ignore_file_path)
41-
ignore_files.append(ignore_file_path)
42-
return ignore_files
43-
44-
45-
def _get_global_ignore_patterns(path: str) -> List[str]:
46-
ignore_patterns = []
47-
for ignore_file in _collect_top_level_ignore_files(path):
48-
file_patterns = get_file_content(ignore_file).splitlines()
49-
ignore_patterns.extend(file_patterns)
50-
return ignore_patterns
51-
52-
53-
def _apply_ignore_patterns(ignore_patterns: List[str], files: Set[str]) -> Set[str]:
54-
if not ignore_patterns:
55-
return files
56-
57-
path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ignore_patterns)
58-
excluded_file_paths = set(path_spec.match_files(files))
59-
60-
return files - excluded_file_paths
61-
62-
63-
def _get_all_existing_files_in_directory(path: str, *, apply_ignore_patterns: bool = True) -> Set[str]:
64-
files: Set[str] = set()
65-
66-
global_ignore_patterns = _get_global_ignore_patterns(path)
67-
path_to_ignore_patterns = defaultdict(list)
68-
69-
for root, _, filenames in os.walk(path):
25+
walk_func = walk_ignore if walk_with_ignore_patterns else os.walk
26+
for root, _, filenames in walk_func(path):
7027
for filename in filenames:
71-
filepath = os.path.join(root, filename)
72-
73-
if filepath in _SUPPORTED_IGNORE_PATTERN_FILES:
74-
logger.debug('Found ignore file: %s', filepath)
75-
# TODO(MarshalX): accumulate ignore pattern from previous levels
76-
path_to_ignore_patterns[root].extend(get_file_content(filepath).splitlines())
77-
78-
if apply_ignore_patterns and root in path_to_ignore_patterns:
79-
filtered_paths = _apply_ignore_patterns(
80-
path_to_ignore_patterns[root],
81-
{
82-
filepath,
83-
},
84-
)
85-
if filtered_paths:
86-
files.update(filtered_paths)
87-
else:
88-
files.add(os.path.join(root, filename))
89-
90-
if apply_ignore_patterns:
91-
logger.debug('Applying global ignore patterns %s', {'global_ignore_patterns': global_ignore_patterns})
92-
return _apply_ignore_patterns(global_ignore_patterns, files)
28+
files.append(os.path.join(root, filename))
9329

9430
return files
9531

9632

97-
def _get_relevant_files_in_path(path: str, exclude_patterns: Iterable[str]) -> List[str]:
33+
def _get_relevant_files_in_path(path: str, exclude_patterns: Optional[Iterable[str]] = None) -> List[str]:
9834
absolute_path = get_absolute_path(path)
9935

10036
if not os.path.isfile(absolute_path) and not os.path.isdir(absolute_path):
@@ -103,24 +39,21 @@ def _get_relevant_files_in_path(path: str, exclude_patterns: Iterable[str]) -> L
10339
if os.path.isfile(absolute_path):
10440
return [absolute_path]
10541

106-
all_file_paths = _get_all_existing_files_in_directory(absolute_path)
107-
108-
path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, exclude_patterns)
109-
excluded_file_paths = set(path_spec.match_files(all_file_paths))
42+
file_paths = _get_all_existing_files_in_directory(absolute_path)
11043

111-
relevant_file_paths = all_file_paths - excluded_file_paths
44+
if exclude_patterns:
45+
path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, exclude_patterns)
46+
file_paths = path_spec.match_files(file_paths, negate=True)
11247

113-
return [file_path for file_path in relevant_file_paths if os.path.isfile(file_path)]
48+
return [file_path for file_path in file_paths if os.path.isfile(file_path)]
11449

11550

11651
def _get_relevant_files(
11752
progress_bar: 'BaseProgressBar', progress_bar_section: 'ProgressBarSection', scan_type: str, paths: Tuple[str]
11853
) -> List[str]:
11954
all_files_to_scan = []
12055
for path in paths:
121-
all_files_to_scan.extend(
122-
_get_relevant_files_in_path(path=path, exclude_patterns=['**/.git/**', '**/.cycode/**'])
123-
)
56+
all_files_to_scan.extend(_get_relevant_files_in_path(path))
12457

12558
# we double the progress bar section length because we are going to process the files twice
12659
# first time to get the file list with respect to excluded patterns (excluding takes seconds to execute)
Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
import os
2+
from collections import defaultdict
3+
from typing import Iterable, Iterator, List, Tuple
4+
5+
import pathspec
6+
from pathspec.util import StrPath
7+
8+
from cycode.cli.utils.path_utils import get_file_content
9+
from cycode.cyclient import logger
10+
11+
# File names that are treated as ignore files; their lines are read as ignore patterns.
_SUPPORTED_IGNORE_PATTERN_FILES = {'.gitignore', '.cycodeignore'}
12+
# Patterns that seed every global ignore list before any ignore file is read.
# The bare '.git' / '.cycode' entries are present alongside the '**/…/**' forms;
# presumably because walk_ignore() matches bare entry names - confirm before removing either form.
_DEFAULT_GLOBAL_IGNORE_PATTERNS = [
13+
'.git',
14+
'.cycode',
15+
'**/.git/**',
16+
'**/.cycode/**',
17+
]
18+
19+
20+
def _walk_to_top(path: str) -> Iterable[str]:
21+
while os.path.dirname(path) != path:
22+
yield path
23+
path = os.path.dirname(path)
24+
25+
if path:
26+
yield path # Include the top-level directory
27+
28+
29+
def _collect_top_level_ignore_files(path: str) -> List[str]:
    """Collect the supported ignore files located in ``path`` and in each of its parent directories.

    Directories are visited from ``path`` upwards. Within a single directory the
    supported ignore file names are probed in sorted order so that the returned
    list (and therefore the order of the patterns built from it) is the same on
    every run.

    Returns:
        Paths of the ignore files that exist, in discovery order.
    """
    ignore_files = []
    for dir_path in _walk_to_top(path):
        # sorted(): iterating the set directly yields a hash-dependent order,
        # and pattern order matters for gitignore-style negation rules.
        for ignore_file in sorted(_SUPPORTED_IGNORE_PATTERN_FILES):
            ignore_file_path = os.path.join(dir_path, ignore_file)
            # isfile() instead of exists(): a directory that happens to be named
            # like an ignore file has no pattern lines to read.
            if os.path.isfile(ignore_file_path):
                logger.debug('Apply top level ignore file: %s', ignore_file_path)
                ignore_files.append(ignore_file_path)
    return ignore_files
38+
39+
40+
def _get_global_ignore_patterns(path: str) -> List[str]:
    """Build the ignore patterns that apply to the whole walk rooted at ``path``.

    Starts from a copy of the default global patterns and appends every line of
    each ignore file collected for ``path`` and its parent directories.
    """
    patterns = list(_DEFAULT_GLOBAL_IGNORE_PATTERNS)
    for ignore_file_path in _collect_top_level_ignore_files(path):
        patterns += get_file_content(ignore_file_path).splitlines()
    return patterns
46+
47+
48+
def _should_include_path(ignore_patterns: List[str], path: StrPath) -> bool:
    """Return ``True`` when ``path`` is not matched by any of ``ignore_patterns``.

    The patterns are interpreted as git wildmatch patterns. ``path`` may name a
    file or a directory.
    """
    spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ignore_patterns)
    is_ignored = spec.match_file(path)
    return not is_ignored
51+
52+
53+
def walk_ignore(path: str) -> Iterator[Tuple[str, List[str], List[str]]]:
    """Walk ``path`` like ``os.walk`` while skipping ignored directories and files.

    The patterns applied in a directory are the global ignore patterns for
    ``path`` plus the lines of any supported ignore file located directly in the
    directory being visited. Entries are matched by their bare name.

    Yields:
        ``(dirpath, dirnames, filenames)`` tuples, as ``os.walk`` does, with the
        ignored entries removed. Ignored directories are not descended into.
    """
    global_ignore_patterns = _get_global_ignore_patterns(path)
    path_to_ignore_patterns = defaultdict(list)

    for dirpath, dirnames, filenames in os.walk(path, topdown=True):
        # finds and processes ignore files first to get the patterns
        for filename in filenames:
            if filename in _SUPPORTED_IGNORE_PATTERN_FILES:
                filepath = os.path.join(dirpath, filename)
                logger.debug('Apply ignore file: %s', filepath)
                # TODO(MarshalX): accumulate ignore pattern from previous levels
                path_to_ignore_patterns[dirpath].extend(get_file_content(filepath).splitlines())

        ignore_patterns = global_ignore_patterns + path_to_ignore_patterns.get(dirpath, [])
        # compile the patterns once per directory instead of once per entry
        path_spec = pathspec.PathSpec.from_lines(pathspec.patterns.GitWildMatchPattern, ignore_patterns)

        # decrease recursion depth of os.walk() because of topdown=True by changing the list in-place
        # slicing ([:]) is mandatory to change the list in-place!
        dirnames[:] = [d for d in dirnames if not path_spec.match_file(d)]
        filenames[:] = [f for f in filenames if not path_spec.match_file(f)]

        yield dirpath, dirnames, filenames

tests/cli/files_collector/test_path_documents.py renamed to tests/cli/files_collector/test_walk_ignore.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
from os.path import normpath
22
from typing import TYPE_CHECKING
33

4-
from cycode.cli.files_collector.path_documents import (
4+
from cycode.cli.files_collector.walk_ignore import (
55
_collect_top_level_ignore_files,
66
_get_global_ignore_patterns,
77
_walk_to_top,
@@ -87,7 +87,13 @@ def test_get_global_ignore_patterns(fs: 'FakeFilesystem') -> None:
8787
_create_mocked_file_structure(fs)
8888
ignore_patterns = _get_global_ignore_patterns('/home/user/project/subdir')
8989

90-
assert len(ignore_patterns) == 3
90+
assert len(ignore_patterns) == 7
91+
# default global:
92+
assert '.git' in ignore_patterns
93+
assert '.cycode' in ignore_patterns
94+
assert '**/.git/**' in ignore_patterns
95+
assert '**/.cycode/**' in ignore_patterns
96+
# additional:
9197
assert '*.txt' in ignore_patterns
9298
assert '*.pyc' in ignore_patterns
9399
assert '*.log' in ignore_patterns

0 commit comments

Comments
 (0)