codespell-project · yarikoptic · Dec 22, 2023 · Dec 22, 2023 · Dec 22, 2023 · Dec 22, 2023
@@ -23,7 +23,19 @@
 import re
 import sys
 import textwrap
-from typing import Any, Dict, List, Match, Optional, Pattern, Sequence, Set, Tuple
+from multiprocessing import Pool
+from typing import (
+    Any,
+    Dict,
+    Generator,
+    List,
+    Match,
+    Optional,
+    Pattern,
+    Sequence,
+    Set,
+    Tuple,
+)
 
 # autogenerated by setuptools_scm
 from ._version import __version__ as VERSION  # type: ignore  # noqa: N812
@@ -455,6 +467,20 @@ def parse_options(
         "should match the to-be-excluded lines exactly",
     )
 
+    parser.add_argument(
+        "-J",
+        "--jobs",
+        action="store",
+        type=int,
+        default=0,
+        help="set number of jobs to parallelize processing - one "
+        "subprocess per file:\n"
+        "- 0: no parallelization (default)\n"
+        "- positive integer: number of sub-processes to use\n"
+        "- -1: use all available CPUs\n"
+        "Interactive mode is not compatible with parallel processing",
+    )
+
     parser.add_argument(
         "-i",
         "--interactive",
@@ -1021,12 +1047,58 @@ def parse_file(
     return bad_count
 
 
+class _FileParser:
+    """Top-level callable binding every parse_file() argument except the filename."""
+
+    def __init__(
+        self,
+        colors: TermColors,
+        summary: Optional[Summary],
+        misspellings: Dict[str, Misspelling],
+        exclude_lines: Set[str],
+        file_opener: FileOpener,
+        word_regex: Pattern[str],
+        ignore_word_regex: Optional[Pattern[str]],
+        uri_regex: Pattern[str],
+        uri_ignore_words: Set[str],
+        context: Optional[Tuple[int, int]],
+        options: argparse.Namespace,
+    ) -> None:
+        self.colors = colors
+        self.summary = summary
+        self.misspellings = misspellings
+        self.exclude_lines = exclude_lines
+        self.file_opener = file_opener
+        self.word_regex = word_regex
+        self.ignore_word_regex = ignore_word_regex
+        self.uri_regex = uri_regex
+        self.uri_ignore_words = uri_ignore_words
+        self.context = context
+        self.options = options
+
+    def __call__(self, filename: str) -> int:  # forwards parse_file()'s return value
+        return parse_file(
+            filename,
+            self.colors,
+            self.summary,
+            self.misspellings,
+            self.exclude_lines,
+            self.file_opener,
+            self.word_regex,
+            self.ignore_word_regex,
+            self.uri_regex,
+            self.uri_ignore_words,
+            self.context,
+            self.options,
+        )
+
+
 def _script_main() -> int:
     """Wrap to main() for setuptools."""
     return main(*sys.argv[1:])
 
 
-def main(*args: str) -> int:
+def main(*args: str) -> int:  # noqa: C901,PLR0915,PLR0911
     """Contains flow control"""
     try:
         options, parser, used_cfg_files = parse_options(args)
@@ -1138,6 +1210,25 @@ def main(*args: str) -> int:
     else:
         summary = None
 
+    if options.jobs and options.interactive:
+        print(
+            "ERROR: do not enable parallelization in interactive mode",
+            file=sys.stderr,
+        )
+        # no point to parser.print_help() - just hides ERROR away here
+        return EX_USAGE
+
+    jobs = options.jobs
+    if jobs == -1:
+        jobs = os.cpu_count()
+    elif jobs < -1:
+        print(
+            f"ERROR: invalid number of jobs: {jobs}",
+            file=sys.stderr,
+        )
+        parser.print_help()
+        return EX_USAGE
+
     context = None
     if options.context is not None:
         if (options.before_context is not None) or (options.after_context is not None):
@@ -1176,66 +1267,68 @@ def main(*args: str) -> int:
         )
         return EX_USAGE
 
-    bad_count = 0
-    for filename in sorted(options.files):
-        # ignore hidden files
-        if is_hidden(filename, options.check_hidden):
-            continue
-
-        if os.path.isdir(filename):
-            for root, dirs, files in os.walk(filename):
-                if glob_match.match(root):  # skip (absolute) directories
-                    dirs.clear()
-                    continue
-                if is_hidden(root, options.check_hidden):  # dir itself hidden
-                    continue
-                for file_ in sorted(files):
-                    # ignore hidden files in directories
-                    if is_hidden(file_, options.check_hidden):
-                        continue
-                    if glob_match.match(file_):  # skip files
+    def _find_files() -> Generator[str, None, None]:
+        """Yields filename for the parsing"""
+        for filename in sorted(options.files):
+            # ignore hidden files
+            if is_hidden(filename, options.check_hidden):
+                continue
+
+            if os.path.isdir(filename):
+                for root, dirs, files in os.walk(filename):
+                    if glob_match.match(root):  # skip (absolute) directories
+                        dirs.clear()
                         continue
-                    fname = os.path.join(root, file_)
-                    if glob_match.match(fname):  # skip paths
+                    if is_hidden(root, options.check_hidden):  # dir itself hidden
                         continue
-                    bad_count += parse_file(
-                        fname,
-                        colors,
-                        summary,
-                        misspellings,
-                        exclude_lines,
-                        file_opener,
-                        word_regex,
-                        ignore_word_regex,
-                        uri_regex,
-                        uri_ignore_words,
-                        context,
-                        options,
-                    )
+                    for file_ in sorted(files):
+                        # ignore hidden files in directories
+                        if is_hidden(file_, options.check_hidden):
+                            continue
+                        if glob_match.match(file_):  # skip files
+                            continue
+                        fname = os.path.join(root, file_)
+                        if glob_match.match(fname):  # skip paths
+                            continue
+                        yield fname
+
+                    # skip (relative) directories
+                    dirs[:] = [
+                        dir_
+                        for dir_ in dirs
+                        if not glob_match.match(dir_)
+                        and not is_hidden(dir_, options.check_hidden)
+                    ]
+
+            elif not glob_match.match(filename):  # skip files
+                yield filename
+
+    # callable with all shared arguments bound, so each job only needs a filename
+    file_parser = _FileParser(
+        colors,
+        summary,
+        misspellings,
+        exclude_lines,
+        file_opener,
+        word_regex,
+        ignore_word_regex,
+        uri_regex,
+        uri_ignore_words,
+        context,
+        options,
+    )
 
-                # skip (relative) directories
-                dirs[:] = [
-                    dir_
-                    for dir_ in dirs
-                    if not glob_match.match(dir_)
-                    and not is_hidden(dir_, options.check_hidden)
-                ]
-
-        elif not glob_match.match(filename):  # skip files
-            bad_count += parse_file(
-                filename,
-                colors,
-                summary,
-                misspellings,
-                exclude_lines,
-                file_opener,
-                word_regex,
-                ignore_word_regex,
-                uri_regex,
-                uri_ignore_words,
-                context,
-                options,
-            )
+    if jobs:
+        # parse_file runs in worker processes (TODO confirm: summary stays per-worker)
+        with Pool(jobs) as pool:
+            results = pool.map(file_parser, _find_files())
+            for result in results:
+                if isinstance(result, Exception):
+                    raise result
+            bad_count = sum(results)
+    else:
+        # serial
+        bad_count = sum(map(file_parser, _find_files()))
 
     if summary:
         print("\n-------8<-------\nSUMMARY:")