IdentifierFinder: now a Protocol

elifarley · elifarley · commit 5d8c317f6021 · 2024-11-04T00:54:21.000-03:00
diff --git a/src/cedarscript_editor/cedarscript_editor.py b/src/cedarscript_editor/cedarscript_editor.py
@@ -11,10 +11,10 @@
 from cedarscript_ast_parser.cedarscript_ast_parser import MarkerCompatible, RelativeMarker, \
     RelativePositionType, Region, SingleFileClause
 from text_manipulation import (
-    IndentationInfo, IdentifierBoundaries, RangeSpec, read_file, write_file, bow_to_search_range
+    IndentationInfo, IdentifierBoundaries, RangeSpec, read_file, write_file, bow_to_search_range, IdentifierFinder
 )
 
-from .tree_sitter_identifier_finder import IdentifierFinder
+from .tree_sitter_identifier_finder import TreeSitterIdentifierFinder
 
 
 class CEDARScriptEditorException(Exception):
@@ -109,7 +109,7 @@ def _update_command(self, cmd: UpdateCommand):
         src = read_file(file_path)
         lines = src.splitlines()
 
-        identifier_finder = IdentifierFinder(file_path, src, RangeSpec.EMPTY)
+        identifier_finder = TreeSitterIdentifierFinder(file_path, src, RangeSpec.EMPTY)
             
         search_range = RangeSpec.EMPTY
         move_src_range = None
@@ -247,7 +247,7 @@ def _create_command(self, cmd: CreateCommand) -> str:
 
 def find_index_range_for_region(region: BodyOrWhole | Marker | Segment | RelativeMarker,
                                 lines: Sequence[str],
-                                identifier_finder: IdentifierFinder,
+                                identifier_finder_IS_IT_USED: IdentifierFinder,
                                 search_range: RangeSpec | IdentifierBoundaries | None = None,
                                 ) -> RangeSpec:
     # BodyOrWhole | RelativeMarker | MarkerOrSegment
@@ -267,7 +267,7 @@ def find_index_range_for_region(region: BodyOrWhole | Marker | Segment | Relativ
                             pass
                         case _:
                             # TODO transform to RangeSpec
-                            mos = IdentifierFinder("TODO?.py", lines, RangeSpec.EMPTY)(mos, search_range).body
+                            mos = TreeSitterIdentifierFinder("TODO?.py", lines, RangeSpec.EMPTY)(mos, search_range).body
             index_range = mos.to_search_range(
                 lines,
                 search_range.start if search_range else 0,
diff --git a/src/cedarscript_editor/tree_sitter_identifier_finder.py b/src/cedarscript_editor/tree_sitter_identifier_finder.py
@@ -7,6 +7,7 @@
 from grep_ast import filename_to_lang
 from text_manipulation.indentation_kit import get_line_indent_count
 from text_manipulation.range_spec import IdentifierBoundaries, RangeSpec, ParentInfo, ParentRestriction
+from text_manipulation import IdentifierFinder
 from tree_sitter_languages import get_language, get_parser
 
 from .tree_sitter_identifier_queries import LANG_TO_TREE_SITTER_QUERY
@@ -20,7 +21,7 @@
 _log = logging.getLogger(__name__)
 
 
-class IdentifierFinder:
+class TreeSitterIdentifierFinder(IdentifierFinder):
     """Finds identifiers in source code based on markers and parent restrictions.
 
     Attributes:
@@ -44,11 +45,11 @@ def __init__(self, fname: str, source: str | Sequence[str], parent_restriction:
         if langstr is None:
             self.language = None
             self.query_info = None
-            _log.info(f"[IdentifierFinder] NO LANGUAGE for `{fname}`")
+            _log.info(f"[TreeSitterIdentifierFinder] NO LANGUAGE for `{fname}`")
             return
         self.query_info: dict[str, dict[str, str]] = LANG_TO_TREE_SITTER_QUERY[langstr]
         self.language = get_language(langstr)
-        _log.info(f"[IdentifierFinder] Selected {self.language}")
+        _log.info(f"[TreeSitterIdentifierFinder] Selected {self.language}")
         self.tree = get_parser(langstr).parse(bytes(source, "utf-8"))
 
     def __call__(
diff --git a/src/text_manipulation/__init__.py b/src/text_manipulation/__init__.py
@@ -1,10 +1,13 @@
-from .indentation_kit import IndentationInfo
-from .range_spec import IdentifierBoundaries, RangeSpec
+from .line_kit import get_line_indent_count, extract_indentation
+from .range_spec import RangeSpec, IdentifierBoundaries
 from .text_editor_kit import read_file, write_file, bow_to_search_range
+from .cst_kit import IdentifierFinder
+from .indentation_kit import IndentationInfo
 
 __all__ = [
     "IndentationInfo",
     "IdentifierBoundaries",
+    "IdentifierFinder",
     "RangeSpec",
     "read_file",
     "write_file",
diff --git a/src/text_manipulation/cst_kit.py b/src/text_manipulation/cst_kit.py
@@ -0,0 +1,17 @@
+from typing import runtime_checkable, Protocol, Sequence
+
+from cedarscript_ast_parser import Marker, Segment, RelativeMarker, RelativePositionType, MarkerType, BodyOrWhole
+
+from .range_spec import IdentifierBoundaries, RangeSpec, ParentRestriction
+from .text_editor_kit import read_file, write_file, bow_to_search_range
+
+
+@runtime_checkable
+class IdentifierFinder(Protocol):
+    """Structural protocol for callables that locate identifiers in source code; usable with isinstance()."""
+
+    def __call__(
+            self, mos: Marker | Segment, parent_restriction: ParentRestriction = None
+    ) -> IdentifierBoundaries | RangeSpec | None:
+        """Look up the marker or segment `mos` (optional `parent_restriction`, default None); may return None."""
+        pass  # protocol stub: concrete finders provide the actual lookup
diff --git a/src/text_manipulation/indentation_kit.py b/src/text_manipulation/indentation_kit.py
@@ -19,53 +19,9 @@
 from math import gcd
 from typing import NamedTuple
 
+from .cst_kit import IdentifierFinder
 
-def get_line_indent_count_from_lines(lines: Sequence[str], index: int) -> int:
-    return get_line_indent_count(lines[index])
-
-
-def get_line_indent_count(line: str) -> int:
-    """
-    Count the number of leading whitespace characters in a line.
-
-    Args:
-        line (str): The input line to analyze.
-
-    Returns:
-        int: The number of leading whitespace characters.
-
-    Example:
-        >>> get_line_indent_count("    Hello")
-        4
-        >>> get_line_indent_count("\t\tWorld")
-        2
-    """
-    return len(line) - len(line.lstrip())
-
-
-def extract_indentation(line: str) -> str:
-    """
-    Extract the leading whitespace from a given line.
-
-    This function identifies and returns the leading whitespace characters
-    (spaces or tabs) from the beginning of the input line.
-
-    Args:
-        line (str): The input line to process.
-
-    Returns:
-        str: The leading whitespace of the line.
-
-    Examples:
-        >>> extract_indentation("    Hello")
-        '    '
-        >>> extract_indentation("\t\tWorld")
-        '\t\t'
-        >>> extract_indentation("No indentation")
-        ''
-    """
-    return line[:len(line) - len(line.lstrip())]
-
+from .line_kit import get_line_indent_count, extract_indentation
 
 class IndentationInfo(NamedTuple):
     """
@@ -117,7 +73,8 @@ def default(cls) -> 'IndentationInfo':
     @classmethod
     def shift_indentation(cls,
         content: Sequence[str], target_lines: Sequence[str], target_reference_indentation_count: int,
-        relindent_level: int | None
+        relindent_level: int | None = None,
+        identifier_finder: IdentifierFinder | None = None
     ) -> list[str]:
         """
         Returns 'content' with shifted indentation based on a relative indent level and a reference indentation count.
@@ -165,7 +122,10 @@ def _shift_indentation(
         return [raw_line_adjuster(line) for line in content]
 
     @classmethod
-    def from_content(cls, content: str | Sequence[str]) -> 'IndentationInfo':
+    def from_content(
+        cls, content: str | Sequence[str],
+        identifier_finder: IdentifierFinder | None = None
+    ) -> 'IndentationInfo':
         """
         Analyzes the indentation in the given content and creates an IndentationInfo instance.
 
diff --git a/src/text_manipulation/line_kit.py b/src/text_manipulation/line_kit.py
@@ -0,0 +1,47 @@
+from typing import Sequence
+
+from collections.abc import Sequence
+
+def get_line_indent_count_from_lines(lines: Sequence[str], index: int) -> int:
+    return get_line_indent_count(lines[index])  # indent count of the single line at `index`
+
+def get_line_indent_count(line: str) -> int:
+    r"""
+    Count the leading whitespace characters of a line (whatever `str.lstrip()` would remove).
+
+    Args:
+        line (str): The input line to analyze.
+
+    Returns:
+        int: The number of leading whitespace characters (`len(line)` for a whitespace-only line).
+
+    Example:
+        >>> get_line_indent_count("    Hello")
+        4
+        >>> get_line_indent_count("\t\tWorld")
+        2
+    """
+    return len(line) - len(line.lstrip())
+
+def extract_indentation(line: str) -> str:
+    r"""
+    Extract the leading whitespace from a given line.
+
+    This function returns the prefix of the input line that `str.lstrip()`
+    would remove, i.e. any leading whitespace, not only spaces or tabs.
+
+    Args:
+        line (str): The input line to process.
+
+    Returns:
+        str: The leading whitespace of the line.
+
+    Examples:
+        >>> extract_indentation("    Hello")
+        '    '
+        >>> extract_indentation("\t\tWorld")
+        '\t\t'
+        >>> extract_indentation("No indentation")
+        ''
+    """
+    return line[:len(line) - len(line.lstrip())]
diff --git a/src/text_manipulation/range_spec.py b/src/text_manipulation/range_spec.py
@@ -17,7 +17,7 @@
 
 from cedarscript_ast_parser import Marker, RelativeMarker, RelativePositionType, MarkerType, BodyOrWhole
 
-from .indentation_kit import get_line_indent_count_from_lines
+from .line_kit import get_line_indent_count_from_lines
 
 MATCH_TYPES = ('exact', 'stripped', 'normalized', 'partial')