CST-based indentation discovery

elifarley · elifarley · commit be6cc64c7a86 · 2024-11-04T02:52:41.000-03:00
indentation_kit.py: prefer CST-based indentation discovery. Fall back to text-based discovery if unavailable.
Passes indentation tests for codeeditor.py
diff --git a/src/cedarscript_editor/cedarscript_editor.py b/src/cedarscript_editor/cedarscript_editor.py
@@ -156,21 +156,26 @@ def _update_command(self, cmd: UpdateCommand):
                     region, action, lines, RangeSpec.EMPTY, identifier_finder
                 )
                 content = IndentationInfo.shift_indentation(
-                    content_range.read(lines), lines, search_range.indent, relindent_level
+                    content_range.read(lines), lines, search_range.indent, relindent_level,
+                    identifier_finder
                 )
                 content = (region, content)
             case _:
                 match action:
                     case MoveClause(insert_position=region, relative_indentation=relindent_level):
                         content = IndentationInfo.shift_indentation(
-                            move_src_range.read(lines), lines, search_range.indent, relindent_level
+                            move_src_range.read(lines), lines, search_range.indent, relindent_level,
+                            identifier_finder
                         )
                     case DeleteClause():
                         pass
                     case _:
                         raise ValueError(f'Invalid content: {content}')
 
-        self._apply_action(action, lines, search_range, content, range_spec_to_delete=move_src_range)
+        self._apply_action(
+            action, lines, search_range, content,
+            range_spec_to_delete=move_src_range, identifier_finder=identifier_finder
+        )
 
         write_file(file_path, lines)
 
@@ -179,7 +184,8 @@ def _update_command(self, cmd: UpdateCommand):
     @staticmethod
     def _apply_action(
         action: EditingAction, lines: Sequence[str], range_spec: RangeSpec, content: str | None = None,
-        range_spec_to_delete: RangeSpec | None = None
+        range_spec_to_delete: RangeSpec | None = None,
+        identifier_finder: IdentifierFinder | None = None
     ):
         match action:
 
@@ -199,7 +205,7 @@ def _apply_action(
             case ReplaceClause() | InsertClause():
                 match content:
                     case str():
-                        content = IndentationInfo.from_content(lines).apply_relative_indents(
+                        content = IndentationInfo.from_content(lines, identifier_finder).apply_relative_indents(
                             content, range_spec.indent
                         )
                     case Sequence():
diff --git a/src/cedarscript_editor/tree_sitter_identifier_finder.py b/src/cedarscript_editor/tree_sitter_identifier_finder.py
@@ -34,6 +34,7 @@ class TreeSitterIdentifierFinder(IdentifierFinder):
     """
 
     def __init__(self, fname: str, source: str | Sequence[str], parent_restriction: ParentRestriction = None):
+        super().__init__()
         self.parent_restriction = parent_restriction
         match source:
             case str() as s:
@@ -65,6 +66,7 @@ def __call__(
                 # Returns IdentifierBoundaries
                 return self._find_identifier(marker, parent_restriction)
 
+
     def _find_identifier(self,
         marker: Marker,
         parent_restriction: ParentRestriction
@@ -84,39 +86,16 @@ def _find_identifier(self,
         """
         query_info_key = marker.type
         identifier_name = marker.value
-        match marker.type:
-            case 'method':
-                query_info_key = 'function'
         try:
             all_restrictions: list[ParentRestriction] = [parent_restriction]
             # Extract parent name if using dot notation
             if '.' in identifier_name:
                 *parent_parts, identifier_name = identifier_name.split('.')
                 all_restrictions.append("." + '.'.join(reversed(parent_parts)))
 
+            identifier_type = marker.type
             # Get all node candidates first
-            candidate_nodes = (
-                self.language.query(self.query_info[query_info_key].format(name=identifier_name))
-                .captures(self.tree.root_node)
-            )
-            if not candidate_nodes:
-                return None
-
-            # Convert captures to boundaries and filter by parent
-            candidates: list[IdentifierBoundaries] = []
-            for ib in capture2identifier_boundaries(candidate_nodes, self.lines):
-                # For methods, verify the immediate parent is a class
-                if marker.type == 'method':
-                    if not ib.parents or not ib.parents[0].parent_type.startswith('class'):
-                        continue
-                # Check parent restriction (e.g., specific class name)
-                candidate_matched_all_restrictions = True
-                for pr in all_restrictions:
-                    if not ib.match_parent(pr):
-                        candidate_matched_all_restrictions = False
-                        break
-                if candidate_matched_all_restrictions:
-                    candidates.append(ib)
+            candidates = self.find_identifiers(query_info_key, identifier_name, all_restrictions)
         except Exception as e:
             raise ValueError(f"Unable to capture nodes for {marker}: {e}") from e
 
@@ -141,6 +120,34 @@ def _find_identifier(self,
                 return result.location_to_search_range(relative_position_type)
         return result
 
+    def find_identifiers(
+        self, identifier_type: str, name: str, all_restrictions: list[ParentRestriction] | None = None
+    ) -> list[IdentifierBoundaries]:
+        """Return boundaries of identifiers matching `name` that satisfy every parent restriction.
+
+        A 'method' request runs the 'function' query and keeps only candidates whose
+        immediate parent type starts with 'class'. `all_restrictions` may be None (no
+        restrictions). Returns [] when no language is loaded or nothing is captured.
+        """
+        if not self.language:
+            return []
+        # Remember the request before choosing the query key: rebinding identifier_type
+        # to 'function' up front made the method-only filter below unreachable.
+        methods_only = identifier_type == 'method'
+        query_key = 'function' if methods_only else identifier_type
+        query = self.language.query(self.query_info[query_key].format(name=name))
+        candidate_nodes = query.captures(self.tree.root_node)
+        if not candidate_nodes:
+            return []
+        restrictions = all_restrictions or ()  # no shared mutable default
+        candidates: list[IdentifierBoundaries] = []
+        for ib in capture2identifier_boundaries(candidate_nodes, self.lines):
+            if methods_only and not (ib.parents and ib.parents[0].parent_type.startswith('class')):
+                continue
+            if all(ib.match_parent(pr) for pr in restrictions):
+                candidates.append(ib)
+        return candidates
+
 
 def _get_by_offset(obj: Sequence, offset: int):
     if 0 <= offset < len(obj):
diff --git a/src/text_manipulation/cst_kit.py b/src/text_manipulation/cst_kit.py
@@ -1,5 +1,5 @@
 from typing import runtime_checkable, Protocol, Sequence
-
+from functools import cached_property
 from cedarscript_ast_parser import Marker, Segment, RelativeMarker, RelativePositionType, MarkerType, BodyOrWhole
 
 from .range_spec import IdentifierBoundaries, RangeSpec, ParentRestriction
@@ -15,3 +15,12 @@ def __call__(
     ) -> IdentifierBoundaries | RangeSpec | None:
         """Find identifier boundaries for a given marker or segment."""
         pass
+
+    def find_identifiers(
+        self, identifier_type: str, name: str, all_restrictions: list[ParentRestriction] | None = None
+    ) -> list[IdentifierBoundaries]:
+        """Find the identifiers of `identifier_type` matching `name`, filtered by `all_restrictions`.
+
+        Protocol stub: the body is `pass`, so implementers are expected to supply the lookup.
+        """
+        pass
+
+    @cached_property
+    def find_all_callables(self) -> list[IdentifierBoundaries]:
+        """Result of `find_identifiers('function', r'.*')`, computed on first access and cached.
+
+        Declared with `cached_property`, so it is read as an attribute (no call parentheses).
+        NOTE(review): the cached list is not refreshed if the underlying source changes — confirm
+        finders are not reused across edits.
+        """
+        return self.find_identifiers('function', r'.*')
diff --git a/src/text_manipulation/indentation_kit.py b/src/text_manipulation/indentation_kit.py
@@ -104,9 +104,9 @@ def shift_indentation(cls,
             ['        def example():', '            print('Hello')']
             :param target_lines:
         """
-        context_indent_char_count = cls.from_content(target_lines).char_count
+        context_indent_char_count = cls.from_content(target_lines, identifier_finder).char_count
         return (cls.
-            from_content(content).
+            from_content(content, identifier_finder).
             _replace(char_count=context_indent_char_count).
             _shift_indentation(
                 content, target_reference_indentation_count, relindent_level
@@ -146,19 +146,31 @@ def from_content(
               character count by analyzing patterns and using GCD.
         """
         # TODO Always send str?
-        lines = [x for x in content.splitlines() if x.strip()] if isinstance(content, str) else content
-
-        indentations = [extract_indentation(line) for line in lines if line.strip()]
-        has_zero_indent = any((i == '' for i in indentations))
-        indentations = [indent for indent in indentations if indent]
-
-        if not indentations:
-            return cls(4, ' ', 0, True, "No indentation found. Assuming 4 spaces (PEP 8).")
-
-        indent_chars = Counter(indent[0] for indent in indentations)
-        dominant_char = ' ' if indent_chars.get(' ', 0) >= indent_chars.get('\t', 0) else '\t'
-
-        indent_lengths = [len(indent) for indent in indentations]
+        indent_lengths = []
+        if identifier_finder:
+            indent_lengths = []
+            for ib in identifier_finder.find_all_callables:
+                if ib.whole and ib.whole.indent:
+                    indent_lengths.append(ib.whole.indent)
+                if ib.body and ib.body.indent:
+                    indent_lengths.append(ib.body.indent)
+            has_zero_indent = any((i == 0 for i in indent_lengths))
+
+        if not (indent_lengths):
+            lines = [x for x in content.splitlines() if x.strip()] if isinstance(content, str) else content
+            indentations = [extract_indentation(line) for line in lines if line.strip()]
+            has_zero_indent = any((i == '' for i in indentations))
+            indentations = [indent for indent in indentations if indent]
+
+            if not indentations:
+                return cls(4, ' ', 0, True, "No indentation found. Assuming 4 spaces (PEP 8).")
+
+            indent_chars = Counter(indent[0] for indent in indentations)
+            dominant_char = ' ' if indent_chars.get(' ', 0) >= indent_chars.get('\t', 0) else '\t'
+
+            indent_lengths = [len(indent) for indent in indentations]
+        else:
+            dominant_char = ' '
 
         char_count = 1
         if dominant_char != '\t':
@@ -167,7 +179,7 @@ def from_content(
         min_indent_chars = 0 if has_zero_indent else min(indent_lengths) if indent_lengths else 0
         min_indent_level = min_indent_chars // char_count
 
-        consistency = all(len(indent) % char_count == 0 for indent in indentations if indent)
+        consistency = all(indent_len % char_count == 0 for indent_len in indent_lengths if indent_len)
         match dominant_char:
             case ' ':
                 domcharstr = 'space'
diff --git a/src/text_manipulation/line_kit.py b/src/text_manipulation/line_kit.py
@@ -44,4 +44,4 @@ def extract_indentation(line: str) -> str:
         >>> extract_indentation("No indentation")
         ''
     """
-    return line[:len(line) - len(line.lstrip())]
+    return line[:get_line_indent_count(line)]
diff --git a/tests/corpus/refactor-benchmark.indentation-size-discovery/autosave.py b/tests/corpus/refactor-benchmark.indentation-size-discovery/autosave.py
diff --git a/tests/corpus/refactor-benchmark.indentation-size-discovery/base.py b/tests/corpus/refactor-benchmark.indentation-size-discovery/base.py
diff --git a/tests/corpus/refactor-benchmark.indentation-size-discovery/chat.xml b/tests/corpus/refactor-benchmark.indentation-size-discovery/chat.xml
@@ -8,7 +8,6 @@ FROM FILE "codeeditor.py"
 MOVE METHOD "__get_brackets"
 INSERT BEFORE CLASS "CodeEditor"
 RELATIVE INDENTATION 0;
-```
 
 -- 1. Move the method to become a top-level function.
 UPDATE CLASS "AutosaveForPlugin"
@@ -23,5 +22,5 @@ FROM FILE "base.py"
 MOVE METHOD "adapt_method_mode"
 INSERT BEFORE CLASS "BaseHandler"
 RELATIVE INDENTATION 0;
-
+```
 </no-train>