diff --git a/skills/xlsx/recalc.py b/skills/xlsx/recalc.py index ee9108ced7..c4e69c1f98 100644 --- a/skills/xlsx/recalc.py +++ b/skills/xlsx/recalc.py @@ -9,6 +9,8 @@ import subprocess import os import platform +import zipfile +import re from pathlib import Path from openpyxl import load_workbook @@ -50,6 +52,52 @@ def setup_libreoffice_macro(): return False +def count_formulas(filename): + """ + Count formulas in an Excel file by scanning XML directly. + Much faster than loading the workbook with openpyxl. + """ + formula_count = 0 + # Pattern matches tag which indicates a formula. + # Matches or (with attributes) or (empty shared formula) + # Regex looks for or space or / + pattern = re.compile(rb'\s/]') + + try: + with zipfile.ZipFile(filename, 'r') as z: + for name in z.namelist(): + if name.startswith('xl/worksheets/sheet') and name.endswith('.xml'): + with z.open(name) as f: + # Chunked reading for memory efficiency + chunk_size = 1024 * 1024 # 1MB + buffer = b'' + while True: + chunk = f.read(chunk_size) + if not chunk: + break + buffer += chunk + + # Find last '>' to split safely + last_tag_end = buffer.rfind(b'>') + if last_tag_end != -1: + to_process = buffer[:last_tag_end+1] + formula_count += len(pattern.findall(to_process)) + buffer = buffer[last_tag_end+1:] + else: + # No '>' found, keep accumulating + pass + + # Process remaining buffer + formula_count += len(pattern.findall(buffer)) + except Exception as e: + sys.stderr.write(f"Error counting formulas via zip: {e}\n") + # Return 0 on error, or maybe -1 to indicate failure? + # Returning 0 is safer for now as it's just a stat. + return 0 + + return formula_count + + def recalc(filename, timeout=30): """ Recalculate formulas in Excel file and report any errors @@ -137,20 +185,8 @@ def recalc(filename, timeout=30): 'locations': locations[:20] # Show up to 20 locations } - # Add formula count for context - also check ALL cells - formula_count = 0 - wb_formulas = load_workbook(filename, data_only=False, read_only=True) - try: - for sheet_name in wb_formulas.sheetnames: - ws = wb_formulas[sheet_name] - for row in ws.iter_rows(): - for cell in row: - if cell.value and isinstance(cell.value, str) and cell.value.startswith('='): - formula_count += 1 - finally: - wb_formulas.close() - - result['total_formulas'] = formula_count + # Add formula count for context - optimized + result['total_formulas'] = count_formulas(filename) return result diff --git a/tests/test_recalc_perf.py b/tests/test_recalc_perf.py new file mode 100644 index 0000000000..a5aaff7f8f --- /dev/null +++ b/tests/test_recalc_perf.py @@ -0,0 +1,96 @@ + +import unittest +import os +import sys +from openpyxl import Workbook +from unittest.mock import MagicMock, patch + +# Add repo root to path +sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) + +try: + from skills.xlsx.recalc import count_formulas +except ImportError: + count_formulas = None + +class TestRecalcPerf(unittest.TestCase): + def setUp(self): + self.filename = "test_perf_formulas.xlsx" + self.create_test_file(self.filename) + + def tearDown(self): + if os.path.exists(self.filename): + os.remove(self.filename) + + def create_test_file(self, filename, rows=100, cols=10): + wb = Workbook() + ws = wb.active + + # Add formulas + # A1 to A{rows} will have formulas + for r in range(1, rows + 1): + ws[f'A{r}'] = f'=B{r}+C{r}' + + # Add values (no formula) + for r in range(1, rows + 1): + ws[f'B{r}'] = 1 + ws[f'C{r}'] = 2 + + wb.save(filename) + # Expected formulas: rows + self.expected_count = rows + + def test_count_formulas(self): + if count_formulas is None: + # If function doesn't exist, we can't test it. + # This allows creating the test file before the implementation. + print("count_formulas not found, skipping test") + return + + count = count_formulas(self.filename) + self.assertEqual(count, self.expected_count, f"Expected {self.expected_count} formulas, got {count}") + + def test_small_chunks(self): + """Test that formula counting works even when file is read in small chunks (splitting tags)""" + if count_formulas is None: + self.skipTest("count_formulas not implemented yet") + + # XML content with formulas split in various ways + # in simple form + # with attributes + # split across chunk boundary + xml_content = b'SUM(A1)SUM(A1)A1+1' + # Total 3 formulas. + + # Mocking zipfile + with patch('zipfile.ZipFile') as MockZip: + mock_zip = MockZip.return_value + mock_zip.__enter__.return_value = mock_zip + mock_zip.namelist.return_value = ['xl/worksheets/sheet1.xml'] + + mock_file = MagicMock() + mock_file.__enter__.return_value = mock_file + mock_zip.open.return_value = mock_file + + # Use small chunk size to force splitting (e.g., 5 bytes) + chunks = [] + chunk_size = 5 + for i in range(0, len(xml_content), chunk_size): + chunks.append(xml_content[i:i+chunk_size]) + + iter_chunks = iter(chunks) + + def side_effect(size=-1): + try: + return next(iter_chunks) + except StopIteration: + return b'' + + mock_file.read.side_effect = side_effect + + # Filename doesn't matter as we mock ZipFile + count = count_formulas("dummy.xlsx") + self.assertEqual(count, 3) + +if __name__ == '__main__': + unittest.main()