diff --git a/.jules/bolt.md b/.jules/bolt.md index 61511e5..048cb85 100644 --- a/.jules/bolt.md +++ b/.jules/bolt.md @@ -6,3 +6,6 @@ ## 2025-05-20 - Pre-compiling Regex in Loops **Learning:** `re.findall(pattern, string)` recompiles (or retrieves from cache) the pattern on every call. In high-frequency functions called inside loops (like complexity estimation), this overhead adds up. **Action:** Always pre-compile regexes (`re.compile`) into module-level or class-level constants if they are used repeatedly, especially in tight loops or recursive functions. +## 2025-05-20 - O(N*M) nested loop to O(N+M) dict lookup in MaterialExtractor +**Learning:** In `evolutia/material_extractor.py`, `get_all_exercises` found the solution for each exercise with an O(N*M) nested loop (N exercises, M solutions per material). When scaling the number of exercises per material (e.g. from 10 to 500), extraction time jumped significantly due to the quadratic complexity. +**Action:** Always replace nested matching loops (especially those matching IDs or labels) with a lookup dictionary that is built once in O(M), so that each of the N lookups is O(1) and the total cost is O(N+M). To preserve the original `break` behavior (first match wins), populate the dictionary using `if key not in lookup: lookup[key] = value` (equivalently, `lookup.setdefault(key, value)`). 
diff --git a/evolutia/material_extractor.py b/evolutia/material_extractor.py index 28b1756..bc04d58 100644 --- a/evolutia/material_extractor.py +++ b/evolutia/material_extractor.py @@ -6,28 +6,28 @@ from typing import Dict, List, Optional, Union import logging import time - -try: - from utils.markdown_parser import ( - read_markdown_file, - extract_frontmatter, - extract_exercise_blocks, - extract_solution_blocks, - resolve_include_path - ) -except ImportError: - from .utils.markdown_parser import ( - read_markdown_file, - extract_frontmatter, - extract_exercise_blocks, - extract_solution_blocks, - resolve_include_path - ) - - -logger = logging.getLogger(__name__) - - + +try: + from utils.markdown_parser import ( + read_markdown_file, + extract_frontmatter, + extract_exercise_blocks, + extract_solution_blocks, + resolve_include_path + ) +except ImportError: + from .utils.markdown_parser import ( + read_markdown_file, + extract_frontmatter, + extract_exercise_blocks, + extract_solution_blocks, + resolve_include_path + ) + + +logger = logging.getLogger(__name__) + + class MaterialExtractor: """Extrae ejercicios y soluciones de materiales didácticos.""" @@ -47,7 +47,7 @@ def __init__(self, base_path: Union[Path, str]): self._last_scan_timestamp: float = 0 # TTL del caché en segundos (5 minutos) self._cache_ttl = 300 - + def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict: """ Extrae ejercicios y soluciones de un archivo Markdown. 
@@ -65,19 +65,19 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict: return self._file_cache[file_path]['data'] try: - content = read_markdown_file(file_path) - frontmatter, content_body = extract_frontmatter(content) - - exercises = extract_exercise_blocks(content_body) - solutions = extract_solution_blocks(content_body) - - # Resolver includes de ejercicios - for exercise in exercises: - if exercise['include_path']: - include_path = resolve_include_path( - exercise['include_path'], - file_path.parent - ) + content = read_markdown_file(file_path) + frontmatter, content_body = extract_frontmatter(content) + + exercises = extract_exercise_blocks(content_body) + solutions = extract_solution_blocks(content_body) + + # Resolver includes de ejercicios + for exercise in exercises: + if exercise['include_path']: + include_path = resolve_include_path( + exercise['include_path'], + file_path.parent + ) if include_path.exists(): exercise['resolved_content'] = read_markdown_file(include_path) else: @@ -103,7 +103,7 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict: solution['resolved_content'] = '\n\n---\n\n'.join(resolved_content_parts) else: solution['resolved_content'] = solution['content'] - + return { 'file_path': file_path, 'frontmatter': frontmatter, @@ -153,118 +153,121 @@ def extract_from_directory(self, directory: Path, pattern: str = "*.md") -> List directory = Path(directory) if not directory.exists(): logger.warning(f"[MaterialExtractor] Directorio no existe: {directory}") - return [] - - materials = [] - for md_file in directory.rglob(pattern): - # Ignorar archivos en _build y otros directorios temporales - if '_build' in md_file.parts or 'node_modules' in md_file.parts: - continue - - material = self.extract_from_file(md_file) - # Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría - if material['exercises'] or material['solutions'] or 'lectura' in 
md_file.name.lower() or 'teoria' in md_file.name.lower(): - materials.append(material) - - return materials - - def extract_by_topic(self, topic: str) -> List[Dict]: - """ - Extrae materiales de un tema específico. - - Busca en: - - {topic}/semana*_practica.md - - {topic}/semana*_lectura.md - - tareas/tarea*/tarea*.md - - Args: - topic: Nombre del tema (ej: "analisis_vectorial") - - Returns: - Lista de materiales extraídos - """ - materials = [] - - # Buscar en directorio del tema - topic_dir = self.base_path / topic - if topic_dir.exists(): - # Buscar prácticas - practice_files = list(topic_dir.glob("*practica*.md")) - for file in practice_files: - materials.append(self.extract_from_file(file)) - - # Buscar lecturas (pueden tener ejercicios) - reading_files = list(topic_dir.glob("*lectura*.md")) - for file in reading_files: - materials.append(self.extract_from_file(file)) - - # Buscar en tareas (pueden ser de múltiples temas) - tareas_dir = self.base_path / "tareas" - if tareas_dir.exists(): - for tarea_dir in tareas_dir.iterdir(): - if tarea_dir.is_dir(): - tarea_file = tarea_dir / f"{tarea_dir.name}.md" - if tarea_file.exists(): - material = self.extract_from_file(tarea_file) - # Filtrar por tema si es relevante (checking subject or tags) - subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1 - tags_match = any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', [])) - if subject_match or tags_match: - materials.append(material) - - # Buscar en examenes (pueden ser de múltiples temas) - examenes_dir = self.base_path / "examenes" - if examenes_dir.exists(): - for examen_dir in examenes_dir.iterdir(): - if examen_dir.is_dir(): - examen_file = examen_dir / f"{examen_dir.name}.md" - if examen_file.exists(): - material = self.extract_from_file(examen_file) - # Filtrar por tema si es relevante - subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1 - tags_match = 
any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', [])) - - # Si es examen, a veces no tiene subject especifico o tiene "Examen X". - # Si no hay match explícito, tal vez incluirlo si no se encontraron otros materiales? - # Para seguridad, requerimos algún match en subject, tags o keywords - keywords_match = any(topic.lower() in kw.lower() for kw in material['frontmatter'].get('keywords', [])) - - if subject_match or tags_match or keywords_match: - materials.append(material) - - return materials - - def get_all_exercises(self, materials: List[Dict]) -> List[Dict]: - """ - Obtiene todos los ejercicios de una lista de materiales. - - Args: - materials: Lista de materiales extraídos - - Returns: - Lista de ejercicios con sus metadatos - """ - all_exercises = [] - - for material in materials: - for exercise in material['exercises']: - # Buscar solución correspondiente - solution = None - for sol in material['solutions']: - if sol['exercise_label'] == exercise['label']: - solution = sol - break - - exercise_data = { - 'label': exercise['label'], - 'content': exercise['resolved_content'], - 'source_file': material['file_path'], - 'frontmatter': material['frontmatter'], - 'solution': solution['resolved_content'] if solution else None, - 'solution_label': solution['label'] if solution else None - } - all_exercises.append(exercise_data) - + return [] + + materials = [] + for md_file in directory.rglob(pattern): + # Ignorar archivos en _build y otros directorios temporales + if '_build' in md_file.parts or 'node_modules' in md_file.parts: + continue + + material = self.extract_from_file(md_file) + # Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría + if material['exercises'] or material['solutions'] or 'lectura' in md_file.name.lower() or 'teoria' in md_file.name.lower(): + materials.append(material) + + return materials + + def extract_by_topic(self, topic: str) -> List[Dict]: + """ + Extrae materiales de 
un tema específico. + + Busca en: + - {topic}/semana*_practica.md + - {topic}/semana*_lectura.md + - tareas/tarea*/tarea*.md + + Args: + topic: Nombre del tema (ej: "analisis_vectorial") + + Returns: + Lista de materiales extraídos + """ + materials = [] + + # Buscar en directorio del tema + topic_dir = self.base_path / topic + if topic_dir.exists(): + # Buscar prácticas + practice_files = list(topic_dir.glob("*practica*.md")) + for file in practice_files: + materials.append(self.extract_from_file(file)) + + # Buscar lecturas (pueden tener ejercicios) + reading_files = list(topic_dir.glob("*lectura*.md")) + for file in reading_files: + materials.append(self.extract_from_file(file)) + + # Buscar en tareas (pueden ser de múltiples temas) + tareas_dir = self.base_path / "tareas" + if tareas_dir.exists(): + for tarea_dir in tareas_dir.iterdir(): + if tarea_dir.is_dir(): + tarea_file = tarea_dir / f"{tarea_dir.name}.md" + if tarea_file.exists(): + material = self.extract_from_file(tarea_file) + # Filtrar por tema si es relevante (checking subject or tags) + subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1 + tags_match = any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', [])) + if subject_match or tags_match: + materials.append(material) + + # Buscar en examenes (pueden ser de múltiples temas) + examenes_dir = self.base_path / "examenes" + if examenes_dir.exists(): + for examen_dir in examenes_dir.iterdir(): + if examen_dir.is_dir(): + examen_file = examen_dir / f"{examen_dir.name}.md" + if examen_file.exists(): + material = self.extract_from_file(examen_file) + # Filtrar por tema si es relevante + subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1 + tags_match = any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', [])) + + # Si es examen, a veces no tiene subject especifico o tiene "Examen X". 
+ # Si no hay match explícito, tal vez incluirlo si no se encontraron otros materiales? + # Para seguridad, requerimos algún match en subject, tags o keywords + keywords_match = any(topic.lower() in kw.lower() for kw in material['frontmatter'].get('keywords', [])) + + if subject_match or tags_match or keywords_match: + materials.append(material) + + return materials + + def get_all_exercises(self, materials: List[Dict]) -> List[Dict]: + """ + Obtiene todos los ejercicios de una lista de materiales. + + Args: + materials: Lista de materiales extraídos + + Returns: + Lista de ejercicios con sus metadatos + """ + all_exercises = [] + + for material in materials: + # OPTIMIZATION: Pre-compute solutions dictionary for O(1) lookup + # instead of O(N*M) nested loops to significantly improve performance. + solutions_by_ex = {} + for sol in material['solutions']: + if sol['exercise_label'] not in solutions_by_ex: + solutions_by_ex[sol['exercise_label']] = sol + + for exercise in material['exercises']: + # Buscar solución correspondiente usando búsqueda O(1) + solution = solutions_by_ex.get(exercise['label']) + + exercise_data = { + 'label': exercise['label'], + 'content': exercise['resolved_content'], + 'source_file': material['file_path'], + 'frontmatter': material['frontmatter'], + 'solution': solution['resolved_content'] if solution else None, + 'solution_label': solution['label'] if solution else None + } + all_exercises.append(exercise_data) + return all_exercises def clear_cache(self):