Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
## 2025-05-20 - Pre-compiling Regex in Loops
**Learning:** `re.findall(pattern, string)` recompiles (or retrieves from cache) the pattern on every call. In high-frequency functions called inside loops (like complexity estimation), this overhead adds up.
**Action:** Always pre-compile regexes (`re.compile`) into module-level or class-level constants if they are used repeatedly, especially in tight loops or recursive functions.
Copy link

Copilot AI Apr 26, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Falta una línea en blanco entre la sección previa y este nuevo encabezado "## 2025-05-20 - O(N*M)...". Agregar una línea en blanco mejora la legibilidad y mantiene el formato consistente con las entradas anteriores del archivo.

Suggested change
**Action:** Always pre-compile regexes (`re.compile`) into module-level or class-level constants if they are used repeatedly, especially in tight loops or recursive functions.
**Action:** Always pre-compile regexes (`re.compile`) into module-level or class-level constants if they are used repeatedly, especially in tight loops or recursive functions.

Copilot uses AI. Check for mistakes.
## 2025-05-20 - O(N*M) loop to O(N) dict lookup in MaterialExtractor
**Learning:** In `evolutia/material_extractor.py`, finding the solution for each exercise was using an O(N*M) nested loop. When scaling the number of exercises per material (e.g. from 10 to 500), extraction time jumped significantly due to quadratic complexity.
**Action:** Always replace nested matching loops (especially those matching IDs or labels) with a lookup dictionary built once in a single O(M) pass, so that each match becomes an O(1) lookup and the whole operation is O(N+M) instead of O(N*M). To preserve the original `break` behavior (first match wins), populate the dictionary using `if key not in dict: dict[key] = value`.
301 changes: 152 additions & 149 deletions evolutia/material_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,28 @@
from typing import Dict, List, Optional, Union
import logging
import time
try:
from utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
except ImportError:
from .utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
logger = logging.getLogger(__name__)

try:
from utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
except ImportError:
from .utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)


logger = logging.getLogger(__name__)


class MaterialExtractor:
"""Extrae ejercicios y soluciones de materiales didácticos."""

Expand All @@ -47,7 +47,7 @@ def __init__(self, base_path: Union[Path, str]):
self._last_scan_timestamp: float = 0
# TTL del caché en segundos (5 minutos)
self._cache_ttl = 300

def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
"""
Extrae ejercicios y soluciones de un archivo Markdown.
Expand All @@ -65,19 +65,19 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
return self._file_cache[file_path]['data']

try:
content = read_markdown_file(file_path)
frontmatter, content_body = extract_frontmatter(content)
exercises = extract_exercise_blocks(content_body)
solutions = extract_solution_blocks(content_body)
# Resolver includes de ejercicios
for exercise in exercises:
if exercise['include_path']:
include_path = resolve_include_path(
exercise['include_path'],
file_path.parent
)
content = read_markdown_file(file_path)
frontmatter, content_body = extract_frontmatter(content)

exercises = extract_exercise_blocks(content_body)
solutions = extract_solution_blocks(content_body)

# Resolver includes de ejercicios
for exercise in exercises:
if exercise['include_path']:
include_path = resolve_include_path(
exercise['include_path'],
file_path.parent
)
if include_path.exists():
exercise['resolved_content'] = read_markdown_file(include_path)
else:
Expand All @@ -103,7 +103,7 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
solution['resolved_content'] = '\n\n---\n\n'.join(resolved_content_parts)
else:
solution['resolved_content'] = solution['content']

return {
'file_path': file_path,
'frontmatter': frontmatter,
Expand Down Expand Up @@ -153,118 +153,121 @@ def extract_from_directory(self, directory: Path, pattern: str = "*.md") -> List
directory = Path(directory)
if not directory.exists():
logger.warning(f"[MaterialExtractor] Directorio no existe: {directory}")
return []

materials = []
for md_file in directory.rglob(pattern):
# Ignorar archivos en _build y otros directorios temporales
if '_build' in md_file.parts or 'node_modules' in md_file.parts:
continue

material = self.extract_from_file(md_file)
# Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría
if material['exercises'] or material['solutions'] or 'lectura' in md_file.name.lower() or 'teoria' in md_file.name.lower():
materials.append(material)

return materials

def extract_by_topic(self, topic: str) -> List[Dict]:
"""
Extrae materiales de un tema específico.

Busca en:
- {topic}/semana*_practica.md
- {topic}/semana*_lectura.md
- tareas/tarea*/tarea*.md

Args:
topic: Nombre del tema (ej: "analisis_vectorial")

Returns:
Lista de materiales extraídos
"""
materials = []

# Buscar en directorio del tema
topic_dir = self.base_path / topic
if topic_dir.exists():
# Buscar prácticas
practice_files = list(topic_dir.glob("*practica*.md"))
for file in practice_files:
materials.append(self.extract_from_file(file))

# Buscar lecturas (pueden tener ejercicios)
reading_files = list(topic_dir.glob("*lectura*.md"))
for file in reading_files:
materials.append(self.extract_from_file(file))

# Buscar en tareas (pueden ser de múltiples temas)
tareas_dir = self.base_path / "tareas"
if tareas_dir.exists():
for tarea_dir in tareas_dir.iterdir():
if tarea_dir.is_dir():
tarea_file = tarea_dir / f"{tarea_dir.name}.md"
if tarea_file.exists():
material = self.extract_from_file(tarea_file)
# Filtrar por tema si es relevante (checking subject or tags)
subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1
tags_match = any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', []))
if subject_match or tags_match:
materials.append(material)

# Buscar en examenes (pueden ser de múltiples temas)
examenes_dir = self.base_path / "examenes"
if examenes_dir.exists():
for examen_dir in examenes_dir.iterdir():
if examen_dir.is_dir():
examen_file = examen_dir / f"{examen_dir.name}.md"
if examen_file.exists():
material = self.extract_from_file(examen_file)
# Filtrar por tema si es relevante
subject_match = material['frontmatter'].get('subject', '').lower().find(topic.lower()) != -1
tags_match = any(topic.lower() in tag.lower() for tag in material['frontmatter'].get('tags', []))

# Si es examen, a veces no tiene subject especifico o tiene "Examen X".
# Si no hay match explícito, tal vez incluirlo si no se encontraron otros materiales?
# Para seguridad, requerimos algún match en subject, tags o keywords
keywords_match = any(topic.lower() in kw.lower() for kw in material['frontmatter'].get('keywords', []))

if subject_match or tags_match or keywords_match:
materials.append(material)

return materials

def get_all_exercises(self, materials: List[Dict]) -> List[Dict]:
"""
Obtiene todos los ejercicios de una lista de materiales.

Args:
materials: Lista de materiales extraídos

Returns:
Lista de ejercicios con sus metadatos
"""
all_exercises = []

for material in materials:
for exercise in material['exercises']:
# Buscar solución correspondiente
solution = None
for sol in material['solutions']:
if sol['exercise_label'] == exercise['label']:
solution = sol
break

exercise_data = {
'label': exercise['label'],
'content': exercise['resolved_content'],
'source_file': material['file_path'],
'frontmatter': material['frontmatter'],
'solution': solution['resolved_content'] if solution else None,
'solution_label': solution['label'] if solution else None
}
all_exercises.append(exercise_data)

return []

materials = []
for md_file in directory.rglob(pattern):
# Ignorar archivos en _build y otros directorios temporales
if '_build' in md_file.parts or 'node_modules' in md_file.parts:
continue

material = self.extract_from_file(md_file)
# Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría
if material['exercises'] or material['solutions'] or 'lectura' in md_file.name.lower() or 'teoria' in md_file.name.lower():
materials.append(material)

return materials

def extract_by_topic(self, topic: str) -> List[Dict]:
    """
    Extract the materials related to a specific topic.

    Searches, in this order, below ``self.base_path``:
    - ``{topic}/*practica*.md`` (always included)
    - ``{topic}/*lectura*.md`` (always included; readings may hold exercises)
    - ``tareas/<name>/<name>.md``, kept only when the topic appears in the
      frontmatter ``subject`` or ``tags``
    - ``examenes/<name>/<name>.md``, kept only when the topic appears in the
      frontmatter ``subject``, ``tags`` or ``keywords``

    Topic matching is a case-insensitive substring test. Frontmatter values
    that are missing, null or of an unexpected type are treated as "no
    match" instead of raising.

    Args:
        topic: Topic name (e.g. "analisis_vectorial").

    Returns:
        List of extracted materials, as returned by ``extract_from_file``.
    """
    # Lower-case the topic once instead of once per tag/keyword comparison.
    topic_lc = topic.lower()

    def as_strings(value) -> List[str]:
        # Frontmatter may hold a single scalar string (``tags: calculo``), a
        # list, or null; normalize all of them to a list of strings.
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple, set)):
            return [item for item in value if isinstance(item, str)]
        return []

    def mentions_topic(frontmatter: Dict, list_fields: tuple) -> bool:
        # ``subject`` can be present but null, so do not rely on the
        # ``.get`` default to guarantee a string.
        subject = frontmatter.get('subject', '')
        if isinstance(subject, str) and topic_lc in subject.lower():
            return True
        return any(
            topic_lc in entry.lower()
            for field in list_fields
            for entry in as_strings(frontmatter.get(field))
        )

    def collect_from(folder_name: str, list_fields: tuple) -> List[Dict]:
        # Scan ``<base_path>/<folder_name>/<name>/<name>.md`` and keep the
        # materials whose frontmatter mentions the topic.
        selected: List[Dict] = []
        folder = self.base_path / folder_name
        if not folder.exists():
            return selected
        for sub_dir in folder.iterdir():
            if not sub_dir.is_dir():
                continue
            candidate = sub_dir / f"{sub_dir.name}.md"
            if not candidate.exists():
                continue
            material = self.extract_from_file(candidate)
            if mentions_topic(material['frontmatter'], list_fields):
                selected.append(material)
        return selected

    materials: List[Dict] = []

    # Files inside the topic's own directory are included unconditionally.
    topic_dir = self.base_path / topic
    if topic_dir.exists():
        for glob_pattern in ("*practica*.md", "*lectura*.md"):
            for md_file in topic_dir.glob(glob_pattern):
                materials.append(self.extract_from_file(md_file))

    # Homework and exams may span several topics, so they are filtered by
    # frontmatter. Only exams additionally honor ``keywords``, because their
    # ``subject`` is often generic (e.g. "Examen X").
    materials.extend(collect_from("tareas", ('tags',)))
    materials.extend(collect_from("examenes", ('tags', 'keywords')))

    return materials

def get_all_exercises(self, materials: List[Dict]) -> List[Dict]:
    """
    Flatten a list of materials into one list of exercises.

    Each exercise is paired with the first solution of the same material
    whose ``exercise_label`` equals the exercise's ``label``; exercises
    without a solution get ``None`` for both solution fields.

    Args:
        materials: List of extracted materials.

    Returns:
        List of exercises with their metadata.
    """
    collected = []

    for entry in materials:
        # Index the solutions by exercise label in a single pass so each
        # exercise is resolved with one dictionary lookup. ``setdefault``
        # keeps the first solution seen for a label (first match wins).
        first_solution = {}
        for candidate in entry['solutions']:
            first_solution.setdefault(candidate['exercise_label'], candidate)

        for item in entry['exercises']:
            match = first_solution.get(item['label'])

            record = {
                'label': item['label'],
                'content': item['resolved_content'],
                'source_file': entry['file_path'],
                'frontmatter': entry['frontmatter'],
            }
            if match:
                record['solution'] = match['resolved_content']
                record['solution_label'] = match['label']
            else:
                record['solution'] = None
                record['solution_label'] = None
            collected.append(record)

    return collected

def clear_cache(self):
Expand Down