Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
301 changes: 151 additions & 150 deletions evolutia/material_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,28 +6,28 @@
from typing import Dict, List, Optional, Union
import logging
import time
try:
from utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
except ImportError:
from .utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
logger = logging.getLogger(__name__)

try:
from utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)
except ImportError:
from .utils.markdown_parser import (
read_markdown_file,
extract_frontmatter,
extract_exercise_blocks,
extract_solution_blocks,
resolve_include_path
)


logger = logging.getLogger(__name__)


class MaterialExtractor:
"""Extrae ejercicios y soluciones de materiales didácticos."""

Expand All @@ -47,7 +47,7 @@ def __init__(self, base_path: Union[Path, str]):
self._last_scan_timestamp: float = 0
# TTL del caché en segundos (5 minutos)
self._cache_ttl = 300

def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
"""
Extrae ejercicios y soluciones de un archivo Markdown.
Expand All @@ -65,19 +65,19 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
return self._file_cache[file_path]['data']

try:
content = read_markdown_file(file_path)
frontmatter, content_body = extract_frontmatter(content)
exercises = extract_exercise_blocks(content_body)
solutions = extract_solution_blocks(content_body)
# Resolver includes de ejercicios
for exercise in exercises:
if exercise['include_path']:
include_path = resolve_include_path(
exercise['include_path'],
file_path.parent
)
content = read_markdown_file(file_path)
frontmatter, content_body = extract_frontmatter(content)

exercises = extract_exercise_blocks(content_body)
solutions = extract_solution_blocks(content_body)

# Resolver includes de ejercicios
for exercise in exercises:
if exercise['include_path']:
include_path = resolve_include_path(
exercise['include_path'],
file_path.parent
)
if include_path.exists():
exercise['resolved_content'] = read_markdown_file(include_path)
else:
Expand All @@ -103,8 +103,8 @@ def extract_from_file(self, file_path: Path, use_cache: bool = True) -> Dict:
solution['resolved_content'] = '\n\n---\n\n'.join(resolved_content_parts)
else:
solution['resolved_content'] = solution['content']
return {

result = {
'file_path': file_path,
'frontmatter': frontmatter,
'exercises': exercises,
Comment on lines +106 to 110
Expand Down Expand Up @@ -153,118 +153,119 @@ def extract_from_directory(self, directory: Path, pattern: str = "*.md") -> List
directory = Path(directory)
if not directory.exists():
logger.warning(f"[MaterialExtractor] Directorio no existe: {directory}")
return []

materials = []
for md_file in directory.rglob(pattern):
# Ignorar archivos en _build y otros directorios temporales
if '_build' in md_file.parts or 'node_modules' in md_file.parts:
continue

material = self.extract_from_file(md_file)
# Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría
if material['exercises'] or material['solutions'] or 'lectura' in md_file.name.lower() or 'teoria' in md_file.name.lower():
materials.append(material)

return materials

def extract_by_topic(self, topic: str) -> List[Dict]:
    """
    Extract the materials that belong to a given topic.

    Looks in:
    - {base_path}/{topic}/*practica*.md
    - {base_path}/{topic}/*lectura*.md
    - {base_path}/tareas/<name>/<name>.md, kept when the frontmatter
      'subject' or 'tags' mention the topic
    - {base_path}/examenes/<name>/<name>.md, kept when the frontmatter
      'subject', 'tags' or 'keywords' mention the topic

    Matching is a case-insensitive substring test. Frontmatter fields that
    are null, a bare string instead of a list, or that contain non-string
    items are tolerated instead of raising.

    Args:
        topic: Topic name (e.g. "analisis_vectorial").

    Returns:
        List of extracted materials, as returned by extract_from_file.
    """
    needle = topic.lower()
    materials = []

    def mentions_topic(value) -> bool:
        # Frontmatter is hand-written YAML: a key that is present but empty
        # loads as None, and list fields are sometimes written as a scalar.
        if value is None:
            return False
        if isinstance(value, str):
            return needle in value.lower()
        if isinstance(value, (list, tuple, set)):
            return any(
                needle in str(item).lower()
                for item in value
                if item is not None
            )
        return needle in str(value).lower()

    def collect_from_subdirs(parent_name: str, fields) -> None:
        # Each sub-directory <name>/ is expected to hold a <name>.md file;
        # it is kept only when one of the given frontmatter fields matches.
        parent = self.base_path / parent_name
        if not parent.exists():
            return
        for subdir in parent.iterdir():
            if not subdir.is_dir():
                continue
            candidate = subdir / f"{subdir.name}.md"
            if not candidate.exists():
                continue
            material = self.extract_from_file(candidate)
            frontmatter = material['frontmatter'] or {}
            if any(
                mentions_topic(frontmatter.get(field, default))
                for field, default in fields
            ):
                materials.append(material)

    # Topic directory: practice sheets first, then readings (which may
    # also contain exercises).
    topic_dir = self.base_path / topic
    if topic_dir.exists():
        for glob_pattern in ("*practica*.md", "*lectura*.md"):
            for md_file in topic_dir.glob(glob_pattern):
                materials.append(self.extract_from_file(md_file))

    # Homework can span several topics, so filter by subject or tags.
    collect_from_subdirs("tareas", (('subject', ''), ('tags', [])))

    # Exams often carry a generic subject ("Examen X"), so keywords are
    # accepted as an additional match source.
    collect_from_subdirs(
        "examenes",
        (('subject', ''), ('tags', []), ('keywords', [])),
    )

    return materials

def get_all_exercises(self, materials: List[Dict]) -> List[Dict]:
"""
Obtiene todos los ejercicios de una lista de materiales.

Args:
materials: Lista de materiales extraídos

Returns:
Lista de ejercicios con sus metadatos
"""
all_exercises = []

for material in materials:
for exercise in material['exercises']:
# Buscar solución correspondiente
solution = None
for sol in material['solutions']:
if sol['exercise_label'] == exercise['label']:
solution = sol
break

exercise_data = {
'label': exercise['label'],
'content': exercise['resolved_content'],
'source_file': material['file_path'],
'frontmatter': material['frontmatter'],
'solution': solution['resolved_content'] if solution else None,
'solution_label': solution['label'] if solution else None
}
all_exercises.append(exercise_data)

return []

materials = []
for md_file in directory.rglob(pattern):
# Ignorar archivos en _build y otros directorios temporales
if '_build' in md_file.parts or 'node_modules' in md_file.parts:
continue

material = self.extract_from_file(md_file)
# Incluirlos si tienen ejercicios/soluciones O si parecen ser materiales de lectura/teoría
if material['exercises'] or material['solutions'] or 'lectura' in md_file.name.lower() or 'teoria' in md_file.name.lower():
materials.append(material)

return materials

def extract_by_topic(self, topic: str) -> List[Dict]:
    """
    Extract the materials that belong to a given topic.

    Looks in:
    - {base_path}/{topic}/*practica*.md
    - {base_path}/{topic}/*lectura*.md
    - {base_path}/tareas/<name>/<name>.md, kept when the frontmatter
      'subject' or 'tags' mention the topic
    - {base_path}/examenes/<name>/<name>.md, kept when the frontmatter
      'subject', 'tags' or 'keywords' mention the topic

    Matching is a case-insensitive substring test. Frontmatter fields that
    are null, a bare string instead of a list, or that contain non-string
    items are tolerated instead of raising.

    Args:
        topic: Topic name (e.g. "analisis_vectorial").

    Returns:
        List of extracted materials, as returned by extract_from_file.
    """
    needle = topic.lower()
    materials = []

    def mentions_topic(value) -> bool:
        # Frontmatter is hand-written YAML: a key that is present but empty
        # loads as None, and list fields are sometimes written as a scalar.
        if value is None:
            return False
        if isinstance(value, str):
            return needle in value.lower()
        if isinstance(value, (list, tuple, set)):
            return any(
                needle in str(item).lower()
                for item in value
                if item is not None
            )
        return needle in str(value).lower()

    def collect_from_subdirs(parent_name: str, fields) -> None:
        # Each sub-directory <name>/ is expected to hold a <name>.md file;
        # it is kept only when one of the given frontmatter fields matches.
        parent = self.base_path / parent_name
        if not parent.exists():
            return
        for subdir in parent.iterdir():
            if not subdir.is_dir():
                continue
            candidate = subdir / f"{subdir.name}.md"
            if not candidate.exists():
                continue
            material = self.extract_from_file(candidate)
            frontmatter = material['frontmatter'] or {}
            if any(
                mentions_topic(frontmatter.get(field, default))
                for field, default in fields
            ):
                materials.append(material)

    # Topic directory: practice sheets first, then readings (which may
    # also contain exercises).
    topic_dir = self.base_path / topic
    if topic_dir.exists():
        for glob_pattern in ("*practica*.md", "*lectura*.md"):
            for md_file in topic_dir.glob(glob_pattern):
                materials.append(self.extract_from_file(md_file))

    # Homework can span several topics, so filter by subject or tags.
    collect_from_subdirs("tareas", (('subject', ''), ('tags', [])))

    # Exams often carry a generic subject ("Examen X"), so keywords are
    # accepted as an additional match source.
    collect_from_subdirs(
        "examenes",
        (('subject', ''), ('tags', []), ('keywords', [])),
    )

    return materials

def get_all_exercises(self, materials: List[Dict]) -> List[Dict]:
    """
    Flatten a list of materials into a single list of exercises.

    Within each material, an exercise is paired with the first solution
    whose 'exercise_label' equals the exercise's 'label'; exercises with
    no such solution get None for both solution fields.

    Args:
        materials: Extracted materials; each one is read through its
            'exercises', 'solutions', 'file_path' and 'frontmatter' keys.

    Returns:
        One dict per exercise with the keys 'label', 'content',
        'source_file', 'frontmatter', 'solution' and 'solution_label'.
    """
    collected = []

    for entry in materials:
        # Index solutions by the exercise they answer. setdefault leaves an
        # existing key untouched, so the earliest solution wins.
        first_solution_for = {}
        for candidate in entry['solutions']:
            first_solution_for.setdefault(candidate['exercise_label'], candidate)

        for item in entry['exercises']:
            match = first_solution_for.get(item['label'])

            record = {
                'label': item['label'],
                'content': item['resolved_content'],
                'source_file': entry['file_path'],
                'frontmatter': entry['frontmatter'],
            }
            if match:
                record['solution'] = match['resolved_content']
                record['solution_label'] = match['label']
            else:
                record['solution'] = None
                record['solution_label'] = None

            collected.append(record)

    return collected

def clear_cache(self):
Expand Down