ContextLab
diff --git a/data/software.xlsx b/data/software.xlsx
7.65 KB
diff --git a/scripts/build_software.py b/scripts/build_software.py
Lines changed: 135 additions & 0 deletions
diff --git a/scripts/extract_software.py b/scripts/extract_software.py
Lines changed: 150 additions & 0 deletions
@@ -0,0 +1,135 @@
+#!/usr/bin/env python3
+"""Build software.html from spreadsheet data.
+
+Reads data from data/software.xlsx and generates software.html
+using the template in templates/software.html.
+"""
+from pathlib import Path
+from typing import List, Dict, Any
+import openpyxl
+
+from utils import inject_content
+
+
+def load_software(xlsx_path: Path) -> Dict[str, List[Dict[str, Any]]]:
+    """Load all software data from Excel spreadsheet.
+
+    Every worksheet becomes one entry in the result, keyed by its sheet
+    name. The first row of a sheet supplies the column headers; each later
+    row with at least one non-empty cell becomes a dict mapping header to
+    cell value, with empty cells stored as ''. Columns whose header cell is
+    empty are ignored.
+
+    Args:
+        xlsx_path: Path to the software.xlsx file
+
+    Returns:
+        Dictionary with keys for each section (python, javascript, matlab)
+        Each value is a list of software item dictionaries.
+    """
+    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
+
+    data = {}
+    # A read-only workbook has to be closed explicitly, so release it even
+    # when reading one of the sheets raises.
+    try:
+        for sheet_name in wb.sheetnames:
+            sheet = wb[sheet_name]
+
+            # Get headers from first row
+            headers = [cell.value for cell in sheet[1]]
+
+            rows = []
+            for row in sheet.iter_rows(min_row=2, values_only=True):
+                # Skip empty rows
+                if all(cell is None for cell in row):
+                    continue
+
+                rows.append({
+                    header: '' if value is None else value
+                    for header, value in zip(headers, row)
+                    if header is not None  # unnamed column: nothing to key on
+                })
+
+            data[sheet_name] = rows
+    finally:
+        wb.close()
+
+    return data
+
+
+def generate_software_item(item: Dict[str, Any]) -> str:
+    """Generate HTML for a single software item.
+
+    The name is treated as plain text and HTML-escaped before it is wrapped
+    in <strong>; description and links_html are treated as ready-made HTML
+    and inserted verbatim. Values that are not strings (for example a
+    number read from a spreadsheet cell) are converted with str(), and None
+    counts as missing. Missing or empty fields are left out.
+
+    Args:
+        item: Dictionary with software data (name, description, links_html)
+
+    Returns:
+        HTML string for the software item paragraph
+    """
+    import html
+
+    def as_text(value: Any) -> str:
+        # Spreadsheet cells may hold numbers or None; str.join needs str.
+        return '' if value is None else str(value)
+
+    name = as_text(item.get('name', ''))
+    description = as_text(item.get('description', ''))
+    links_html = as_text(item.get('links_html', ''))
+
+    # Build the paragraph
+    parts = []
+    if name:
+        # quote=False: the name lands in element content, not an attribute.
+        parts.append(f'<strong>{html.escape(name, quote=False)}.</strong>')
+    if description:
+        parts.append(description)
+    if links_html:
+        parts.append(links_html)
+
+    return f'<p>{" ".join(parts)}</p>'
+
+
+def generate_section_content(items: List[Dict[str, Any]]) -> str:
+    """Render all software items of one section as HTML.
+
+    Args:
+        items: List of software item dictionaries
+
+    Returns:
+        The paragraphs produced by generate_software_item, in input order,
+        separated by a blank line followed by indentation spaces; an empty
+        string when there are no items
+    """
+    paragraphs = map(generate_software_item, items) if items else ()
+    return '\n\n                '.join(paragraphs)
+
+
+def build_software(
+    data_path: Path,
+    template_path: Path,
+    output_path: Path
+) -> None:
+    """Build software.html from data and template.
+
+    Loads the spreadsheet, renders the python, javascript and matlab
+    sheets to HTML, hands the rendered sections to inject_content and
+    prints a one-line summary.
+
+    Args:
+        data_path: Path to software.xlsx
+        template_path: Path to template HTML file
+        output_path: Path for generated HTML file
+    """
+    sections = load_software(data_path)
+
+    # Replacement key -> worksheet whose items fill it. A sheet missing
+    # from the workbook renders as an empty section.
+    # NOTE(review): how inject_content uses these keys is defined in
+    # utils, not visible here - confirm they match the template markers.
+    key_to_sheet = (
+        ('PYTHON_CONTENT', 'python'),
+        ('JAVASCRIPT_CONTENT', 'javascript'),
+        ('MATLAB_CONTENT', 'matlab'),
+    )
+    replacements = {
+        key: generate_section_content(sections.get(sheet, []))
+        for key, sheet in key_to_sheet
+    }
+
+    inject_content(template_path, output_path, replacements)
+
+    # The reported count covers every sheet in the workbook.
+    total = 0
+    for sheet_items in sections.values():
+        total += len(sheet_items)
+    print(f"Generated {output_path} with {total} software items")
+
+
+def main():
+    """Main entry point for CLI usage.
+
+    Resolves the spreadsheet, template and output locations relative to
+    the parent of the directory containing this file, then runs
+    build_software on them.
+    """
+    script_dir = Path(__file__).parent
+    project_root = script_dir.parent
+
+    build_software(
+        data_path=project_root / 'data' / 'software.xlsx',
+        template_path=project_root / 'templates' / 'software.html',
+        output_path=project_root / 'software.html',
+    )
+
+
+# Run the build only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+"""Extract software data from existing HTML into Excel spreadsheet.
+
+This is a one-time script to migrate existing HTML data to the spreadsheet format.
+"""
+import re
+from pathlib import Path
+from bs4 import BeautifulSoup
+import openpyxl
+
+
+def extract_software(html_path: Path) -> dict:
+    """Extract all software data from HTML file.
+
+    For each of the python, javascript and matlab sections this looks up
+    the <section> with that id, then its <div class="software-list">, and
+    converts every direct <p> child with extract_software_item. Items
+    that end up without a name are dropped. A missing section or list
+    prints a warning and yields an empty list for that key.
+
+    Returns dict with keys: python, javascript, matlab
+    """
+    with open(html_path, 'r', encoding='utf-8') as f:
+        markup = f.read()
+    soup = BeautifulSoup(markup, 'html.parser')
+
+    # Result key -> id attribute of the <section> to read.
+    sections = {
+        'python': 'python',
+        'javascript': 'javascript',
+        'matlab': 'matlab'
+    }
+
+    data = {}
+
+    for key, section_id in sections.items():
+        # Start from the empty result; replaced below when items are found.
+        data[key] = []
+
+        section = soup.find('section', id=section_id)
+        if not section:
+            print(f"Warning: Section '{section_id}' not found")
+            continue
+
+        software_list = section.find('div', class_='software-list')
+        if not software_list:
+            print(f"Warning: No software-list in '{section_id}'")
+            continue
+
+        # Only direct <p> children count as software entries.
+        parsed = (
+            extract_software_item(p)
+            for p in software_list.find_all('p', recursive=False)
+        )
+        items = [item for item in parsed if item['name']]
+
+        data[key] = items
+        print(f"Extracted {len(items)} items from '{key}'")
+
+    return data
+
+
+def extract_software_item(p) -> dict:
+    """Extract data from a single software paragraph.
+
+    Format: <strong>Name.</strong> Description. [<a>Link</a>]
+
+    Args:
+        p: Parsed paragraph element; it must provide find() and be
+            accepted by get_inner_html
+
+    Returns:
+        Dict with three string values:
+        name - text of the first <strong> with trailing periods removed,
+            '' when there is no <strong>;
+        links_html - the first square-bracketed group containing an <a>
+            element, brackets included, '' when there is none;
+        description - the paragraph HTML before that bracketed group,
+            minus the leading <strong> element and one leading period.
+    """
+    item = {
+        'name': '',
+        'description': '',
+        'links_html': ''
+    }
+
+    # Get the name from <strong> tag
+    strong = p.find('strong')
+    if strong:
+        name = strong.get_text(strip=True)
+        # Remove trailing period if present
+        item['name'] = name.rstrip('.')
+
+    # Get the full inner HTML
+    inner_html = get_inner_html(p)
+
+    # Extract links at the end (text in square brackets)
+    # Links are usually at the end like [<a href="...">GitHub</a>]
+    link_match = re.search(r'\[([^\]]*<a[^>]*>[^<]*</a>[^\]]*)\]', inner_html)
+    if link_match:
+        item['links_html'] = f'[{link_match.group(1)}]'
+        # Remove the link from the HTML to get description; anything
+        # after the bracketed group is discarded as well.
+        inner_html = inner_html[:link_match.start()].strip()
+
+    # Now extract description (everything after the name)
+    # Remove only the first <strong>...</strong>, which holds the name, so
+    # bold text inside the description itself is kept.
+    desc_html = re.sub(r'<strong>[^<]*</strong>\s*', '', inner_html, count=1)
+    # Clean up the description
+    desc_html = desc_html.strip()
+    # Remove leading period if present
+    if desc_html.startswith('.'):
+        desc_html = desc_html[1:].strip()
+
+    item['description'] = desc_html
+
+    return item
+
+
+def get_inner_html(element) -> str:
+    """Return the element's children as one string.
+
+    Each child is converted with str(), the pieces are concatenated in
+    order, and leading and trailing whitespace is stripped.
+    """
+    pieces = map(str, element.children)
+    return ''.join(pieces).strip()
+
+
+def save_to_excel(data: dict, output_path: Path):
+    """Save extracted data to Excel spreadsheet with multiple sheets.
+
+    Creates one worksheet each for python, javascript and matlab. Row 1
+    holds the column names (name, description, links_html) and every item
+    of data[sheet] fills one following row; a field missing from an item
+    is written as ''. Each used column gets a width of 60. The workbook
+    is saved to output_path and a confirmation line is printed.
+    """
+    wb = openpyxl.Workbook()
+    # Remove the sheet that is active on the fresh workbook so that only
+    # the sheets created below remain.
+    wb.remove(wb.active)
+
+    sheet_configs = [
+        ('python', ['name', 'description', 'links_html']),
+        ('javascript', ['name', 'description', 'links_html']),
+        ('matlab', ['name', 'description', 'links_html']),
+    ]
+
+    for sheet_name, columns in sheet_configs:
+        ws = wb.create_sheet(title=sheet_name)
+
+        # Lay the sheet out as a table: header row first, then one row of
+        # values per item, in column order.
+        table = [columns]
+        table.extend(
+            [item.get(header, '') for header in columns]
+            for item in data.get(sheet_name, [])
+        )
+        for row_num, values in enumerate(table, 1):
+            for col, value in enumerate(values, 1):
+                ws.cell(row=row_num, column=col, value=value)
+
+        # Adjust column widths
+        for col in range(1, len(columns) + 1):
+            letter = openpyxl.utils.get_column_letter(col)
+            ws.column_dimensions[letter].width = 60
+
+    wb.save(output_path)
+    print(f"Saved to {output_path}")
+
+
+def main():
+    """Migrate software.html into data/software.xlsx.
+
+    Both paths are resolved relative to the parent of the directory
+    containing this file. Progress and the total item count are printed.
+    """
+    root = Path(__file__).parent.parent
+    html_path = root / 'software.html'
+    output_path = root / 'data' / 'software.xlsx'
+
+    print(f"Extracting from: {html_path}")
+    data = extract_software(html_path)
+
+    total = 0
+    for section_items in data.values():
+        total += len(section_items)
+    print(f"\nTotal items extracted: {total}")
+
+    save_to_excel(data, output_path)
+    print("Done!")
+
+
+# Run the extraction only when executed as a script, not on import.
+if __name__ == '__main__':
+    main()