Skip to content

Commit d8cebf7

Browse files
jeremymanning and claude committed
Add software.html build system
- extract_software.py: One-time extraction from HTML to Excel
- build_software.py: Generate HTML from data/software.xlsx
- templates/software.html: Template with content markers
- data/software.xlsx: Extracted data (9 python, 1 javascript, 10 matlab tools)
- test_build_software.py: 12 passing tests

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <[email protected]>
1 parent 0e40b6e commit d8cebf7

File tree

5 files changed

+681
-0
lines changed

5 files changed

+681
-0
lines changed

data/software.xlsx

7.65 KB
Binary file not shown.

scripts/build_software.py

Lines changed: 135 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
#!/usr/bin/env python3
2+
"""Build software.html from spreadsheet data.
3+
4+
Reads data from data/software.xlsx and generates software.html
5+
using the template in templates/software.html.
6+
"""
7+
from pathlib import Path
8+
from typing import List, Dict, Any
9+
import openpyxl
10+
11+
from utils import inject_content
12+
13+
14+
def load_software(xlsx_path: Path) -> Dict[str, List[Dict[str, Any]]]:
    """Load all software data from Excel spreadsheet.

    Every worksheet in the workbook becomes one section. The first row of a
    sheet supplies the column headers; each later row that has at least one
    non-empty cell becomes a dictionary keyed by those headers, with empty
    cells stored as ''.

    Args:
        xlsx_path: Path to the software.xlsx file

    Returns:
        Dictionary with keys for each section (python, javascript, matlab)
        Each value is a list of software item dictionaries.
    """
    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)

    data = {}
    try:
        for sheet_name in wb.sheetnames:
            sheet = wb[sheet_name]

            # Get headers from first row
            headers = [cell.value for cell in sheet[1]]

            rows = []
            for row in sheet.iter_rows(min_row=2, values_only=True):
                # Skip empty rows
                if all(cell is None for cell in row):
                    continue

                # Empty cells are normalised to '' so consumers can rely on
                # simple truthiness checks.
                rows.append({
                    header: '' if value is None else value
                    for header, value in zip(headers, row)
                })

            data[sheet_name] = rows
    finally:
        # A read-only workbook must be closed explicitly; do it even when
        # reading a sheet raises so the file is not left open.
        wb.close()

    return data
51+
52+
53+
def generate_software_item(item: Dict[str, Any]) -> str:
    """Generate HTML for a single software item.

    Args:
        item: Dictionary with software data (name, description, links_html).
            Missing or empty fields are left out of the output. Values are
            inserted as-is (no HTML escaping is applied).

    Returns:
        HTML string for the software item paragraph
    """
    parts = []

    name = item.get('name', '')
    if name:
        parts.append(f'<strong>{name}.</strong>')

    # Values may be non-strings (e.g. a numeric spreadsheet cell) and
    # str.join() rejects those, so coerce each field to text first.
    for key in ('description', 'links_html'):
        value = item.get(key, '')
        if value:
            parts.append(str(value))

    return f'<p>{" ".join(parts)}</p>'
76+
77+
78+
def generate_section_content(items: List[Dict[str, Any]]) -> str:
    """Render every item of one software section as HTML.

    Args:
        items: Software item dictionaries belonging to the section

    Returns:
        The items' HTML paragraphs joined together, or '' when there are
        no items
    """
    if items:
        return '\n\n '.join(map(generate_software_item, items))
    return ''
92+
93+
94+
def build_software(
    data_path: Path,
    template_path: Path,
    output_path: Path
) -> None:
    """Render software.html from the spreadsheet data and the template.

    Args:
        data_path: Path to software.xlsx
        template_path: Path to template HTML file
        output_path: Path for generated HTML file
    """
    sections = load_software(data_path)

    # Replacement key -> sheet supplying its content; a sheet that is
    # missing from the workbook yields empty content.
    key_to_sheet = (
        ('PYTHON_CONTENT', 'python'),
        ('JAVASCRIPT_CONTENT', 'javascript'),
        ('MATLAB_CONTENT', 'matlab'),
    )
    replacements = {
        key: generate_section_content(sections.get(sheet, []))
        for key, sheet in key_to_sheet
    }

    inject_content(template_path, output_path, replacements)

    # Report the number of items loaded across all sheets
    item_count = sum(map(len, sections.values()))
    print(f"Generated {output_path} with {item_count} software items")
122+
123+
124+
def main():
    """Build software.html using the project's default file locations."""
    # The project root is the parent of the directory holding this script.
    root = Path(__file__).parent.parent
    build_software(
        root / 'data' / 'software.xlsx',
        root / 'templates' / 'software.html',
        root / 'software.html',
    )
132+
133+
134+
# Run the build only when this file is executed as a script.
if __name__ == "__main__":
    main()

scripts/extract_software.py

Lines changed: 150 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,150 @@
1+
#!/usr/bin/env python3
2+
"""Extract software data from existing HTML into Excel spreadsheet.
3+
4+
This is a one-time script to migrate existing HTML data to the spreadsheet format.
5+
"""
6+
import re
7+
from pathlib import Path
8+
from bs4 import BeautifulSoup
9+
import openpyxl
10+
11+
12+
def extract_software(html_path: Path) -> dict:
    """Collect the software items of every section in an HTML file.

    Returns dict with keys: python, javascript, matlab. Each value is the
    list of items found in that section (empty when the section or its
    software list is missing, in which case a warning is printed).
    """
    with open(html_path, 'r', encoding='utf-8') as handle:
        markup = handle.read()
    soup = BeautifulSoup(markup, 'html.parser')

    # Output key -> id attribute of the <section> to read.
    section_ids = {
        'python': 'python',
        'javascript': 'javascript',
        'matlab': 'matlab'
    }

    extracted = {}
    for key, section_id in section_ids.items():
        section = soup.find('section', id=section_id)
        if not section:
            print(f"Warning: Section '{section_id}' not found")
            extracted[key] = []
            continue

        container = section.find('div', class_='software-list')
        if not container:
            print(f"Warning: No software-list in '{section_id}'")
            extracted[key] = []
            continue

        # Only direct <p> children are items; paragraphs without a name
        # are dropped.
        candidates = (
            extract_software_item(paragraph)
            for paragraph in container.find_all('p', recursive=False)
        )
        entries = [entry for entry in candidates if entry['name']]

        extracted[key] = entries
        print(f"Extracted {len(entries)} items from '{key}'")

    return extracted
51+
52+
53+
def extract_software_item(p) -> dict:
    """Extract data from a single software paragraph.

    Format: <strong>Name.</strong> Description. [<a>Link</a>]

    Args:
        p: Parsed paragraph element; it must support find('strong') and be
            accepted by get_inner_html.

    Returns:
        Dict with keys 'name' (text of the first <strong>, trailing periods
        removed), 'description' (HTML between the name and the links) and
        'links_html' (HTML from the first bracketed link group to the end
        of the paragraph). Parts that are absent are ''.
    """
    item = {
        'name': '',
        'description': '',
        'links_html': ''
    }

    # Get the name from <strong> tag
    strong = p.find('strong')
    if strong:
        # Remove trailing period if present
        item['name'] = strong.get_text(strip=True).rstrip('.')

    # Get the full inner HTML
    inner_html = get_inner_html(p)

    # Split off the links: everything from the first [...<a>...</a>...]
    # group onwards. Keeping the whole tail, rather than only the first
    # group, preserves further link groups and any trailing text instead of
    # silently discarding them.
    link_match = re.search(r'\[([^\]]*<a[^>]*>[^<]*</a>[^\]]*)\]', inner_html)
    if link_match:
        item['links_html'] = inner_html[link_match.start():].strip()
        inner_html = inner_html[:link_match.start()].strip()

    # Now extract description (everything after the name).
    # Remove only the first <strong>...</strong>; count=1 keeps any later
    # <strong> emphasis that belongs to the description itself.
    desc_html = re.sub(r'<strong>[^<]*</strong>\s*', '', inner_html, count=1)
    desc_html = desc_html.strip()
    # Remove leading period if present (name written as <strong>Name</strong>.)
    if desc_html.startswith('.'):
        desc_html = desc_html[1:].strip()

    item['description'] = desc_html

    return item
94+
95+
96+
def get_inner_html(element) -> str:
    """Return the element's children serialised to one stripped string."""
    pieces = map(str, element.children)
    return ''.join(pieces).strip()
99+
100+
101+
def save_to_excel(data: dict, output_path: Path):
    """Write the extracted items to a workbook with one sheet per section.

    Each sheet gets a header row followed by one row per item; sections
    missing from data produce a sheet with only the header row.
    """
    workbook = openpyxl.Workbook()
    # Drop the automatically created sheet so only the named ones remain
    workbook.remove(workbook.active)

    sheet_configs = [
        ('python', ['name', 'description', 'links_html']),
        ('javascript', ['name', 'description', 'links_html']),
        ('matlab', ['name', 'description', 'links_html']),
    ]

    for title, fields in sheet_configs:
        sheet = workbook.create_sheet(title=title)

        # Header row
        for col_idx, field in enumerate(fields, start=1):
            sheet.cell(row=1, column=col_idx, value=field)

        # One row per item, starting directly below the header
        for row_idx, entry in enumerate(data.get(title, []), start=2):
            for col_idx, field in enumerate(fields, start=1):
                cell_value = entry.get(field, '')
                sheet.cell(row=row_idx, column=col_idx, value=cell_value)

        # Give every column the same fixed width
        for col_idx in range(1, len(fields) + 1):
            letter = openpyxl.utils.get_column_letter(col_idx)
            sheet.column_dimensions[letter].width = 60

    workbook.save(output_path)
    print(f"Saved to {output_path}")
132+
133+
134+
def main():
    """Extract software.html into data/software.xlsx, printing progress."""
    # The project root is the parent of the directory holding this script.
    root = Path(__file__).parent.parent
    source_html = root / 'software.html'
    workbook_path = root / 'data' / 'software.xlsx'

    print(f"Extracting from: {source_html}")
    extracted = extract_software(source_html)

    item_count = sum(map(len, extracted.values()))
    print(f"\nTotal items extracted: {item_count}")

    save_to_excel(extracted, workbook_path)
    print("Done!")
147+
148+
149+
# Run the extraction only when this file is executed as a script.
if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)