From b6bbcac7bd388da7412dbe644b2e066adf87068e Mon Sep 17 00:00:00 2001 From: Claude Date: Tue, 9 Jun 2026 12:00:15 +0000 Subject: [PATCH] Add project-level MarkItDown Claude Code skill Install the community MarkItDown skill (SKILL.md, references, scripts) under .claude/skills/markitdown so it travels with the repository. Source: github.com/julianobarbosa/claude-code-skills (skills/markitdown), unmodified. --- .claude/skills/markitdown/SKILL.md | 310 +++++++++ .../references/advanced-features.md | 322 +++++++++ .../markitdown/references/api-reference.md | 416 +++++++++++ .../markitdown/references/cli-reference.md | 204 ++++++ .../skills/markitdown/references/examples.md | 648 ++++++++++++++++++ .../markitdown/scripts/batch-convert.py | 190 +++++ .../markitdown/scripts/batch-convert.sh | 87 +++ .../markitdown/scripts/convert-jupyter.py | 114 +++ 8 files changed, 2291 insertions(+) create mode 100644 .claude/skills/markitdown/SKILL.md create mode 100644 .claude/skills/markitdown/references/advanced-features.md create mode 100644 .claude/skills/markitdown/references/api-reference.md create mode 100644 .claude/skills/markitdown/references/cli-reference.md create mode 100644 .claude/skills/markitdown/references/examples.md create mode 100755 .claude/skills/markitdown/scripts/batch-convert.py create mode 100755 .claude/skills/markitdown/scripts/batch-convert.sh create mode 100644 .claude/skills/markitdown/scripts/convert-jupyter.py diff --git a/.claude/skills/markitdown/SKILL.md b/.claude/skills/markitdown/SKILL.md new file mode 100644 index 000000000..dc864a080 --- /dev/null +++ b/.claude/skills/markitdown/SKILL.md @@ -0,0 +1,310 @@ +--- +name: markitdown +description: Guide for using Microsoft MarkItDown - a Python utility for converting files to Markdown. Use when converting PDF, Word, PowerPoint, Excel, images, audio, HTML, CSV, JSON, XML, ZIP, YouTube URLs, EPubs, Jupyter notebooks, RSS feeds, or Wikipedia pages to Markdown format. 
Also use for document processing pipelines, LLM preprocessing, or text extraction tasks. +--- + +# MarkItDown Skill + +Microsoft's Python utility for converting various file formats to Markdown +for LLM and text analysis pipelines. + +## Overview + +MarkItDown converts documents while preserving structure (headings, lists, +tables, links). It's optimized for LLM consumption rather than +human-readable output. + +### Supported Formats + +| Category | Formats | +|----------|---------| +| Documents | PDF, Word (DOCX), PowerPoint (PPTX), Excel (XLSX, XLS) | +| Media | Images (EXIF + OCR), Audio (WAV, MP3 transcription) | +| Web | HTML, YouTube URLs, Wikipedia, RSS/Atom feeds | +| Data | CSV, JSON, XML, Jupyter notebooks (.ipynb) | +| Archives | ZIP (iterates contents), EPub | +| Email | Outlook MSG files | + +## Quick Start + +### Installation + +```bash +# Full installation (recommended) +pip install 'markitdown[all]' + +# Minimal with specific formats +pip install 'markitdown[pdf,docx,pptx]' + +# Using uv +uv pip install 'markitdown[all]' +``` + +#### Optional Dependencies + +| Extra | Description | +|-------|-------------| +| `[all]` | All optional dependencies | +| `[pdf]` | PDF file support | +| `[docx]` | Word documents | +| `[pptx]` | PowerPoint presentations | +| `[xlsx]` | Excel spreadsheets | +| `[xls]` | Legacy Excel files | +| `[outlook]` | Outlook MSG files | +| `[az-doc-intel]` | Azure Document Intelligence | +| `[audio-transcription]` | WAV/MP3 transcription | +| `[youtube-transcription]` | YouTube video transcripts | + +### Command-Line Usage + +```bash +# Basic conversion +markitdown document.pdf > output.md + +# Specify output file +markitdown document.pdf -o output.md + +# Pipe input +cat document.pdf | markitdown > output.md + +# With Azure Document Intelligence +markitdown document.pdf -o output.md -d -e "https://your-resource.cognitiveservices.azure.com/" +``` + +### Python API + +```python +from markitdown import MarkItDown + +# Basic conversion +md = MarkItDown() +result = 
md.convert("document.xlsx") +print(result.text_content) + +# With LLM for image descriptions +from openai import OpenAI + +client = OpenAI() +md = MarkItDown( + llm_client=client, + llm_model="gpt-4o", + llm_prompt="Describe this image in detail" +) +result = md.convert("image.jpg") +print(result.text_content) + +# With Azure Document Intelligence +md = MarkItDown(docintel_endpoint="https://your-resource.cognitiveservices.azure.com/") +result = md.convert("complex-document.pdf") +print(result.text_content) +``` + +## Common Use Cases + +### Batch Convert Directory + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() +input_dir = Path("./documents") +output_dir = Path("./markdown") +output_dir.mkdir(exist_ok=True) + +for file in input_dir.glob("*"): + if file.is_file(): + try: + result = md.convert(str(file)) + output_file = output_dir / f"{file.stem}.md" + output_file.write_text(result.text_content) + print(f"Converted: {file.name}") + except Exception as e: + print(f"Failed: {file.name} - {e}") +``` + +### Process for LLM Context + +```python +from markitdown import MarkItDown + +def prepare_for_llm(file_path: str) -> str: + """Convert document to LLM-ready markdown.""" + md = MarkItDown() + result = md.convert(file_path) + + # Add source reference + content = f"# Source: {file_path}\n\n{result.text_content}" + return content + +# Use with your LLM +context = prepare_for_llm("report.pdf") +``` + +### Extract YouTube Transcript + +```bash +# CLI +markitdown "https://www.youtube.com/watch?v=VIDEO_ID" > transcript.md +``` + +```python +# Python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("https://www.youtube.com/watch?v=VIDEO_ID") +print(result.text_content) +``` + +### Image OCR with AI Description + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize with LLM support +client = OpenAI() +md = MarkItDown( + llm_client=client, + llm_model="gpt-4o" +) + +# Convert image with AI description +result = 
md.convert("screenshot.png") +print(result.text_content) +``` + +### Convert Jupyter Notebook + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("analysis.ipynb") +print(result.text_content) # Code cells, outputs, markdown +``` + +### Extract Wikipedia Content + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("https://en.wikipedia.org/wiki/Python") +print(result.text_content) # Main article content only +``` + +### Parse RSS Feed + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("https://example.com/feed.xml") +print(result.text_content) # Feed entries as markdown +``` + +## Plugin System + +MarkItDown supports third-party plugins for extended functionality. + +```bash +# List installed plugins +markitdown --list-plugins + +# Enable plugins during conversion +markitdown --use-plugins document.pdf +``` + +```python +# Enable plugins in Python +md = MarkItDown(enable_plugins=True) +result = md.convert("document.pdf") +``` + +> Search GitHub for `#markitdown-plugin` to find available plugins. + +## MCP Server Integration + +MarkItDown offers an MCP (Model Context Protocol) server for integration +with LLM applications like Claude Desktop. + +```bash +# Install MCP server +pip install markitdown-mcp + +# Or from source +git clone https://github.com/microsoft/markitdown.git +cd markitdown/packages/markitdown-mcp +pip install -e . +``` + +See [markitdown-mcp][mcp-repo] for configuration details. + +[mcp-repo]: https://github.com/microsoft/markitdown/tree/main/packages/markitdown-mcp + +## Docker Usage + +```bash +# Build image +docker build -t markitdown:latest . 
+ +# Convert file +docker run --rm -i markitdown:latest < document.pdf > output.md +``` + +## Troubleshooting + +| Issue | Solution | +|-------|----------| +| Missing dependencies | Install with `pip install 'markitdown[all]'` | +| PDF extraction fails | Try Azure Document Intelligence for complex PDFs | +| Image text not extracted | Ensure OCR dependencies installed or use LLM mode | +| Large file timeout | Process in chunks or use streaming | +| Plugin not found | Run `markitdown --list-plugins` to verify installation | + +### Common Errors + +```bash +# ModuleNotFoundError for specific format +pip install 'markitdown[pdf]' # Install missing dependency + +# Azure authentication +export AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://your-resource.cognitiveservices.azure.com/" +export AZURE_DOCUMENT_INTELLIGENCE_KEY="your-key" +``` + +## Requirements + +- Python >= 3.10 +- Virtual environment recommended + +```bash +# Create virtual environment +python -m venv .venv +source .venv/bin/activate # Linux/macOS +.venv\Scripts\activate # Windows + +# Install +pip install 'markitdown[all]' +``` + +## References + +- `references/cli-reference.md` - Complete CLI options +- `references/api-reference.md` - Python API details +- `references/examples.md` - Extended examples +- `references/advanced-features.md` - Custom converters, URI handling +- GitHub: [microsoft/markitdown](https://github.com/microsoft/markitdown) +- PyPI: [markitdown](https://pypi.org/project/markitdown/) + +--- + +## Gotchas + +- **DOCX with embedded images: images extract to separate files; markdown uses absolute paths** — moving the markdown file alone breaks the image refs. +- **PDF OCR confidence isn't surfaced** — low-confidence text is returned as if certain; downstream LLM use can be confidently wrong. +- **XLSX merged cells extract as separate cells with empty values for non-anchor positions** — pivoted reports lose their column groupings invisibly. +- **HTML to markdown loses CSS-driven layout** — column-positioned tables collapse to row-major linear output; complex tables become unparseable. 
+- **LLM image descriptions (`llm_client`/`llm_model` in the Python API; the CLI has no `--use-llm` flag) silently fall back to filename if no OPENAI_API_KEY** — outputs look populated but contain no real description. diff --git a/.claude/skills/markitdown/references/advanced-features.md b/.claude/skills/markitdown/references/advanced-features.md new file mode 100644 index 000000000..8530342cd --- /dev/null +++ b/.claude/skills/markitdown/references/advanced-features.md @@ -0,0 +1,322 @@ +# MarkItDown Advanced Features + +Advanced functionality for custom converters, URI handling, and plugins. + +## Conversion Methods + +MarkItDown provides multiple conversion entry points. + +### convert() + +Universal method that auto-detects source type. + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# File path +result = md.convert("document.pdf") + +# URL +result = md.convert("https://example.com/page.html") + +# HTTP Response object +import requests +response = requests.get("https://example.com/doc.pdf") +result = md.convert(response) +``` + +### convert_local() + +For local file paths only. + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert_local("./documents/report.pdf") +print(result.text_content) +``` + +### convert_stream() + +For binary file-like objects. + +```python +from markitdown import MarkItDown +import io + +md = MarkItDown() + +# From bytes +with open("document.pdf", "rb") as f: + content = f.read() + +stream = io.BytesIO(content) +result = md.convert_stream(stream) + +# From HTTP response +import requests +response = requests.get("https://example.com/doc.pdf") +stream = io.BytesIO(response.content) +result = md.convert_stream(stream) +``` + +> **Note:** `convert_stream()` requires binary streams only (v0.1.0+). +> Text streams (`io.StringIO`) are not supported. + +### convert_url() + +For HTTP/HTTPS URLs. 
+ +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert_url("https://example.com/document.pdf") +print(result.text_content) +``` + +### convert_uri() + +For any URI scheme (http, https, file, data). + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# HTTP URL +result = md.convert_uri("https://example.com/page.html") + +# Local file URI +result = md.convert_uri("file:///path/to/document.pdf") + +# Data URI +result = md.convert_uri("data:text/plain;base64,SGVsbG8gV29ybGQ=") +``` + +### convert_response() + +For `requests.Response` objects. + +```python +from markitdown import MarkItDown +import requests + +md = MarkItDown() +response = requests.get("https://example.com/report.pdf") +result = md.convert_response(response) +print(result.text_content) +``` + +## Result Object + +The `DocumentConverterResult` contains conversion output. + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("document.pdf") + +# Main content +print(result.text_content) + +# Alias for text_content +print(result.markdown) + +# Document title (if available) +if result.title: + print(f"Title: {result.title}") +``` + +## Custom Converters + +Create custom converters for unsupported formats. 
+ +### Basic Custom Converter + +```python +from markitdown import MarkItDown +from markitdown._base_converter import DocumentConverter +from markitdown._base_converter import DocumentConverterResult +from typing import BinaryIO + +class MyFormatConverter(DocumentConverter): + """Converter for .myformat files.""" + + def accepts( + self, + file_stream: BinaryIO, + stream_info: "StreamInfo", + **kwargs + ) -> bool: + """Check if this converter handles the file.""" + # Check by extension + if stream_info.extension: + return stream_info.extension.lower() == ".myformat" + # Check by MIME type + if stream_info.mime_type: + return stream_info.mime_type == "application/x-myformat" + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: "StreamInfo", + **kwargs + ) -> DocumentConverterResult: + """Convert the file to markdown.""" + content = file_stream.read().decode("utf-8") + + # Process content... + markdown = f"# My Format\n\n{content}" + + return DocumentConverterResult( + title="My Document", + text_content=markdown + ) + +# Register the converter +md = MarkItDown() +md.register_converter(MyFormatConverter()) + +# Use it +result = md.convert("document.myformat") +``` + +### Converter Priority + +Converters are matched in priority order (lower = higher priority). + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Register with high priority (runs first) +md.register_converter(MyConverter(), priority=10) + +# Register with default priority +md.register_converter(AnotherConverter()) # Default priority + +# Register with low priority (fallback) +md.register_converter(FallbackConverter(), priority=100) +``` + +## Plugin Development + +Create distributable plugins for MarkItDown. 
+ +### Plugin Structure + +```text +markitdown-my-plugin/ + src/ + markitdown_my_plugin/ + __init__.py + __about__.py + _plugin.py + pyproject.toml +``` + +### Plugin Entry Point + +```python +# _plugin.py +from markitdown import MarkItDown +from ._converter import MyConverter + +def register_converters(markitdown: MarkItDown, **kwargs): + """Called when MarkItDown is instantiated with plugins.""" + markitdown.register_converter(MyConverter()) +``` + +### pyproject.toml + +```toml +[project] +name = "markitdown-my-plugin" +version = "0.1.0" +dependencies = ["markitdown>=0.1.0"] + +[project.entry-points."markitdown.plugin"] +my_plugin = "markitdown_my_plugin._plugin:register_converters" +``` + +### Using Plugins + +```python +from markitdown import MarkItDown + +# Enable all installed plugins +md = MarkItDown(enable_plugins=True) +result = md.convert("document.myformat") +``` + +```bash +# CLI: List installed plugins +markitdown --list-plugins + +# CLI: Enable plugins +markitdown --use-plugins document.myformat +``` + +## Built-in Converters Reference + +| Converter | Formats | Notes | +|-----------|---------|-------| +| `PdfConverter` | .pdf | Requires `[pdf]` extra | +| `DocxConverter` | .docx | Requires `[docx]` extra | +| `PptxConverter` | .pptx | Requires `[pptx]` extra | +| `XlsxConverter` | .xlsx | Requires `[xlsx]` extra | +| `XlsConverter` | .xls | Requires `[xls]` extra | +| `HtmlConverter` | .html, .htm | Built-in | +| `CsvConverter` | .csv | Built-in | +| `PlainTextConverter` | .txt, .md | Built-in | +| `ImageConverter` | .jpg, .png, .gif | EXIF + optional LLM | +| `AudioConverter` | .wav, .mp3 | Requires `[audio-transcription]` | +| `YouTubeConverter` | YouTube URLs | Requires `[youtube-transcription]` | +| `WikipediaConverter` | Wikipedia URLs | Built-in | +| `RssConverter` | RSS/Atom feeds | Built-in | +| `IpynbConverter` | .ipynb | Built-in | +| `EpubConverter` | .epub | Built-in | +| `ZipConverter` | .zip | Built-in | +| `OutlookMsgConverter` | .msg 
| Requires `[outlook]` extra | +| `DocumentIntelligenceConverter` | Various | Requires `[az-doc-intel]` | + +## MCP Server + +MarkItDown provides an MCP server for LLM integration. + +### Installation + +```bash +pip install markitdown-mcp +``` + +### Claude Desktop Configuration + +Add to `claude_desktop_config.json`: + +```json +{ + "mcpServers": { + "markitdown": { + "command": "markitdown-mcp" + } + } +} +``` + +### MCP Tool + +The MCP server exposes a single tool: + +```python +convert_to_markdown(uri: str) -> str +``` + +Accepts `http:`, `https:`, `file:`, or `data:` URIs. diff --git a/.claude/skills/markitdown/references/api-reference.md b/.claude/skills/markitdown/references/api-reference.md new file mode 100644 index 000000000..6969e6f02 --- /dev/null +++ b/.claude/skills/markitdown/references/api-reference.md @@ -0,0 +1,416 @@ +# MarkItDown Python API Reference + +Complete Python API documentation for MarkItDown. + +## Installation + +```bash +pip install 'markitdown[all]' +``` + +## Core Classes + +### MarkItDown + +Main class for document conversion. + +```python +from markitdown import MarkItDown + +md = MarkItDown( + enable_plugins: bool = False, + llm_client: Any = None, + llm_model: str = None, + llm_prompt: str = None, + docintel_endpoint: str = None +) +``` + +#### Constructor Parameters + +| Parameter | Type | Default | Description | +|-----------|------|---------|-------------| +| `enable_plugins` | `bool` | `False` | Enable third-party plugins | +| `llm_client` | `Any` | `None` | OpenAI-compatible client for images | +| `llm_model` | `str` | `None` | Model name (e.g., "gpt-4o") | +| `llm_prompt` | `str` | `None` | Custom prompt for image descriptions | +| `docintel_endpoint` | `str` | `None` | Azure Document Intelligence endpoint | + +#### Methods + +##### convert() + +Convert a file or URL to Markdown. 
+ +```python +result = md.convert( + source: str | Path, + **kwargs +) -> DocumentConverterResult +``` + +| Parameter | Type | Description | +|-----------|------|-------------| +| `source` | `str` or `Path` | File path or URL to convert | + +Returns: `DocumentConverterResult` + +##### convert_stream() + +Convert from a binary file-like object. + +```python +result = md.convert_stream( + stream: BinaryIO, + **kwargs +) -> DocumentConverterResult +``` + +| Parameter | Type | Description | +|-----------|------|-------------| +| `stream` | `BinaryIO` | Binary file-like object (e.g., `io.BytesIO`) | + +> **Note:** As of v0.1.0, `convert_stream()` requires binary streams only. +> Text streams (`io.StringIO`) are no longer supported. + +### DocumentConverterResult + +Result object from conversion. + +```python +@dataclass +class DocumentConverterResult: + text_content: str # The converted Markdown content + title: str | None # Document title if available +``` + +## Basic Usage + +### Simple Conversion + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Convert file +result = md.convert("document.pdf") +print(result.text_content) + +# Access title if available +if result.title: + print(f"Title: {result.title}") +``` + +### Convert from Stream + +```python +from markitdown import MarkItDown +import io + +md = MarkItDown() + +# From bytes +with open("document.pdf", "rb") as f: + content = f.read() + +stream = io.BytesIO(content) +result = md.convert_stream(stream) +print(result.text_content) + +# From HTTP response +import requests + +response = requests.get("https://example.com/document.pdf") +stream = io.BytesIO(response.content) +result = md.convert_stream(stream) +``` + +### Convert URL + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# YouTube video +result = md.convert("https://www.youtube.com/watch?v=VIDEO_ID") +print(result.text_content) + +# Web page +result = md.convert("https://example.com/article.html") 
+print(result.text_content) +``` + +## Advanced Usage + +### LLM Image Descriptions + +Use OpenAI or compatible API for intelligent image descriptions. + +```python +from markitdown import MarkItDown +from openai import OpenAI + +# Initialize OpenAI client +client = OpenAI() # Uses OPENAI_API_KEY env var + +# Create MarkItDown with LLM support +md = MarkItDown( + llm_client=client, + llm_model="gpt-4o", + llm_prompt="Describe this image in detail, including any text visible." +) + +# Convert image with AI description +result = md.convert("screenshot.png") +print(result.text_content) + +# Convert PowerPoint with AI-described images +result = md.convert("presentation.pptx") +print(result.text_content) +``` + +### Azure Document Intelligence + +For complex PDFs with tables, forms, and scanned content. + +```python +from markitdown import MarkItDown + +# Using endpoint directly +md = MarkItDown( + docintel_endpoint="https://your-resource.cognitiveservices.azure.com/" +) + +result = md.convert("complex-form.pdf") +print(result.text_content) +``` + +```python +# With environment variables +import os + +os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"] = "https://..." 
+os.environ["AZURE_DOCUMENT_INTELLIGENCE_KEY"] = "your-key" + +md = MarkItDown(docintel_endpoint=os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"]) +result = md.convert("scanned-document.pdf") +``` + +### Plugin System + +```python +from markitdown import MarkItDown + +# Enable all installed plugins +md = MarkItDown(enable_plugins=True) + +result = md.convert("document.pdf") +print(result.text_content) +``` + +## Batch Processing + +### Process Directory + +```python +from markitdown import MarkItDown +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, as_completed + +def convert_file( + md: MarkItDown, file_path: Path, output_dir: Path +) -> tuple[Path, bool, str]: + """Convert a single file and return status.""" + try: + result = md.convert(str(file_path)) + output_file = output_dir / f"{file_path.stem}.md" + output_file.write_text(result.text_content) + return file_path, True, "" + except Exception as e: + return file_path, False, str(e) + +def batch_convert( + input_dir: str, + output_dir: str, + extensions: list[str] = None, + max_workers: int = 4 +) -> dict: + """Convert all files in directory.""" + input_path = Path(input_dir) + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + md = MarkItDown() + results = {"success": [], "failed": []} + + # Collect files + if extensions: + files = [] + for ext in extensions: + files.extend(input_path.glob(f"*.{ext}")) + else: + files = [f for f in input_path.iterdir() if f.is_file()] + + # Process in parallel + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(convert_file, md, f, output_path): f + for f in files + } + + for future in as_completed(futures): + file_path, success, error = future.result() + if success: + results["success"].append(str(file_path)) + else: + results["failed"].append({"file": str(file_path), "error": error}) + + return results + +# Usage +results = batch_convert( + input_dir="./documents", + 
output_dir="./markdown", + extensions=["pdf", "docx", "pptx"], + max_workers=4 +) +print(f"Converted: {len(results['success'])}") +print(f"Failed: {len(results['failed'])}") +``` + +### Process with Progress + +```python +from markitdown import MarkItDown +from pathlib import Path +from tqdm import tqdm + +def convert_with_progress(input_dir: str, output_dir: str): + """Convert files with progress bar.""" + input_path = Path(input_dir) + output_path = Path(output_dir) + output_path.mkdir(parents=True, exist_ok=True) + + md = MarkItDown() + files = list(input_path.glob("*")) + + for file in tqdm(files, desc="Converting"): + if file.is_file(): + try: + result = md.convert(str(file)) + output_file = output_path / f"{file.stem}.md" + output_file.write_text(result.text_content) + except Exception as e: + tqdm.write(f"Error: {file.name} - {e}") + +# Usage +convert_with_progress("./documents", "./markdown") +``` + +## Error Handling + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +try: + result = md.convert("document.pdf") + print(result.text_content) +except FileNotFoundError: + print("File not found") +except ValueError as e: + print(f"Conversion error: {e}") +except Exception as e: + print(f"Unexpected error: {e}") +``` + +### Safe Conversion Function + +```python +from markitdown import MarkItDown +from typing import Optional + +def safe_convert(file_path: str) -> Optional[str]: + """Safely convert file, returning None on error.""" + md = MarkItDown() + try: + result = md.convert(file_path) + return result.text_content + except Exception: + return None + +# Usage +content = safe_convert("document.pdf") +if content: + print(content) +else: + print("Conversion failed") +``` + +## Integration Examples + +### FastAPI Endpoint + +```python +from fastapi import FastAPI, UploadFile, HTTPException +from markitdown import MarkItDown +import io + +app = FastAPI() +md = MarkItDown() + +@app.post("/convert") +async def convert_document(file: UploadFile): + 
"""Convert uploaded document to Markdown.""" + try: + content = await file.read() + stream = io.BytesIO(content) + result = md.convert_stream(stream) + return {"markdown": result.text_content, "title": result.title} + except Exception as e: + raise HTTPException(status_code=400, detail=str(e)) +``` + +### LangChain Document Loader + +```python +from markitdown import MarkItDown +from langchain.schema import Document + +def load_document(file_path: str) -> Document: + """Load document as LangChain Document.""" + md = MarkItDown() + result = md.convert(file_path) + + return Document( + page_content=result.text_content, + metadata={ + "source": file_path, + "title": result.title or "" + } + ) + +# Usage +doc = load_document("report.pdf") +print(doc.page_content[:500]) +``` + +## Type Hints + +```python +from markitdown import MarkItDown, DocumentConverterResult +from pathlib import Path +from typing import BinaryIO + +def convert_file(path: str | Path) -> DocumentConverterResult: + md = MarkItDown() + return md.convert(str(path)) + +def convert_stream(stream: BinaryIO) -> str: + md = MarkItDown() + result = md.convert_stream(stream) + return result.text_content +``` diff --git a/.claude/skills/markitdown/references/cli-reference.md b/.claude/skills/markitdown/references/cli-reference.md new file mode 100644 index 000000000..70c33e8bd --- /dev/null +++ b/.claude/skills/markitdown/references/cli-reference.md @@ -0,0 +1,204 @@ +# MarkItDown CLI Reference + +Complete command-line interface documentation for MarkItDown. + +## Basic Syntax + +```bash +markitdown [OPTIONS] [INPUT_FILE] +``` + +## Arguments + +| Argument | Description | +|----------|-------------| +| `INPUT_FILE` | Path to file or URL to convert. Can also be piped via stdin. 
| + +## Options + +| Option | Short | Description | +|--------|-------|-------------| +| `--output` | `-o` | Output file path (default: stdout) | +| `--use-plugins` | | Enable third-party plugins | +| `--list-plugins` | | List installed plugins and exit | +| `--docintel` | `-d` | Use Azure Document Intelligence | +| `--endpoint` | `-e` | Azure Document Intelligence endpoint URL | +| `--help` | `-h` | Show help message | +| `--version` | | Show version | + +## Usage Examples + +### Basic Conversion + +```bash +# Convert PDF to markdown (stdout) +markitdown document.pdf + +# Save to file +markitdown document.pdf -o output.md + +# Convert Word document +markitdown report.docx -o report.md + +# Convert PowerPoint +markitdown presentation.pptx -o slides.md + +# Convert Excel +markitdown data.xlsx -o data.md +``` + +### Piping Input + +```bash +# Pipe file content +cat document.pdf | markitdown > output.md + +# Download and convert +curl -s "https://example.com/doc.pdf" | markitdown > output.md + +# From clipboard (macOS) +pbpaste | markitdown +``` + +### Batch Processing + +```bash +# Convert all PDFs in directory +for f in *.pdf; do + markitdown "$f" -o "${f%.pdf}.md" +done + +# Using find for recursive conversion +find . 
-name "*.docx" -exec sh -c 'markitdown "$1" -o "${1%.docx}.md"' _ {} \; + +# Parallel processing with xargs +ls *.pdf | xargs -P 4 -I {} sh -c 'markitdown "$1" -o "${1%.pdf}.md"' _ {} +``` + +### URL Conversion + +```bash +# YouTube video transcript +markitdown "https://www.youtube.com/watch?v=VIDEO_ID" -o transcript.md + +# Web page +markitdown "https://example.com/article" -o article.md +``` + +### Azure Document Intelligence + +```bash +# Set environment variables +ENDPOINT="https://your-resource.cognitiveservices.azure.com/" +export AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="$ENDPOINT" +export AZURE_DOCUMENT_INTELLIGENCE_KEY="your-key" + +# Convert with Document Intelligence +markitdown complex-document.pdf -d \ + -e "$AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT" \ + -o output.md + +# For better PDF extraction +markitdown scanned-document.pdf -d \ + -e "$AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT" +``` + +### Plugin Usage + +```bash +# List available plugins +markitdown --list-plugins + +# Convert with plugins enabled +markitdown --use-plugins document.pdf -o output.md + +# Check if specific plugin is installed +markitdown --list-plugins | grep "plugin-name" +``` + +## Exit Codes + +| Code | Description | +|------|-------------| +| 0 | Success | +| 1 | General error | +| 2 | Invalid arguments | + +## Environment Variables + +| Variable | Description | +|----------|-------------| +| `AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT` | Azure Document Intelligence endpoint | +| `AZURE_DOCUMENT_INTELLIGENCE_KEY` | Azure Document Intelligence API key | +| `OPENAI_API_KEY` | OpenAI API key for LLM image descriptions | + +## File Format Detection + +MarkItDown automatically detects file formats based on: + +1. File extension +2. MIME type +3. 
File magic bytes + +Supported extensions: + +| Extension | Format | +|-----------|--------| +| `.pdf` | PDF documents | +| `.docx` | Word documents | +| `.doc` | Legacy Word | +| `.pptx` | PowerPoint | +| `.ppt` | Legacy PowerPoint | +| `.xlsx` | Excel spreadsheets | +| `.xls` | Legacy Excel | +| `.csv` | CSV data | +| `.json` | JSON data | +| `.xml` | XML data | +| `.html`, `.htm` | HTML pages | +| `.msg` | Outlook messages | +| `.epub` | EPub books | +| `.zip` | ZIP archives | +| `.jpg`, `.jpeg`, `.png`, `.gif`, `.bmp`, `.webp` | Images | +| `.mp3`, `.wav` | Audio files | + +## Tips and Best Practices + +### Performance + +```bash +# For large files, redirect to file instead of stdout +markitdown large-file.pdf -o output.md # Faster than > output.md + +# Process multiple files in parallel +parallel markitdown {} -o {.}.md ::: *.pdf +``` + +### Error Handling + +```bash +# Check if conversion succeeded +if markitdown document.pdf -o output.md; then + echo "Conversion successful" +else + echo "Conversion failed" >&2 +fi + +# Capture errors +markitdown document.pdf -o output.md 2>errors.log +``` + +### Integration with Other Tools + +```bash +# Convert and search +markitdown document.pdf | grep -i "keyword" + +# Convert and count words +markitdown document.pdf | wc -w + +# Convert and extract links +markitdown document.pdf | grep -oE '\[.*?\]\(.*?\)' + +# Convert and send to LLM +markitdown document.pdf | llm "Summarize this document" +``` diff --git a/.claude/skills/markitdown/references/examples.md b/.claude/skills/markitdown/references/examples.md new file mode 100644 index 000000000..9ba2d2401 --- /dev/null +++ b/.claude/skills/markitdown/references/examples.md @@ -0,0 +1,648 @@ +# MarkItDown Examples + +Practical examples for common use cases. 
+ +## Document Processing + +### Extract Text from PDF + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("report.pdf") + +# Get plain text +text = result.text_content +print(f"Extracted {len(text)} characters") +``` + +### Convert Word Document + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("document.docx") + +# Save as markdown file +with open("document.md", "w") as f: + f.write(result.text_content) +``` + +### Process PowerPoint Presentation + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("slides.pptx") + +# Each slide becomes a section +print(result.text_content) +``` + +### Extract Data from Excel + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("data.xlsx") + +# Tables are converted to Markdown tables +print(result.text_content) +``` + +## Image Processing + +### Basic OCR + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("screenshot.png") + +# Extracts EXIF metadata and OCR text +print(result.text_content) +``` + +### AI-Powered Image Description + +```python +from markitdown import MarkItDown +from openai import OpenAI + +client = OpenAI() +md = MarkItDown( + llm_client=client, + llm_model="gpt-4o", + llm_prompt="Describe this image in detail." 
+) + +result = md.convert("diagram.png") +print(result.text_content) +``` + +### Process Multiple Images + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() +image_dir = Path("./images") + +for image in image_dir.glob("*.png"): + result = md.convert(str(image)) + output = image_dir / f"{image.stem}.md" + output.write_text(f"# {image.name}\n\n{result.text_content}") + print(f"Processed: {image.name}") +``` + +## Audio Processing + +### Transcribe Audio File + +```python +from markitdown import MarkItDown + +# Requires [audio-transcription] extra +md = MarkItDown() +result = md.convert("interview.mp3") + +print(result.text_content) +``` + +### Process Meeting Recording + +```python +from markitdown import MarkItDown +from datetime import datetime + +md = MarkItDown() +result = md.convert("meeting.wav") + +# Add metadata +output = f"""# Meeting Transcript +Date: {datetime.now().strftime("%Y-%m-%d")} + +## Transcript + +{result.text_content} +""" + +with open("meeting-notes.md", "w") as f: + f.write(output) +``` + +## Web Content + +### YouTube Video Transcript + +```python +from markitdown import MarkItDown + +# Requires [youtube-transcription] extra +md = MarkItDown() +result = md.convert("https://www.youtube.com/watch?v=dQw4w9WgXcQ") + +print(result.text_content) +``` + +### Extract YouTube Video for Notes + +```python +from markitdown import MarkItDown + +def youtube_to_notes(video_url: str, output_path: str): + """Convert YouTube video to study notes.""" + md = MarkItDown() + result = md.convert(video_url) + + # Structure the output + content = f"""# Video Notes + +Source: {video_url} + +## Transcript + +{result.text_content} + +## Key Points + +[Add your notes here] + +## Questions + +[Add questions for review] +""" + + with open(output_path, "w") as f: + f.write(content) + +# Usage +youtube_to_notes( + "https://www.youtube.com/watch?v=VIDEO_ID", + "lecture-notes.md" +) +``` + +### Convert HTML Page + +```python +from 
markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("page.html") + +# Preserves structure: headings, lists, links +print(result.text_content) +``` + +## Data Files + +### CSV to Markdown Table + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("data.csv") + +# Data becomes a Markdown table +print(result.text_content) +``` + +### JSON to Markdown + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("config.json") + +print(result.text_content) +``` + +### XML to Markdown + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("data.xml") + +print(result.text_content) +``` + +## Archive Processing + +### Process ZIP Contents + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Iterates through ZIP contents +result = md.convert("documents.zip") +print(result.text_content) +``` + +### Extract Documentation from ZIP + +```python +from markitdown import MarkItDown +from pathlib import Path +import zipfile +import tempfile + +def extract_docs_from_zip(zip_path: str) -> str: + """Extract and convert all documents from ZIP.""" + md = MarkItDown() + all_content = [] + + with tempfile.TemporaryDirectory() as temp_dir: + with zipfile.ZipFile(zip_path, 'r') as zf: + zf.extractall(temp_dir) + + for file in Path(temp_dir).rglob("*"): + if file.is_file() and file.suffix in ['.pdf', '.docx', '.txt']: + try: + result = md.convert(str(file)) + all_content.append(f"## {file.name}\n\n{result.text_content}") + except Exception as e: + all_content.append(f"## {file.name}\n\nError: {e}") + + return "\n\n---\n\n".join(all_content) + +# Usage +content = extract_docs_from_zip("project-docs.zip") +with open("all-docs.md", "w") as f: + f.write(content) +``` + +## LLM Integration + +### Prepare Documents for RAG + +```python +from markitdown import MarkItDown +from pathlib import Path + +def prepare_for_rag(docs_dir: str) -> list[dict]: + 
"""Prepare documents for RAG indexing.""" + md = MarkItDown() + documents = [] + + for file in Path(docs_dir).glob("*"): + if file.is_file(): + try: + result = md.convert(str(file)) + documents.append({ + "source": str(file), + "title": result.title or file.stem, + "content": result.text_content + }) + except Exception as e: + print(f"Skipped {file}: {e}") + + return documents + +# Usage +docs = prepare_for_rag("./knowledge-base") +print(f"Prepared {len(docs)} documents") +``` + +### Summarize Document with LLM + +```python +from markitdown import MarkItDown +from openai import OpenAI + +def summarize_document(file_path: str) -> str: + """Convert document and generate summary.""" + # Convert to markdown + md = MarkItDown() + result = md.convert(file_path) + + # Summarize with LLM + client = OpenAI() + response = client.chat.completions.create( + model="gpt-4o", + messages=[ + {"role": "system", "content": "Summarize the following document concisely."}, + {"role": "user", "content": result.text_content[:10000]} + ] + ) + + return response.choices[0].message.content + +# Usage +summary = summarize_document("long-report.pdf") +print(summary) +``` + +### Q&A Over Documents + +```python +from markitdown import MarkItDown +from openai import OpenAI + +class DocumentQA: + def __init__(self): + self.md = MarkItDown() + self.client = OpenAI() + self.documents = {} + + def load_document(self, file_path: str, doc_id: str): + """Load and convert document.""" + result = self.md.convert(file_path) + self.documents[doc_id] = result.text_content + + def ask(self, doc_id: str, question: str) -> str: + """Ask question about loaded document.""" + if doc_id not in self.documents: + raise ValueError(f"Document {doc_id} not loaded") + + response = self.client.chat.completions.create( + model="gpt-4o", + messages=[ + { + "role": "system", + "content": f"Answer questions based on this document:\n\n{self.documents[doc_id][:15000]}" + }, + {"role": "user", "content": question} + ] + ) + + 
return response.choices[0].message.content + +# Usage +qa = DocumentQA() +qa.load_document("contract.pdf", "contract") +answer = qa.ask("contract", "What are the payment terms?") +print(answer) +``` + +## Jupyter Notebooks + +### Convert Notebook to Markdown + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("analysis.ipynb") + +# Output includes code cells, outputs, and markdown cells +print(result.text_content) +``` + +### Extract Code from Notebook + +```python +from markitdown import MarkItDown +import re + +def extract_code_cells(notebook_path: str) -> list[str]: + """Extract only code from notebook.""" + md = MarkItDown() + result = md.convert(notebook_path) + + # Find code blocks + code_blocks = re.findall( + r"```python\n(.*?)```", + result.text_content, + re.DOTALL + ) + return code_blocks + +# Usage +code = extract_code_cells("data_analysis.ipynb") +for i, block in enumerate(code, 1): + print(f"--- Cell {i} ---") + print(block) +``` + +### Batch Convert Notebooks + +```python +from markitdown import MarkItDown +from pathlib import Path + +md = MarkItDown() + +for notebook in Path("./notebooks").glob("*.ipynb"): + result = md.convert(str(notebook)) + output = notebook.with_suffix(".md") + output.write_text(result.text_content) + print(f"Converted: {notebook.name}") +``` + +## RSS and Atom Feeds + +### Parse RSS Feed + +```python +from markitdown import MarkItDown + +md = MarkItDown() +result = md.convert("https://example.com/feed.rss") + +# Feed entries converted to markdown +print(result.text_content) +``` + +### Monitor Multiple Feeds + +```python +from markitdown import MarkItDown +from datetime import datetime + +def aggregate_feeds(feed_urls: list[str]) -> str: + """Combine multiple RSS feeds into one document.""" + md = MarkItDown() + sections = [] + + for url in feed_urls: + try: + result = md.convert(url) + sections.append(f"## Feed: {url}\n\n{result.text_content}") + except Exception as e: + 
sections.append(f"## Feed: {url}\n\nError: {e}") + + header = f"# Feed Aggregation\nGenerated: {datetime.now()}\n\n" + return header + "\n\n---\n\n".join(sections) + +# Usage +feeds = [ + "https://news.ycombinator.com/rss", + "https://example.com/blog/feed.xml" +] +content = aggregate_feeds(feeds) +print(content) +``` + +## Wikipedia + +### Extract Article Content + +```python +from markitdown import MarkItDown + +md = MarkItDown() + +# Extracts main article content, excludes navigation/sidebars +result = md.convert("https://en.wikipedia.org/wiki/Python") +print(result.text_content) +``` + +### Research Multiple Topics + +```python +from markitdown import MarkItDown +from pathlib import Path + +def research_topics(topics: list[str], output_dir: str): + """Download Wikipedia articles for research.""" + md = MarkItDown() + output_path = Path(output_dir) + output_path.mkdir(exist_ok=True) + + for topic in topics: + url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}" + try: + result = md.convert(url) + filename = f"{topic.lower().replace(' ', '-')}.md" + (output_path / filename).write_text(result.text_content) + print(f"Downloaded: {topic}") + except Exception as e: + print(f"Failed: {topic} - {e}") + +# Usage +research_topics( + ["Machine learning", "Neural network", "Deep learning"], + "./research" +) +``` + +### Build Knowledge Base from Wikipedia + +```python +from markitdown import MarkItDown + +def build_wiki_context(topics: list[str]) -> str: + """Build LLM context from Wikipedia articles.""" + md = MarkItDown() + sections = [] + + for topic in topics: + url = f"https://en.wikipedia.org/wiki/{topic.replace(' ', '_')}" + try: + result = md.convert(url) + # Truncate long articles + content = result.text_content[:5000] + sections.append(f"## {topic}\n\n{content}") + except Exception: + continue + + return "\n\n---\n\n".join(sections) + +# Usage +context = build_wiki_context(["Python", "JavaScript", "Rust"]) +``` + +## Azure Document Intelligence + +### 
Complex PDF with Tables + +```python +from markitdown import MarkItDown +import os + +# Set up credentials +os.environ["AZURE_DOCUMENT_INTELLIGENCE_KEY"] = "your-key" + +md = MarkItDown( + docintel_endpoint="https://your-resource.cognitiveservices.azure.com/" +) + +# Better extraction for complex layouts +result = md.convert("financial-report.pdf") +print(result.text_content) +``` + +### Scanned Document Processing + +```python +from markitdown import MarkItDown + +md = MarkItDown( + docintel_endpoint="https://your-resource.cognitiveservices.azure.com/" +) + +# OCR and layout analysis for scanned docs +result = md.convert("scanned-contract.pdf") +print(result.text_content) +``` + +## CLI Examples + +### Basic Conversion + +```bash +# PDF to stdout +markitdown report.pdf + +# Save to file +markitdown report.pdf -o report.md + +# Word document +markitdown document.docx -o document.md + +# PowerPoint +markitdown slides.pptx -o slides.md +``` + +### Batch Processing + +```bash +# Convert all PDFs +for f in *.pdf; do markitdown "$f" -o "${f%.pdf}.md"; done + +# With parallel processing +ls *.pdf | xargs -P 4 -I {} sh -c 'markitdown "$1" -o "${1%.pdf}.md"' _ {} + +# Recursive conversion +find . -name "*.docx" -exec sh -c 'markitdown "$1" -o "${1%.docx}.md"' _ {} \; +``` + +### Piping and Integration + +```bash +# Pipe to grep +markitdown document.pdf | grep -i "important" + +# Count words +markitdown document.pdf | wc -w + +# Send to LLM CLI +markitdown document.pdf | llm "Summarize this document" + +# Download and convert +curl -sL "https://example.com/doc.pdf" | markitdown > doc.md +``` + +### Document Intelligence CLI + +```bash +# Set credentials +export AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT="https://..." +export AZURE_DOCUMENT_INTELLIGENCE_KEY="..." 
+ +# Convert with Document Intelligence +markitdown complex.pdf -d -e "$AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT" -o output.md +``` diff --git a/.claude/skills/markitdown/scripts/batch-convert.py b/.claude/skills/markitdown/scripts/batch-convert.py new file mode 100755 index 000000000..e45cf1c52 --- /dev/null +++ b/.claude/skills/markitdown/scripts/batch-convert.py @@ -0,0 +1,190 @@ +#!/usr/bin/env python3 +""" +Batch convert documents to Markdown using MarkItDown. + +Usage: + python batch-convert.py ./documents ./output + python batch-convert.py ./documents ./output --extensions pdf,docx,pptx + python batch-convert.py ./documents ./output --workers 8 +""" + +import argparse +import sys +from pathlib import Path +from concurrent.futures import ThreadPoolExecutor, as_completed +from typing import Optional + +try: + from markitdown import MarkItDown +except ImportError: + print("Error: markitdown not installed. Run: pip install 'markitdown[all]'") + sys.exit(1) + +try: + from tqdm import tqdm + HAS_TQDM = True +except ImportError: + HAS_TQDM = False + + +def convert_file( + md: MarkItDown, + input_file: Path, + output_dir: Path +) -> tuple[Path, bool, str]: + """Convert a single file and return status.""" + try: + result = md.convert(str(input_file)) + output_file = output_dir / f"{input_file.stem}.md" + output_file.write_text(result.text_content, encoding="utf-8") + return input_file, True, "" + except Exception as e: + return input_file, False, str(e) + + +def batch_convert( + input_dir: Path, + output_dir: Path, + extensions: Optional[list[str]] = None, + max_workers: int = 4, + recursive: bool = False, + enable_plugins: bool = False +) -> dict: + """ + Convert all matching files in directory. 
+ + Args: + input_dir: Source directory + output_dir: Destination directory + extensions: File extensions to process (None = all) + max_workers: Number of parallel workers + recursive: Search subdirectories + enable_plugins: Enable MarkItDown plugins + + Returns: + Dictionary with success and failed lists + """ + output_dir.mkdir(parents=True, exist_ok=True) + md = MarkItDown(enable_plugins=enable_plugins) + + # Collect files + files = [] + if extensions: + for ext in extensions: + pattern = f"**/*.{ext}" if recursive else f"*.{ext}" + files.extend(input_dir.glob(pattern)) + else: + if recursive: + files = [f for f in input_dir.rglob("*") if f.is_file()] + else: + files = [f for f in input_dir.iterdir() if f.is_file()] + + if not files: + print(f"No files found in {input_dir}") + return {"success": [], "failed": []} + + results = {"success": [], "failed": []} + + # Process with progress bar if tqdm available + if HAS_TQDM: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(convert_file, md, f, output_dir): f + for f in files + } + + for future in tqdm(as_completed(futures), total=len(files), desc="Converting"): + file_path, success, error = future.result() + if success: + results["success"].append(str(file_path)) + else: + results["failed"].append({"file": str(file_path), "error": error}) + tqdm.write(f"Failed: {file_path.name} - {error}") + else: + with ThreadPoolExecutor(max_workers=max_workers) as executor: + futures = { + executor.submit(convert_file, md, f, output_dir): f + for f in files + } + + for i, future in enumerate(as_completed(futures), 1): + file_path, success, error = future.result() + if success: + results["success"].append(str(file_path)) + print(f"[{i}/{len(files)}] Converted: {file_path.name}") + else: + results["failed"].append({"file": str(file_path), "error": error}) + print(f"[{i}/{len(files)}] Failed: {file_path.name} - {error}") + + return results + + +def main(): + parser = 
argparse.ArgumentParser( + description="Batch convert documents to Markdown using MarkItDown" + ) + parser.add_argument("input_dir", type=Path, help="Input directory") + parser.add_argument("output_dir", type=Path, help="Output directory") + parser.add_argument( + "-e", "--extensions", + type=str, + help="Comma-separated file extensions (e.g., pdf,docx,pptx)" + ) + parser.add_argument( + "-w", "--workers", + type=int, + default=4, + help="Number of parallel workers (default: 4)" + ) + parser.add_argument( + "-r", "--recursive", + action="store_true", + help="Search subdirectories recursively" + ) + parser.add_argument( + "--plugins", + action="store_true", + help="Enable MarkItDown plugins" + ) + + args = parser.parse_args() + + if not args.input_dir.exists(): + print(f"Error: Input directory does not exist: {args.input_dir}") + sys.exit(1) + + extensions = None + if args.extensions: + extensions = [ext.strip().lstrip(".") for ext in args.extensions.split(",")] + + print(f"Input: {args.input_dir}") + print(f"Output: {args.output_dir}") + if extensions: + print(f"Extensions: {', '.join(extensions)}") + print(f"Workers: {args.workers}") + print(f"Recursive: {args.recursive}") + print() + + results = batch_convert( + input_dir=args.input_dir, + output_dir=args.output_dir, + extensions=extensions, + max_workers=args.workers, + recursive=args.recursive, + enable_plugins=args.plugins + ) + + print() + print(f"Successfully converted: {len(results['success'])}") + print(f"Failed: {len(results['failed'])}") + + if results["failed"]: + print("\nFailed files:") + for item in results["failed"]: + print(f" - {item['file']}: {item['error']}") + + sys.exit(0 if not results["failed"] else 1) + + +if __name__ == "__main__": + main() diff --git a/.claude/skills/markitdown/scripts/batch-convert.sh b/.claude/skills/markitdown/scripts/batch-convert.sh new file mode 100755 index 000000000..85e3fbd95 --- /dev/null +++ b/.claude/skills/markitdown/scripts/batch-convert.sh @@ -0,0 +1,87 @@ 
+#!/bin/bash +# Batch convert documents to Markdown using MarkItDown CLI +# +# Usage: +# ./batch-convert.sh ./documents ./output +# ./batch-convert.sh ./documents ./output pdf,docx,pptx +# ./batch-convert.sh ./documents ./output pdf 4 # 4 parallel jobs + +set -euo pipefail + +INPUT_DIR="${1:?Usage: $0 [extensions] [parallel_jobs]}" +OUTPUT_DIR="${2:?Usage: $0 [extensions] [parallel_jobs]}" +EXTENSIONS="${3:-}" +PARALLEL_JOBS="${4:-4}" + +# Check if markitdown is installed +if ! command -v markitdown &>/dev/null; then + echo "Error: markitdown not installed. Run: pip install 'markitdown[all]'" + exit 1 +fi + +# Create output directory +mkdir -p "$OUTPUT_DIR" + +# Build find command based on extensions +if [[ -n "$EXTENSIONS" ]]; then + # Convert comma-separated extensions to find arguments + IFS=',' read -ra EXT_ARRAY <<< "$EXTENSIONS" + FIND_ARGS=() + for i in "${!EXT_ARRAY[@]}"; do + ext="${EXT_ARRAY[$i]}" + ext="${ext#.}" # Remove leading dot if present + if [[ $i -eq 0 ]]; then + FIND_ARGS+=("-name" "*.$ext") + else + FIND_ARGS+=("-o" "-name" "*.$ext") + fi + done + FILES=$(find "$INPUT_DIR" -maxdepth 1 -type f \( "${FIND_ARGS[@]}" \)) +else + FILES=$(find "$INPUT_DIR" -maxdepth 1 -type f) +fi + +# Count files +FILE_COUNT=$(echo "$FILES" | grep -c . 
|| echo 0) + +if [[ "$FILE_COUNT" -eq 0 ]]; then + echo "No files found in $INPUT_DIR" + exit 0 +fi + +echo "Input: $INPUT_DIR" +echo "Output: $OUTPUT_DIR" +[[ -n "$EXTENSIONS" ]] && echo "Extensions: $EXTENSIONS" +echo "Files: $FILE_COUNT" +echo "Parallel jobs: $PARALLEL_JOBS" +echo "" + +# Convert function for parallel execution +convert_file() { + local input_file="$1" + local output_dir="$2" + local filename + filename=$(basename "$input_file") + local name="${filename%.*}" + local output_file="$output_dir/$name.md" + + if markitdown "$input_file" -o "$output_file" 2>/dev/null; then + echo "OK: $filename" + else + echo "FAIL: $filename" >&2 + fi +} + +export -f convert_file +export OUTPUT_DIR + +# Check if GNU parallel is available +if command -v parallel &>/dev/null; then + echo "$FILES" | parallel -j "$PARALLEL_JOBS" convert_file {} "$OUTPUT_DIR" +else + # Fallback to xargs + echo "$FILES" | xargs -P "$PARALLEL_JOBS" -I {} bash -c 'convert_file "$@"' _ {} "$OUTPUT_DIR" +fi + +echo "" +echo "Conversion complete. Output in: $OUTPUT_DIR" diff --git a/.claude/skills/markitdown/scripts/convert-jupyter.py b/.claude/skills/markitdown/scripts/convert-jupyter.py new file mode 100644 index 000000000..17c60e3ce --- /dev/null +++ b/.claude/skills/markitdown/scripts/convert-jupyter.py @@ -0,0 +1,114 @@ +#!/usr/bin/env python3 +""" +Convert Jupyter notebooks to Markdown with optional code extraction. + +Usage: + python convert-jupyter.py notebook.ipynb + python convert-jupyter.py notebook.ipynb -o output.md + python convert-jupyter.py notebook.ipynb --code-only + python convert-jupyter.py ./notebooks/ --batch +""" + +import argparse +import re +import sys +from pathlib import Path + +try: + from markitdown import MarkItDown +except ImportError: + print("Error: markitdown not installed. 
Run: pip install 'markitdown[all]'") + sys.exit(1) + + +def extract_code_cells(markdown_content: str) -> list[str]: + """Extract Python code blocks from markdown content.""" + return re.findall(r"```python\n(.*?)```", markdown_content, re.DOTALL) + + +def convert_notebook( + input_path: Path, + output_path: Path | None = None, + code_only: bool = False, +) -> str: + """Convert a single Jupyter notebook to markdown.""" + md = MarkItDown() + result = md.convert(str(input_path)) + content = result.text_content + + if code_only: + code_blocks = extract_code_cells(content) + content = "\n\n".join( + f"# Cell {i}\n{block}" for i, block in enumerate(code_blocks, 1) + ) + + if output_path: + output_path.write_text(content) + print(f"Converted: {input_path.name} -> {output_path.name}") + else: + print(content) + + return content + + +def batch_convert( + input_dir: Path, + output_dir: Path | None = None, + code_only: bool = False, +) -> dict: + """Convert all notebooks in a directory.""" + if output_dir is None: + output_dir = input_dir + + output_dir.mkdir(parents=True, exist_ok=True) + results = {"success": [], "failed": []} + + for notebook in input_dir.glob("*.ipynb"): + suffix = ".py" if code_only else ".md" + output_path = output_dir / f"{notebook.stem}{suffix}" + try: + convert_notebook(notebook, output_path, code_only) + results["success"].append(str(notebook)) + except Exception as e: + results["failed"].append({"file": str(notebook), "error": str(e)}) + print(f"Failed: {notebook.name} - {e}", file=sys.stderr) + + return results + + +def main(): + parser = argparse.ArgumentParser( + description="Convert Jupyter notebooks to Markdown" + ) + parser.add_argument("input", help="Notebook file or directory") + parser.add_argument("-o", "--output", help="Output file or directory") + parser.add_argument( + "--code-only", + action="store_true", + help="Extract only code cells", + ) + parser.add_argument( + "--batch", + action="store_true", + help="Process all notebooks in 
directory", + ) + + args = parser.parse_args() + input_path = Path(args.input) + + if not input_path.exists(): + print(f"Error: {input_path} not found", file=sys.stderr) + sys.exit(1) + + if args.batch or input_path.is_dir(): + output_dir = Path(args.output) if args.output else None + results = batch_convert(input_path, output_dir, args.code_only) + print(f"\nConverted: {len(results['success'])}") + print(f"Failed: {len(results['failed'])}") + else: + output_path = Path(args.output) if args.output else None + convert_notebook(input_path, output_path, args.code_only) + + +if __name__ == "__main__": + main()