FastMARC

High-performance MARC record reader designed for fast retrieval, search, and analysis.

Features

  • Indexing: Build indexes for both exact and fuzzy matching, useful for looking up identifiers or searching text.
  • Custom charsets: Reduce memory use by adjusting the character set used for indexing.
  • Optimized for binary MARC: By taking full advantage of the binary MARC format, fastmarc rips through hundreds of thousands of records in the blink of an eye.
  • Compatibility with PyMARC: Returns pymarc.Record objects for familiar record access.

Quick Start

from fastmarc import MARCReader

with open("records.mrc", "rb") as f:
    reader = (MARCReader(f)
        .add_index("title", "245$a")   # Register named index
        .build_index())                # Build index

    # Search returns list of record indices
    indices = reader.search("245$a", "music")

    # Use get_record() to retrieve records by index
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(record['245']['a'])

Core Workflow

  1. Open file with MARCReader(fp)
  2. Register indexes with .add_index()
  3. Build with .build_index(charset=...)
  4. Search with .search() or iterate with for record in reader (see the sketch below)
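
A minimal sketch of steps 3 and 4 together: build an index with a restricted charset, search it, then iterate the reader directly. The charset parameter name comes from step 3 above, but the value format shown here (a string of characters to keep in the index) is an assumption rather than confirmed API.

from fastmarc import MARCReader

with open("records.mrc", "rb") as f:
    # Assumption: charset is a string of characters the index should cover;
    # narrowing it (e.g. to digits for identifier fields) trades coverage for memory.
    reader = (MARCReader(f)
        .add_index("control_num", "001")
        .build_index(charset="0123456789"))

    # Step 4a: search the named index (the query value here is made up)
    indices = reader.search("001", "12345")
    print(f"Matches: {len(indices)}")

    # Step 4b: or iterate the reader to get pymarc.Record objects
    for i, record in enumerate(reader):
        if i >= 5:
            break
        if record["245"] is not None:
            print(record["245"]["a"])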

Examples

Basic Search

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).add_index("title", "245$a").build_index()

    # Search returns list of matching record indices
    indices = reader.search("245$a", "music")
    print(f"Found {len(indices)} matching records")

    # Retrieve each record by index
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(f"Record {idx}: {record['245']['a']}")

Subject Analysis

from collections import Counter

subject_counts = Counter()

def count_subjects(fields):
    for subject in fields.get("650$a", []):
        subject_counts[subject.strip()] += 1

with open("records.mrc", "rb") as f:
    reader = MARCReader(f)
    reader.foreach(["650$a"], count_subjects)

    print(f"Analyzed {len(reader):,} records")
    print(f"Unique subjects: {len(subject_counts)}\n")

    for subject, count in subject_counts.most_common(10):
        print(f"{count:6d}  {subject}")

Duplicate Detection

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).add_index("control_num", "001", mode="map").build_index()

    # Map index: values to lists of record indices
    id_index = reader.get_index("control_num")
    duplicates = {id_: idxs for id_, idxs in id_index.items() if len(idxs) > 1}

    print(f"Found {len(duplicates)} duplicate IDs\n")
    for id_, indices in list(duplicates.items())[:3]:
        print(f"ID '{id_}' appears in {len(indices)} records:")
        # Retrieve each duplicate record by index
        for idx in indices[:3]:
            record = reader.get_record(idx)
            print(f"  Record {idx}: {record['245']['a']}")

Multi-Field Year Extraction

from collections import Counter
import re

years = Counter()
source = Counter()

def extract_year(fields):
    year = None
    # Try 008 first (date 1 occupies positions 07-10)
    if "008" in fields and fields["008"]:
        field_008 = fields["008"][0]
        if len(field_008) >= 11 and field_008[7:11].isdigit():
            year = field_008[7:11]
            source['008'] += 1
    # Fall back to 264$c if 008 did not yield a usable year
    if year is None and "264$c" in fields and fields["264$c"]:
        match = re.search(r'(19|20)\d{2}', fields["264$c"][0])
        if match:
            year = match.group(0)
            source['264$c'] += 1

    if year:
        years[year] += 1

with open("records.mrc", "rb") as f:
    reader = MARCReader(f)
    reader.foreach(["008", "264$c"], extract_year)

    print(f"Total years: {sum(years.values()):,}")
    print(f"Sources: {source}")
    print("\nTop 10 years:")
    for year, count in years.most_common(10):
        print(f"{year}: {count:6d}")

Finding OCLC Numbers

with open("records.mrc", "rb") as f:
    # Search for OCLC numbers in 035$a (System Control Number)
    reader = MARCReader(f).add_index("sys_control", "035$a").build_index()

    # Search returns indices of records with OCLC numbers
    indices = reader.search("035$a", "oc")
    print(f"Found {len(indices)} records with OCLC numbers")

    # Show first few OCLC records
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(f"Record {idx}: {record['035']['a']}")

Reuse reader for multiple searches

with open("records.mrc", "rb") as f:
    reader = (MARCReader(f)
        .add_index("title", "245$a")
        .add_index("author", "100$a")
        .add_index("subject", "650$a")
        .build_index())

    # Each search returns list of record indices
    title_indices = reader.search("245$a", "history")
    author_indices = reader.search("100$a", "smith")
    subject_indices = reader.search("650$a", "music")

    print(f"Found {len(title_indices)} titles with 'history'")
    print(f"Found {len(author_indices)} authors named 'smith'")
    print(f"Found {len(subject_indices)} subjects about 'music'")

    # Retrieve records from any search result
    for idx in title_indices[:5]:
        record = reader.get_record(idx)
        print(f"  {record['245']['a']}")

Extract All Field Values

from collections import Counter
from itertools import chain

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).build_index()

    # Get all titles - returns list of lists (one per record)
    all_titles = reader.get_all_values("245$a")
    print(f"Total records: {len(all_titles)}")
    records_with_titles = sum(1 for titles in all_titles if titles)
    print(f"Records with titles: {records_with_titles}")

    # Get all subject headings (repeating field)
    all_subjects = reader.get_all_values("650$a")

    # Find records with multiple subjects
    multi_subject_records = [(i, subs) for i, subs in enumerate(all_subjects) if len(subs) > 3]
    print(f"\nRecords with >3 subjects: {len(multi_subject_records)}")

    # Flatten and count all subjects
    flattened_subjects = list(chain.from_iterable(all_subjects))
    subject_counts = Counter(flattened_subjects)
    print(f"Total subject entries: {len(flattened_subjects)}")
    print(f"Unique subjects: {len(subject_counts)}")
    print("\nTop 10 subjects:")
    for subject, count in subject_counts.most_common(10):
        print(f"{count:6d}  {subject}")

    # Find records missing ISBNs
    all_isbns = reader.get_all_values("020$a")
    records_without_isbn = sum(1 for isbns in all_isbns if not isbns)
    print(f"\nRecords without ISBN: {records_without_isbn}")

Development

Building from Source

For development, you need to rebuild the Cython extension after modifying reader.pyx:

# Initial setup
uv pip install -e .

# After modifying reader.pyx, rebuild the extension
uv run python setup.py build_ext --inplace

The --inplace flag puts the compiled .so file directly in the source tree, so your editable install picks it up immediately.
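
For orientation, here is a minimal setup.py sketch of the kind of Cython extension configuration that command compiles. The module path and options below are illustrative assumptions; the repository's actual setup.py is authoritative.

# Hypothetical sketch only; the project's real setup.py may differ.
from setuptools import setup
from Cython.Build import cythonize

setup(
    ext_modules=cythonize(
        "fastmarc/reader.pyx",   # assumed location of the Cython source
        language_level=3,        # compile with Python 3 semantics
    ),
)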

Documentation
