FastMARC

High-performance MARC record reader designed for fast retrieval, search, and analysis.

Features

  • Indexing: Build indexes for both exact and fuzzy matching, useful for looking up identifiers or searching text.
  • Custom charsets: Reduce memory use by adjusting the character set used for indexing.
  • Optimized for binary MARC: By taking full advantage of the binary MARC format, fastmarc rips through hundreds of thousands of records in the blink of an eye.
  • Compatibility with PyMARC: Returns pymarc.Record objects for familiar record access.

Quick Start

from fastmarc import MARCReader

with open("records.mrc", "rb") as f:
    reader = (MARCReader(f)
        .add_index("title", "245$a")   # Register named index
        .build_index())                # Build index

    # Search returns list of record indices
    indices = reader.search("245$a", "music")

    # Use get_record() to retrieve records by index
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(record['245']['a'])

Core Workflow

  1. Open file with MARCReader(fp)
  2. Register indexes with .add_index()
  3. Build with .build_index(charset=...)
  4. Search with .search() or iterate with for record in reader (see the sketch below)
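
A minimal sketch of steps 3 and 4 together: build an index with a restricted charset, search it, then iterate the reader directly. The charset parameter name comes from step 3 above, but the value format shown here (a string of characters to keep in the index) is an assumption rather than confirmed API.

from fastmarc import MARCReader

with open("records.mrc", "rb") as f:
    # Assumption: charset is a string of characters the index should cover;
    # narrowing it (e.g. to digits for identifier fields) trades coverage for memory.
    reader = (MARCReader(f)
        .add_index("control_num", "001")
        .build_index(charset="0123456789"))

    # Step 4a: search the named index (the query value here is made up)
    indices = reader.search("001", "12345")
    print(f"Matches: {len(indices)}")

    # Step 4b: or iterate the reader to get pymarc.Record objects
    for i, record in enumerate(reader):
        if i >= 5:
            break
        if record["245"] is not None:
            print(record["245"]["a"])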

Examples

Basic Search

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).add_index("title", "245$a").build_index()

    # Search returns list of matching record indices
    indices = reader.search("245$a", "music")
    print(f"Found {len(indices)} matching records")

    # Retrieve each record by index
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(f"Record {idx}: {record['245']['a']}")

Subject Analysis

from collections import Counter

subject_counts = Counter()

def count_subjects(fields):
    for subject in fields.get("650$a", []):
        subject_counts[subject.strip()] += 1

with open("records.mrc", "rb") as f:
    reader = MARCReader(f)
    reader.foreach(["650$a"], count_subjects)

    print(f"Analyzed {len(reader):,} records")
    print(f"Unique subjects: {len(subject_counts)}\n")

    for subject, count in subject_counts.most_common(10):
        print(f"{count:6d}  {subject}")

Duplicate Detection

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).add_index("control_num", "001", mode="map").build_index()

    # Map index: values to lists of record indices
    id_index = reader.get_index("control_num")
    duplicates = {id_: idxs for id_, idxs in id_index.items() if len(idxs) > 1}

    print(f"Found {len(duplicates)} duplicate IDs\n")
    for id_, indices in list(duplicates.items())[:3]:
        print(f"ID '{id_}' appears in {len(indices)} records:")
        # Retrieve each duplicate record by index
        for idx in indices[:3]:
            record = reader.get_record(idx)
            print(f"  Record {idx}: {record['245']['a']}")

Multi-Field Year Extraction

from collections import Counter
import re

years = Counter()
source = Counter()

def extract_year(fields):
    year = None
    # Try 008 first (date 1 occupies positions 07-10)
    if "008" in fields and fields["008"]:
        field_008 = fields["008"][0]
        if len(field_008) >= 11 and field_008[7:11].isdigit():
            year = field_008[7:11]
            source['008'] += 1
    # Fall back to 264$c if 008 did not yield a usable year
    if year is None and "264$c" in fields and fields["264$c"]:
        match = re.search(r'(19|20)\d{2}', fields["264$c"][0])
        if match:
            year = match.group(0)
            source['264$c'] += 1

    if year:
        years[year] += 1

with open("records.mrc", "rb") as f:
    reader = MARCReader(f)
    reader.foreach(["008", "264$c"], extract_year)

    print(f"Total years: {sum(years.values()):,}")
    print(f"Sources: {source}")
    print("\nTop 10 years:")
    for year, count in years.most_common(10):
        print(f"{year}: {count:6d}")

Finding OCLC Numbers

with open("records.mrc", "rb") as f:
    # Search for OCLC numbers in 035$a (System Control Number)
    reader = MARCReader(f).add_index("sys_control", "035$a").build_index()

    # Search returns indices of records with OCLC numbers
    indices = reader.search("035$a", "oc")
    print(f"Found {len(indices)} records with OCLC numbers")

    # Show first few OCLC records
    for idx in indices[:5]:
        record = reader.get_record(idx)
        print(f"Record {idx}: {record['035']['a']}")

Reuse reader for multiple searches

with open("records.mrc", "rb") as f:
    reader = (MARCReader(f)
        .add_index("title", "245$a")
        .add_index("author", "100$a")
        .add_index("subject", "650$a")
        .build_index())

    # Each search returns list of record indices
    title_indices = reader.search("245$a", "history")
    author_indices = reader.search("100$a", "smith")
    subject_indices = reader.search("650$a", "music")

    print(f"Found {len(title_indices)} titles with 'history'")
    print(f"Found {len(author_indices)} authors named 'smith'")
    print(f"Found {len(subject_indices)} subjects about 'music'")

    # Retrieve records from any search result
    for idx in title_indices[:5]:
        record = reader.get_record(idx)
        print(f"  {record['245']['a']}")

Extract All Field Values

from collections import Counter
from itertools import chain

with open("records.mrc", "rb") as f:
    reader = MARCReader(f).build_index()

    # Get all titles - returns list of lists (one per record)
    all_titles = reader.get_all_values("245$a")
    print(f"Total records: {len(all_titles)}")
    records_with_titles = sum(1 for titles in all_titles if titles)
    print(f"Records with titles: {records_with_titles}")

    # Get all subject headings (repeating field)
    all_subjects = reader.get_all_values("650$a")

    # Find records with multiple subjects
    multi_subject_records = [(i, subs) for i, subs in enumerate(all_subjects) if len(subs) > 3]
    print(f"\nRecords with >3 subjects: {len(multi_subject_records)}")

    # Flatten and count all subjects
    flattened_subjects = list(chain.from_iterable(all_subjects))
    subject_counts = Counter(flattened_subjects)
    print(f"Total subject entries: {len(flattened_subjects)}")
    print(f"Unique subjects: {len(subject_counts)}")
    print("\nTop 10 subjects:")
    for subject, count in subject_counts.most_common(10):
        print(f"{count:6d}  {subject}")

    # Find records missing ISBNs
    all_isbns = reader.get_all_values("020$a")
    records_without_isbn = sum(1 for isbns in all_isbns if not isbns)
    print(f"\nRecords without ISBN: {records_without_isbn}")

Development

Building from Source

For development, you need to rebuild the Cython extension after modifying reader.pyx:

# Initial setup
uv pip install -e .

# After modifying reader.pyx, rebuild the extension
uv run python setup.py build_ext --inplace

The --inplace flag puts the compiled .so file directly in the source tree, so your editable install picks it up immediately.
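
For orientation, here is a minimal setup.py sketch of the kind of Cython extension configuration that command compiles. The module path and options below are illustrative assumptions; the repository's actual setup.py is authoritative.

# Hypothetical sketch only; the project's real setup.py may differ.
from setuptools import setup
from Cython.Build import cythonize

setup(
    ext_modules=cythonize(
        "fastmarc/reader.pyx",   # assumed location of the Cython source
        language_level=3,        # compile with Python 3 semantics
    ),
)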

Documentation
