High-performance MARC record reader designed for fast retrieval, search, and analysis.
- Indexing: Index for both exact and fuzzy matching, useful for looking up identifiers or searching for text
- Custom charsets: Optimize memory by adjusting the character set used for indexing.
- Optimized for MARC binary: Taking full advantage of the binary MARC format, fastmarc rips through hundreds of thousands of records in the blink of an eye.
- Compatibility with PyMARC: Returns `pymarc.Record` objects for familiar record access.
from fastmarc import MARCReader
with open("records.mrc", "rb") as f:
reader = (MARCReader(f)
.add_index("title", "245$a") # Register named index
.build_index()) # Build index
# Search returns list of record indices
indices = reader.search("245$a", "music")
# Use get_record() to retrieve records by index
for idx in indices[:5]:
record = reader.get_record(idx)
print(record['245']['a'])

- Open file with `MARCReader(fp)`
- Register indexes with `.add_index()`
- Build with `.build_index(charset=...)`
- Search with `.search()` or iterate with `for record in reader`
with open("records.mrc", "rb") as f:
reader = MARCReader(f).add_index("title", "245$a").build_index()
# Search returns list of matching record indices
indices = reader.search("245$a", "music")
print(f"Found {len(indices)} matching records")
# Retrieve each record by index
for idx in indices[:5]:
record = reader.get_record(idx)
print(f"Record {idx}: {record['245']['a']}")

from collections import Counter
subject_counts = Counter()

def count_subjects(fields):
    """Tally the 650$a subject headings of one record into ``subject_counts``.

    Args:
        fields: Mapping from field spec (e.g. ``"650$a"``) to the list of
            string values for that spec, as handed to the callback by
            ``reader.foreach``. A missing ``"650$a"`` key means the record
            has no subjects.
    """
    for subject in fields.get("650$a", []):
        heading = subject.strip()
        # Skip whitespace-only values so they do not appear as a bogus ""
        # entry and inflate the unique-subject count.
        if heading:
            subject_counts[heading] += 1
with open("records.mrc", "rb") as f:
reader = MARCReader(f)
reader.foreach(["650$a"], count_subjects)
print(f"Analyzed {len(reader):,} records")
print(f"Unique subjects: {len(subject_counts)}\n")
for subject, count in subject_counts.most_common(10):
print(f"{count:6d} {subject}")

with open("records.mrc", "rb") as f:
reader = MARCReader(f).add_index("control_num", "001", mode="map").build_index()
# Map index: values to lists of record indices
id_index = reader.get_index("control_num")
duplicates = {id_: idxs for id_, idxs in id_index.items() if len(idxs) > 1}
print(f"Found {len(duplicates)} duplicate IDs\n")
for id_, indices in list(duplicates.items())[:3]:
print(f"ID '{id_}' appears in {len(indices)} records:")
# Retrieve each duplicate record by index
for idx in indices[:3]:
record = reader.get_record(idx)
print(f" Record {idx}: {record['245']['a']}")

from collections import Counter
import re
years = Counter()
source = Counter()

def extract_year(fields):
    """Record the publication year of one record in ``years``.

    The year is taken from characters 7-10 of the first 008 value when those
    four characters are all digits; otherwise from the first ``19xx``/``20xx``
    match in the first 264$c value. ``source`` counts which of the two
    fields supplied the year. Records with no usable year are not counted.

    Args:
        fields: Mapping from field spec (``"008"``, ``"264$c"``) to the list
            of string values for that spec, as handed to the callback by
            ``reader.foreach``.
    """
    year = None

    # Try 008 first
    values_008 = fields.get("008")
    if values_008:
        field_008 = values_008[0]
        if len(field_008) >= 11:
            candidate = field_008[7:11]
            # Accept the 008 date only when it is fully numeric. A partial
            # date such as "19uu" must leave ``year`` unset, otherwise the
            # 264$c fallback below would never run for that record.
            if candidate.isdigit():
                year = candidate
                source['008'] += 1

    # Fall back to 264$c
    if year is None:
        values_264c = fields.get("264$c")
        if values_264c:
            match = re.search(r'(19|20)\d{2}', values_264c[0])
            if match:
                year = match.group(0)
                source['264$c'] += 1

    if year is not None:
        years[year] += 1
with open("records.mrc", "rb") as f:
reader = MARCReader(f)
reader.foreach(["008", "264$c"], extract_year)
print(f"Total years: {sum(years.values()):,}")
print(f"Sources: {source}")
print("\nTop 10 years:")
for year, count in years.most_common(10):
print(f"{year}: {count:6d}")

with open("records.mrc", "rb") as f:
# Search for OCLC numbers in 035$a (System Control Number)
reader = MARCReader(f).add_index("sys_control", "035$a").build_index()
# Search returns indices of records with OCLC numbers
indices = reader.search("035$a", "oc")
print(f"Found {len(indices)} records with OCLC numbers")
# Show first few OCLC records
for idx in indices[:5]:
record = reader.get_record(idx)
print(f"Record {idx}: {record['035']['a']}")

with open("records.mrc", "rb") as f:
reader = (MARCReader(f)
.add_index("title", "245$a")
.add_index("author", "100$a")
.add_index("subject", "650$a")
.build_index())
# Each search returns list of record indices
title_indices = reader.search("245$a", "history")
author_indices = reader.search("100$a", "smith")
subject_indices = reader.search("650$a", "music")
print(f"Found {len(title_indices)} titles with 'history'")
print(f"Found {len(author_indices)} authors named 'smith'")
print(f"Found {len(subject_indices)} subjects about 'music'")
# Retrieve records from any search result
for idx in title_indices[:5]:
record = reader.get_record(idx)
print(f" {record['245']['a']}")

from collections import Counter
from itertools import chain
with open("records.mrc", "rb") as f:
reader = MARCReader(f).build_index()
# Get all titles - returns list of lists (one per record)
all_titles = reader.get_all_values("245$a")
print(f"Total records: {len(all_titles)}")
records_with_titles = sum(1 for titles in all_titles if titles)
print(f"Records with titles: {records_with_titles}")
# Get all subject headings (repeating field)
all_subjects = reader.get_all_values("650$a")
# Find records with more than three subjects
multi_subject_records = [(i, subs) for i, subs in enumerate(all_subjects) if len(subs) > 3]
print(f"\nRecords with >3 subjects: {len(multi_subject_records)}")
# Flatten and count all subjects
flattened_subjects = list(chain.from_iterable(all_subjects))
subject_counts = Counter(flattened_subjects)
print(f"Total subject entries: {len(flattened_subjects)}")
print(f"Unique subjects: {len(subject_counts)}")
print("\nTop 10 subjects:")
for subject, count in subject_counts.most_common(10):
print(f"{count:6d} {subject}")
# Find records missing ISBNs
all_isbns = reader.get_all_values("020$a")
records_without_isbn = sum(1 for isbns in all_isbns if not isbns)
print(f"\nRecords without ISBN: {records_without_isbn}")

For development, you need to rebuild the Cython extension after modifying reader.pyx:
# Initial setup
uv pip install -e .
# After modifying reader.pyx, rebuild the extension
uv run python setup.py build_ext --inplace

The `--inplace` flag puts the compiled .so file directly in the source tree, so your editable install picks it up immediately.
- API.org - Complete API reference