diff --git a/.gitignore b/.gitignore index cb1b75b..cd0df49 100644 --- a/.gitignore +++ b/.gitignore @@ -8,6 +8,8 @@ *.swo # Python +.venv +venv __pycache__/ *.py[cod] *$py.class @@ -38,4 +40,4 @@ htmlcov/ docs/_build/ # Validation workspace (temporary files and reports) -validation_workspace/ \ No newline at end of file +validation_workspace/ diff --git a/README.md b/README.md index bef3c43..186566d 100644 --- a/README.md +++ b/README.md @@ -54,6 +54,38 @@ NCBI_API_KEY=your_ncbi_key # Optional but recommended for higher rate lim ## Usage +### Bibliography → CSL-JSON mapping + +Take a DeepSearch-style bibliography (URLs, optionally with `source_id`) and return CSL-JSON keyed by the original reference numbers: + +```python +from lit_agent.identifiers import resolve_bibliography + +bibliography = [ + {"source_id": "1", "url": "https://pubmed.ncbi.nlm.nih.gov/37674083/"}, + {"source_id": "2", "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11239014/"}, + {"source_id": "3", "url": "https://doi.org/10.1038/s41586-023-06502-w"}, +] + +result = resolve_bibliography( + bibliography, + validate=True, # NCBI/metapub validation + metadata fetch + scrape=False, # Enable if you want web/PDF scraping for failures + pdf=False, + topic_validation=False, +) + +print(result.citations["1"]["PMID"]) # "37674083" +print(result.citations["2"]["PMCID"]) # "PMC11239014" +print(result.citations["3"]["DOI"]) # "10.1038/s41586-023-06502-w" +print(result.citations["1"]["resolution"]) # methods, confidence, validation, errors +``` + +Each citation is CSL-JSON–compatible with a custom `resolution` block: +- `id` is the original `source_id` (or 1-based string if absent) +- `URL`, identifiers (`DOI`/`PMID`/`PMCID`), optional metadata (`title`, `author`, `container-title`, `issued`, etc.) +- `resolution`: `confidence`, `methods`, `validation` statuses, `errors`, `source_url`, optional `canonical_id` + ### Academic Identifier Extraction Extract DOI, PMID, and PMC identifiers from academic URLs with comprehensive validation: @@ -353,4 +385,3 @@ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file - Built with [LiteLLM](https://github.com/BerriAI/litellm) for unified LLM API access - Uses [uv](https://github.com/astral-sh/uv) for fast Python package management - Code quality maintained with [black](https://github.com/psf/black) and [ruff](https://github.com/astral-sh/ruff) - diff --git a/plans/aim1-url2ref-functional-plan.md b/plans/aim1-url2ref-functional-plan.md new file mode 100644 index 0000000..db49f5b --- /dev/null +++ b/plans/aim1-url2ref-functional-plan.md @@ -0,0 +1,45 @@ +# Aim 1 – url2ref functionality plan (standalone) + +## Goal +Expand `url2ref` (lit_agent) so it can take a numbered bibliography (URLs) from upstream systems (e.g., DeepSearch) and return a citation map keyed by the original reference numbers. Each entry should be CSL-JSON–compatible, enriched with resolved identifiers and confidence/validation details. + +## Chosen citation schema +- **CSL-JSON** as the citation payload: stable, widely supported, flexible for partial metadata. +- Fields we commit to populate when available: `id` (ref_id), `URL`, `type`, `title`, `author` (family/given), `issued` (`date-parts`), `container-title`, `publisher`, `page`, `volume`, `issue`, `DOI`, `PMID`, `PMCID`. 
+- Add a `resolution` object (custom) with: `confidence` (0–1), `methods` (ordered list of extraction methods), `validation` (e.g., `{"ncbi": "passed" | "failed" | "skipped", "metapub": ...}`), `errors` (optional list), and `source_url` for traceability (an illustrative entry appears at the end of this plan).
+- Numbering: preserve the **original ref number** (stringified) from the input order. Never renumber. If deduplication is applied, keep both the original `id` and a `canonical_id` for grouping.
+
+## New/updated APIs
+- High-level function: `resolve_bibliography(bibliography, *, validate=True, scrape=True, pdf=True, topic_validation=False) -> CitationResolutionResult`
+  - Input: ordered list of URLs (implicitly numbered starting at 1) or mappings with `source_id`/`url` keys.
+  - Output: `CitationResolutionResult` with:
+    - `citations: dict[str, CSLJSONCitation]` keyed by ref_id (`"1"`, `"2"`, ...).
+    - `stats`: counts for resolved/unresolved, by method, average confidence, validation outcomes.
+    - `failures`: list of ref_ids with reasons.
+- Keep existing `extract_identifiers_from_bibliography` public but use it internally.
+
+## Processing pipeline
+1) **Identifier extraction (existing)**: reuse `JournalURLExtractor` → `CompositeValidator` (NCBI/metapub) with per-identifier confidence.
+2) **Phase 2 (optional)**: web scraping/PDF extraction for failed URLs; track methods.
+3) **Metadata enrichment**:
+   - Primary: `NCBIAPIValidator.get_article_metadata` when PMID/PMCID present.
+   - DOI metadata lookup (CrossRef or similar) if already available in the codebase, or add a lightweight DOI resolver; skip gracefully when network access is not permitted.
+   - Map metadata to CSL-JSON fields; fill `issued` from the year (or the full date if present).
+4) **Record assembly**:
+   - For each URL (ref_id), create a CSL-JSON object with `id = ref_id`, `URL = url`, identifiers (`DOI`, `PMID`, `PMCID`), and enriched metadata.
+   - Attach `resolution` with the method path and validation outcomes; if unresolved, include `errors` and leave identifiers blank.
+5) **Stats/reporting**:
+   - Aggregate success/failure, per-method success, confidence histograms, validation pass rates.
+   - Optional: expose `to_json()` for container embedding.
+
+## Edge cases & rules
+- Preserve input order as authoritative numbering; never reshuffle.
+- If a URL yields multiple identifiers, keep all of them (`DOI`, `PMID`, `PMCID`); prefer PMID/PMCID for metadata fetches, but do not drop the DOI.
+- If the metadata fetch fails, still return identifiers and the source URL with low confidence.
+- If scraping/PDF extraction is disabled or not permitted, mark validation as `skipped` and return partial data.
+- Keep network-dependent steps optional via flags; ensure graceful degradation without secrets.
+
+## Testing
+- Unit tests: fixture URLs → expected CSL-JSON snippets; confidence/method tracking; unresolved paths.
+- Integration (if allowed): a small set of curated URLs hitting NCBI (and DOI if available) with recorded responses; fall back to mocks when offline.
+- Schema checks: validate the produced citation map against the CSL-JSON structure plus the custom `resolution` fragment.
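+
+## Illustrative resolved entry
+A minimal sketch of the intended shape for one resolved citation (values are illustrative, not fetched):
+
+```python
+{
+    "id": "1",
+    "URL": "https://pubmed.ncbi.nlm.nih.gov/37674083/",
+    "type": "article-journal",
+    "PMID": "37674083",
+    "resolution": {
+        "confidence": 0.95,  # illustrative value
+        "methods": ["url_pattern", "metadata_lookup"],
+        "validation": {"ncbi": "passed", "metapub": "skipped"},
+        "errors": [],
+        "source_url": "https://pubmed.ncbi.nlm.nih.gov/37674083/",
+        "canonical_id": None,
+    },
+}
+```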
diff --git a/pyproject.toml b/pyproject.toml
index 623d069..8ecb5bc 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = ["setuptools>=68", "wheel"]
 build-backend = "setuptools.build_meta"
 
 [project]
-name = "lit-agent"
+name = "url2ref"
 version = "0.1.0"
 description = "Reference extraction agent for analyzing Deepsearch results"
 authors = [{name = "Research Team"}]
@@ -38,6 +38,9 @@ markers = [
     "unit: fast, isolated tests",
     "integration: tests that hit real services or I/O"
 ]
+filterwarnings = [
+    "ignore:invalid escape sequence:SyntaxWarning",  # emitted by docopt on import
+]
 
 [tool.coverage.run]
 branch = true
@@ -45,4 +48,3 @@ source = ["src/lit_agent"]
 
 [tool.coverage.report]
 skip_empty = true
-
diff --git a/src/lit_agent/identifiers/__init__.py b/src/lit_agent/identifiers/__init__.py
index 2309c5b..127609d 100644
--- a/src/lit_agent/identifiers/__init__.py
+++ b/src/lit_agent/identifiers/__init__.py
@@ -34,6 +34,8 @@
     extract_identifiers_from_bibliography,
     extract_identifiers_from_url,
     validate_identifier,
+    resolve_bibliography,
+    CitationResolutionResult,
 )
 
 # Demo functionality
@@ -66,6 +68,8 @@
     "extract_identifiers_from_bibliography",
     "extract_identifiers_from_url",
     "validate_identifier",
+    "resolve_bibliography",
+    "CitationResolutionResult",
     # Demo
     "demo_extraction",
 ]
diff --git a/src/lit_agent/identifiers/api.py b/src/lit_agent/identifiers/api.py
index dcefcfa..e5d44c0 100644
--- a/src/lit_agent/identifiers/api.py
+++ b/src/lit_agent/identifiers/api.py
@@ -1,6 +1,9 @@
 """High-level API functions for academic identifier extraction."""
 
-from typing import List, Dict, Any
+from collections import Counter, defaultdict
+from dataclasses import dataclass
+import json
+from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional
 
 from .base import IdentifierType, AcademicIdentifier, IdentifierExtractionResult
 from .extractors import JournalURLExtractor
@@ -9,12 +12,32 @@
 from .topic_validator import TopicValidator
 
 
+@dataclass
+class CitationResolutionResult:
+    """Resolved bibliography output mapped to CSL-JSON entries."""
+
+    citations: Dict[str, Dict[str, Any]]
+    stats: Dict[str, Any]
+    failures: List[str]
+
+    def to_json(self) -> str:
+        """Serialize the resolution result to JSON."""
+
+        payload = {
+            "citations": self.citations,
+            "stats": self.stats,
+            "failures": self.failures,
+        }
+        return json.dumps(payload, default=str)
+
+
 def extract_identifiers_from_bibliography(
     urls: List[str],
     use_web_scraping: bool = False,
     use_api_validation: bool = True,
     use_metapub_validation: bool = True,
     use_topic_validation: bool = False,
+    use_pdf_extraction: bool = True,
 ) -> IdentifierExtractionResult:
     """Extract academic identifiers from a list of bibliography URLs.
@@ -27,6 +50,7 @@ def extract_identifiers_from_bibliography(
         use_api_validation: Whether to validate identifiers using NCBI API
         use_metapub_validation: Whether to validate identifiers using metapub
         use_topic_validation: Whether to validate topic relevance using LLM analysis
+        use_pdf_extraction: Whether to allow PDF extraction in Phase 2
 
     Returns:
         IdentifierExtractionResult containing all extracted identifiers and statistics
@@ -76,6 +100,8 @@ def extract_identifiers_from_bibliography(
         try:
             # Choose extractor based on URL type
             if pdf_extractor.is_pdf_url(failed_url):
+                if not use_pdf_extraction:
+                    continue
                 identifiers = pdf_extractor.extract_from_url(failed_url)
             else:
                 identifiers = web_extractor.extract_from_url(failed_url)
@@ -258,3 +284,268 @@ def validate_identifier(
         "identifier_type": identifier_type.value,
         "value": value,
     }
+
+
+def resolve_bibliography(
+    bibliography: Iterable[Any],
+    *,
+    validate: bool = True,
+    scrape: bool = True,
+    pdf: bool = True,
+    topic_validation: bool = False,
+    metadata_lookup: Optional[Callable[[IdentifierType, str], Optional[Dict[str, Any]]]] = None,
+) -> CitationResolutionResult:
+    """Resolve a DeepSearch bibliography to CSL-JSON keyed by source_id.
+
+    Args:
+        bibliography: Iterable of URLs or mappings with ``source_id`` and ``url`` keys.
+        validate: Whether to run API/metapub validation when extracting identifiers.
+        scrape: Whether to enable web scraping for failed URLs.
+        pdf: Whether to enable PDF extraction during scraping.
+        topic_validation: Whether to run topic validation.
+        metadata_lookup: Optional callable to enrich metadata (identifier_type, value) -> metadata dict.
+
+    Returns:
+        CitationResolutionResult with CSL-JSON citations keyed by source_id and resolution stats.
+    """
+
+    entries = _normalize_bibliography_entries(bibliography)
+    urls = [entry["url"] for entry in entries]
+
+    extraction_result = extract_identifiers_from_bibliography(
+        urls,
+        use_web_scraping=scrape,
+        use_api_validation=validate,
+        use_metapub_validation=validate,
+        use_topic_validation=topic_validation,
+        use_pdf_extraction=pdf,
+    )
+
+    grouped_identifiers: Dict[str, List[AcademicIdentifier]] = defaultdict(list)
+    for identifier in extraction_result.identifiers:
+        grouped_identifiers[identifier.source_url].append(identifier)
+
+    citations: Dict[str, Dict[str, Any]] = {}
+    failures: List[str] = []
+    method_counter: Counter[str] = Counter()
+    confidence_values: List[float] = []
+
+    for entry in entries:
+        source_id = entry["source_id"]
+        url = entry["url"]
+        identifiers = grouped_identifiers.get(url, [])
+
+        citation = _build_csl_citation(
+            source_id=source_id,
+            url=url,
+            identifiers=identifiers,
+            metadata_lookup=metadata_lookup,
+            validate=validate,
+        )
+
+        citations[source_id] = citation
+
+        method_counter.update(citation["resolution"].get("methods", []))
+        if identifiers:
+            confidence_values.extend([identifier.confidence for identifier in identifiers])
+        else:
+            failures.append(source_id)
+
+    stats = {
+        "total": len(entries),
+        "resolved": len(entries) - len(failures),
+        "unresolved": len(failures),
+        "methods": dict(method_counter),
+        "average_confidence": round(sum(confidence_values) / len(confidence_values), 2)
+        if confidence_values
+        else 0.0,
+    }
+
+    return CitationResolutionResult(
+        citations=citations,
+        stats=stats,
+        failures=failures,
+    )
+
+
+def _normalize_bibliography_entries(bibliography: Iterable[Any]) -> List[Dict[str, str]]:
+    """Normalize bibliography input to a list of ``{"source_id", "url"}`` dicts."""
+
+    normalized_entries: List[Dict[str, str]] = []
+    for index, entry in enumerate(bibliography, start=1):
+        if isinstance(entry, Mapping):
+            source_id = str(entry.get("source_id") or entry.get("id") or index)
+            url = str(entry.get("url")) if entry.get("url") is not None else ""
+        else:
+            source_id = str(index)
+            url = str(entry)
+
+        normalized_entries.append({"source_id": source_id, "url": url})
+
+    return normalized_entries
+
+
+def _build_csl_citation(
+    *,
+    source_id: str,
+    url: str,
+    identifiers: List[AcademicIdentifier],
+    metadata_lookup: Optional[Callable[[IdentifierType, str], Optional[Dict[str, Any]]]],
+    validate: bool,
+) -> Dict[str, Any]:
+    """Convert extracted identifiers into a CSL-JSON-like dict."""
+
+    citation: Dict[str, Any] = {
+        "id": source_id,
+        "URL": url,
+        "type": "article-journal",
+        "resolution": {
+            "confidence": max((identifier.confidence for identifier in identifiers), default=0.0),
+            "methods": sorted({identifier.extraction_method.value for identifier in identifiers}),
+            "validation": _build_validation_status(validate, bool(identifiers)),
+            "errors": [],
+            "source_url": url,
+            "canonical_id": None,
+        },
+    }
+
+    for identifier in identifiers:
+        if identifier.type == IdentifierType.DOI:
+            citation["DOI"] = identifier.value
+        elif identifier.type == IdentifierType.PMID:
+            citation["PMID"] = identifier.value
+        elif identifier.type == IdentifierType.PMC:
+            citation["PMCID"] = identifier.value
+
+    if not identifiers:
+        citation["resolution"]["errors"].append("no identifiers extracted")
+        return citation
+
+    preferred_identifier = _select_preferred_identifier(identifiers)
+
+    metadata = None
+    if metadata_lookup:
+        try:
+            metadata = metadata_lookup(preferred_identifier.type, preferred_identifier.value)
+        except Exception as exc:  # pragma: no cover - defensive
+            citation["resolution"]["errors"].append(f"metadata lookup failed: {exc}")
+    elif validate:
+        metadata_validator = NCBIAPIValidator()
+        try:
+            metadata = metadata_validator.get_article_metadata(
+                preferred_identifier.type, preferred_identifier.value
+            )
+            citation["resolution"]["validation"]["ncbi"] = "passed" if metadata else "failed"
+        except Exception as exc:  # pragma: no cover - defensive
+            citation["resolution"]["errors"].append(f"metadata lookup failed: {exc}")
+            citation["resolution"]["validation"]["ncbi"] = "failed"
+
+    if metadata:
+        _apply_metadata_to_citation(citation, metadata)
+        if "metadata_lookup" not in citation["resolution"]["methods"]:
+            citation["resolution"]["methods"].append("metadata_lookup")
+
+    return citation
+
+
+def _build_validation_status(validate: bool, has_identifiers: bool) -> Dict[str, str]:
+    """Construct validation status map for ncbi/metapub."""
+
+    if not validate:
+        return {"ncbi": "skipped", "metapub": "skipped"}
+
+    return {"ncbi": "unknown" if has_identifiers else "failed", "metapub": "unknown"}
+
+
+def _select_preferred_identifier(identifiers: List[AcademicIdentifier]) -> AcademicIdentifier:
+    """Choose the best identifier for metadata lookup (PMID > PMC > DOI)."""
+
+    priority = {IdentifierType.PMID: 0, IdentifierType.PMC: 1, IdentifierType.DOI: 2}
+    return sorted(identifiers, key=lambda identifier: priority.get(identifier.type, 99))[0]
+
+
+def _apply_metadata_to_citation(citation: Dict[str, Any], metadata: Dict[str, Any]) -> None:
+    """Map metadata dictionary into CSL fields on the citation dict."""
+
+    if title := metadata.get("title"):
+        citation["title"] = title
+
+    if journal := metadata.get("journal"):
+        citation["container-title"] = journal
+
+    if pubdate := metadata.get("pubdate"):
+        date_parts = _parse_pubdate(pubdate)
+        if date_parts:
+            citation["issued"] = {"date-parts": [date_parts]}
+
+    if authors := metadata.get("authors"):
+        citation["author"] = _parse_authors(authors)
+
+    for field in ["volume", "issue", "pages"]:
+        if metadata.get(field):
+            citation[field] = metadata[field]
+
+    if metadata.get("doi"):
+        citation.setdefault("DOI", metadata["doi"])
+    if metadata.get("pmid"):
+        citation.setdefault("PMID", metadata["pmid"])
+    if metadata.get("pmcid"):
+        citation.setdefault("PMCID", metadata["pmcid"])
+
+
+def _parse_pubdate(pubdate: str) -> List[int]:
+    """Parse NCBI-style pubdate strings into date-parts."""
+
+    months = {
+        "jan": 1,
+        "feb": 2,
+        "mar": 3,
+        "apr": 4,
+        "may": 5,
+        "jun": 6,
+        "jul": 7,
+        "aug": 8,
+        "sep": 9,
+        "oct": 10,
+        "nov": 11,
+        "dec": 12,
+    }
+
+    tokens = pubdate.replace(",", " ").split()
+    date_parts: List[int] = []
+
+    for token in tokens:
+        lower_token = token.lower()
+        if lower_token in months:
+            date_parts.append(months[lower_token])
+        else:
+            try:
+                date_parts.append(int(token))
+            except ValueError:
+                continue
+
+    if not date_parts and pubdate.isdigit():
+        date_parts.append(int(pubdate))
+
+    return date_parts[:3]
+
+
+def _parse_authors(authors: Iterable[str]) -> List[Dict[str, str]]:
+    """Convert a list of author strings into CSL author dicts."""
+
+    parsed_authors: List[Dict[str, str]] = []
+
+    for author in authors:
+        if not author:
+            continue
+
+        tokens = author.replace(",", " ").split()
+        if not tokens:
+            continue
+
+        family = tokens[0]
+        given = " ".join(tokens[1:]) if len(tokens) > 1 else ""
+
+        parsed_authors.append({"family": family, "given": given})
+
+    return parsed_authors
diff --git a/tests/integration/test_citation_resolution_integration.py b/tests/integration/test_citation_resolution_integration.py
new file mode 100644
index 0000000..f593382
--- /dev/null
+++ b/tests/integration/test_citation_resolution_integration.py
@@ -0,0 +1,44 @@
+"""Integration test for bibliography resolution hitting real services."""
+
+import pytest
+
+from lit_agent.identifiers import resolve_bibliography
+
+
+@pytest.mark.integration
+def test_resolve_bibliography_live_ncbi_lookup():
+    """Ensure resolve_bibliography works against live PubMed/PMC entries."""
+
+    bibliography = [
+        {
+            "source_id": "pubmed",
+            "url": "https://pubmed.ncbi.nlm.nih.gov/37674083/",
+        },
+        {
+            "source_id": "pmc",
+            "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC11239014/",
+        },
+    ]
+
+    result = resolve_bibliography(
+        bibliography,
+        validate=True,
+        scrape=False,
+        pdf=False,
+        topic_validation=False,
+    )
+
+    assert set(result.citations.keys()) == {"pubmed", "pmc"}
+    assert result.stats["resolved"] == 2
+    assert not result.failures
+
+    pubmed_citation = result.citations["pubmed"]
+    assert pubmed_citation["PMID"] == "37674083"
+    assert pubmed_citation["resolution"]["confidence"] > 0.8
+    assert "url_pattern" in pubmed_citation["resolution"]["methods"]
+    assert pubmed_citation["resolution"]["validation"]["ncbi"] in {"passed", "unknown", "failed"}
+
+    pmc_citation = result.citations["pmc"]
+    assert pmc_citation["PMCID"] == "PMC11239014"
+    assert pmc_citation["resolution"]["confidence"] > 0.8
+    assert "url_pattern" in pmc_citation["resolution"]["methods"]
diff --git a/tests/unit/test_citation_resolution.py b/tests/unit/test_citation_resolution.py
new file mode 100644
index 0000000..e05f3b8
--- /dev/null
+++ b/tests/unit/test_citation_resolution.py
@@ -0,0 +1,108 @@
+"""Unit tests for bibliography resolution to CSL-JSON."""
+
+import pytest
+
+from lit_agent.identifiers import resolve_bibliography
+
+
+@pytest.mark.unit
+def test_resolve_bibliography_preserves_source_ids():
+    """URLs with explicit source_ids should be keyed by those IDs."""
+
+    entries = [
+        {
+            "source_id": "10",
+            "url": "https://pubmed.ncbi.nlm.nih.gov/12345678/",
+        },
+        {
+            "source_id": "20",
+            "url": "https://pmc.ncbi.nlm.nih.gov/articles/PMC7654321/",
+        },
+    ]
+
+    result = resolve_bibliography(
+        entries,
+        validate=False,
+        scrape=False,
+        pdf=False,
+        topic_validation=False,
+    )
+
+    assert set(result.citations.keys()) == {"10", "20"}
+
+    pmid_citation = result.citations["10"]
+    assert pmid_citation["id"] == "10"
+    assert pmid_citation["URL"] == entries[0]["url"]
+    assert pmid_citation["PMID"] == "12345678"
+    assert "url_pattern" in pmid_citation["resolution"]["methods"]
+    assert pmid_citation["resolution"]["source_url"] == entries[0]["url"]
+
+    pmc_citation = result.citations["20"]
+    assert pmc_citation["PMCID"] == "PMC7654321"
+    assert result.stats["resolved"] == 2
+    assert result.stats["unresolved"] == 0
+
+
+@pytest.mark.unit
+def test_resolve_bibliography_numbers_plain_url_lists():
+    """Plain URL lists fall back to 1-based string source_ids."""
+
+    urls = [
+        "https://doi.org/10.1234/abc.def.001",
+        "https://example.com/not-a-reference",
+    ]
+
+    result = resolve_bibliography(
+        urls,
+        validate=False,
+        scrape=False,
+        pdf=False,
+        topic_validation=False,
+    )
+
+    assert list(result.citations.keys()) == ["1", "2"]
+    assert result.citations["1"]["DOI"] == "10.1234/abc.def.001"
+
+    assert result.stats["resolved"] == 1
+    assert result.stats["unresolved"] == 1
+    assert "2" in result.failures
+    assert result.citations["2"]["resolution"]["errors"]
+
+
+@pytest.mark.unit
+def test_resolve_bibliography_enriches_metadata():
+    """Metadata lookup should populate CSL fields when provided."""
+
+    url = "https://pubmed.ncbi.nlm.nih.gov/11111111/"
+
+    def fake_metadata_lookup(identifier_type, value):
+        return {
+            "title": "Sample Article Title",
+            "authors": ["Doe J", "Smith A"],
+            "journal": "Journal of Examples",
+            "pubdate": "2024 Jan 15",
+            "pmid": value,
+            "pmcid": "PMC9999999",
+            "doi": "10.4321/example.doi",
+            "volume": "12",
+            "issue": "3",
+            "pages": "101-110",
+        }
+
+    result = resolve_bibliography(
+        [url],
+        validate=False,
+        scrape=False,
+        pdf=False,
+        topic_validation=False,
+        metadata_lookup=fake_metadata_lookup,
+    )
+
+    citation = result.citations["1"]
+    assert citation["title"] == "Sample Article Title"
+    assert citation["container-title"] == "Journal of Examples"
+    assert citation["issued"]["date-parts"][0][0] == 2024
+    assert citation["author"][0]["family"] == "Doe"
+    assert citation["PMID"] == "11111111"
+    assert citation["PMCID"] == "PMC9999999"
+    assert citation["DOI"] == "10.4321/example.doi"
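+
+
+@pytest.mark.unit
+def test_citation_resolution_result_to_json_serializes_payload():
+    """Sketch: to_json() should round-trip citations, stats, and failures.
+
+    Illustrative addition reusing the DOI fixture from the plain-URL test above.
+    """
+
+    import json
+
+    result = resolve_bibliography(
+        ["https://doi.org/10.1234/abc.def.001"],
+        validate=False,
+        scrape=False,
+        pdf=False,
+        topic_validation=False,
+    )
+
+    payload = json.loads(result.to_json())
+
+    assert set(payload.keys()) == {"citations", "stats", "failures"}
+    assert payload["citations"]["1"]["DOI"] == "10.1234/abc.def.001"
+    assert payload["stats"]["total"] == 1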