diff --git a/packages/markitdown-dicom/.gitignore b/packages/markitdown-dicom/.gitignore new file mode 100644 index 000000000..571830800 --- /dev/null +++ b/packages/markitdown-dicom/.gitignore @@ -0,0 +1 @@ +tests/test-dicom-files/ diff --git a/packages/markitdown-dicom/README.md b/packages/markitdown-dicom/README.md new file mode 100644 index 000000000..ce09eac83 --- /dev/null +++ b/packages/markitdown-dicom/README.md @@ -0,0 +1,119 @@ +# MarkItDown DICOM Plugin (`markitdown-dicom`) + +This is a plugin for [MarkItDown](https://github.com/microsoft/markitdown) that adds support for converting DICOM (`.dcm`) files into LLM-friendly Markdown metadata representations. + +The plugin is designed to be highly memory-efficient (using deferred loading for pixel data) and token-efficient, ignoring raw pixel arrays while extracting clinically-relevant metadata. + +## Features + +- **Efficient Stream Peeking**: Fast detection of `.dcm` files by peeking at the `DICM` file preamble/magic bytes at offset 128. +- **Memory Safety**: Uses `pydicom` with deferred value loading (`defer_size="1 KB"`) to parse headers of large multi-frame DICOM files without loading gigabytes of pixel data. +- **PII-Aware by Default**: Automatically redacts Patient Name, Patient ID, and Patient Birth Date. +- **Formatted Metadata**: Standardizes dates to `YYYY-MM-DD` and times to `HH:MM:SS` for downstream RAG and vector database ingestion. +- **Custom Tag Support**: Automatically extracts additional standard metadata fields. Private/vendor tags can optionally be included and are filtered to avoid binary, sequence, and other high-volume data types. 
+ +## Installation + +Install the plugin along with MarkItDown: + +```bash +pip install markitdown-dicom +``` + +## Usage + +### Command Line Interface + +Use the `-p` (or `--use-plugins`) option to enable third-party plugins: + +```bash +markitdown --use-plugins patient_scan.dcm -o patient_scan.md +``` + +### Python API + +```python +from markitdown import MarkItDown + +# Initialize MarkItDown with plugins enabled +md = MarkItDown(enable_plugins=True) + +# Convert a DICOM file +result = md.convert("patient_scan.dcm") +print(result.text_content) +``` + +### Disabling PII Redaction + +If you are working in a fully de-identified or secure clinical environment and want to retain Patient Name, Patient ID, and Patient Birth Date, you can disable redaction: + +```python +from markitdown import MarkItDown + +md = MarkItDown(enable_plugins=True, redact_pii=False) +result = md.convert("patient_scan.dcm") +``` + +## Example Output + +```markdown +# DICOM File + +## Patient Information + +* **Patient Name**: [REDACTED] +* **Patient ID**: [REDACTED] +* **Patient Birth Date**: [REDACTED] +* **Patient Sex**: M +* **Patient Age**: 045Y + +## Study Information + +* **Study Instance UID**: 1.2.840.113619.2.134.1.20230612.98765432 +* **Study ID**: STUDY-1 +* **Study Date**: 2023-06-12 +* **Study Time**: 11:44:27 +* **Study Description**: Chest X-Ray +* **Accession Number**: ACC-98765 + +## Series Information + +* **Series Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432 +* **Series Number**: 1 +* **Series Description**: PA View +* **Series Date**: 2023-06-12 +* **Series Time**: 11:45:00 + +## Acquisition Parameters + +* **Modality**: DX +* **Protocol Name**: Chest PA +* **Exposure**: 2 +* **Exposure Time**: 10 +* **KVP**: 120 +* **Acquisition Date**: 2023-06-12 +* **Acquisition Time**: 11:45:00 + +## Equipment + +* **Manufacturer**: GE Medical Systems +* **Manufacturer Model Name**: Discovery +* **Device Serial Number**: SN-12345 +* **Software Versions**: v1.2.3 + +## Image Properties + +* 
**Rows**: 2048 +* **Columns**: 1500 +* **Samples Per Pixel**: 1 +* **Bits Allocated**: 16 +* **Bits Stored**: 12 +* **High Bit**: 11 +* **Pixel Representation**: 0 +* **Photometric Interpretation**: MONOCHROME2 +* **Frame Count**: 1 +* **Instance Number**: 42 +* **SOP Class UID**: 1.2.840.10008.5.1.4.1.1.2 +* **SOP Instance UID**: 1.2.840.113619.2.134.2.20230612.98765432.1 +* **Pixel Data Present**: Yes +``` diff --git a/packages/markitdown-dicom/pyproject.toml b/packages/markitdown-dicom/pyproject.toml new file mode 100644 index 000000000..8c2687539 --- /dev/null +++ b/packages/markitdown-dicom/pyproject.toml @@ -0,0 +1,68 @@ +[build-system] +requires = ["hatchling"] +build-backend = "hatchling.build" + +[project] +name = "markitdown-dicom" +dynamic = ["version"] +description = 'DICOM converter plugin for MarkItDown - Extracts metadata from .dcm files' +readme = "README.md" +requires-python = ">=3.10" +license = "MIT" +keywords = ["markitdown", "dicom", "metadata", "pydicom"] +authors = [ + { name = "Aryan Kaushik", email = "aryankaushik251@gmail.com" }, +] +classifiers = [ + "Development Status :: 4 - Beta", + "Programming Language :: Python", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: Implementation :: CPython", +] +dependencies = [ + "markitdown>=0.1.0a1", + "pydicom>=2.4.0", +] + +[project.urls] +Documentation = "https://github.com/microsoft/markitdown#readme" +Issues = "https://github.com/microsoft/markitdown/issues" +Source = "https://github.com/microsoft/markitdown" + +[tool.hatch.version] +path = "src/markitdown_dicom/__about__.py" + +[project.entry-points."markitdown.plugin"] +dicom = "markitdown_dicom" + +[tool.hatch.envs.types] +extra-dependencies = [ + "mypy>=1.0.0", +] +[tool.hatch.envs.types.scripts] +check = "mypy --install-types --non-interactive {args:src/markitdown_dicom tests}" 
+ +[tool.coverage.run] +source_pkgs = ["markitdown_dicom", "tests"] +branch = true +parallel = true +omit = [ + "src/markitdown_dicom/__about__.py", +] + +[tool.coverage.paths] +markitdown-dicom = ["src/markitdown_dicom", "*/markitdown-dicom/src/markitdown_dicom"] +tests = ["tests", "*/markitdown-dicom/tests"] + +[tool.coverage.report] +exclude_lines = [ + "no cov", + "if __name__ == .__main__.:", + "if TYPE_CHECKING:", +] + +[tool.hatch.build.targets.sdist] +only-include = ["src/markitdown_dicom"] diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__about__.py b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py new file mode 100644 index 000000000..24f8ff955 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/__about__.py @@ -0,0 +1,4 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT +__version__ = "0.1.0a1" diff --git a/packages/markitdown-dicom/src/markitdown_dicom/__init__.py b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py new file mode 100644 index 000000000..1da335c88 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/__init__.py @@ -0,0 +1,14 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +from ._plugin import __plugin_interface_version__, register_converters +from ._dicom_converter import DicomConverter +from .__about__ import __version__ + +__all__ = [ + "__version__", + "__plugin_interface_version__", + "register_converters", + "DicomConverter", +] diff --git a/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py new file mode 100644 index 000000000..5effe62f0 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/_dicom_converter.py @@ -0,0 +1,324 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +import re +import sys +from typing import Any, BinaryIO, Dict, List, 
Optional + +from markitdown import DocumentConverter, DocumentConverterResult, StreamInfo, MissingDependencyException + +# Lazy loading of pydicom to raise MissingDependencyException during conversion if not installed. +_dependency_exc_info = None +try: + import pydicom +except ImportError: + _dependency_exc_info = sys.exc_info() + + +class DicomConverter(DocumentConverter): + """ + Converts DICOM (.dcm, .dicom) files to structured, token-efficient Markdown. + Extracts key Study, Series, Acquisition, Equipment, and Image characteristics. + Omits and redacts Patient PII (Name, ID, Birth Date) by default. + Supports both medical imaging and industrial radiography datasets conforming to the + DICONDE standard (ASTM E2339) used in Non-Destructive Testing (NDT). + """ + + def __init__(self, redact_pii: bool = True, include_private_tags: bool = False, **kwargs: Any): + super().__init__() + self._redact_pii = redact_pii + self._include_private_tags = include_private_tags + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + mimetype = (stream_info.mimetype or "").lower() + extension = (stream_info.extension or "").lower() + + # Check standard extension / MIME type + if extension in (".dcm", ".dicom") or mimetype == "application/dicom": + return True + + # Peek at stream to check signature 'DICM' at offset 128. + # This acts as a robust fallback for files lacking standard extensions (like + # industrial NDT or DICONDE images). + cur_pos = file_stream.tell() + try: + file_stream.seek(128) + sig = file_stream.read(4) + if sig == b"DICM": + return True + except Exception: + pass + finally: + file_stream.seek(cur_pos) + + return False + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + # Check if pydicom is available + if _dependency_exc_info is not None: + raise MissingDependencyException( + "markitdown-dicom requires pydicom to be installed. 
" + "To resolve, run: pip install pydicom" + ) from _dependency_exc_info[1].with_traceback(_dependency_exc_info[2]) # type: ignore + + # Resolve settings + redact_pii = kwargs.get("redact_pii", self._redact_pii) + include_private_tags = kwargs.get("include_private_tags", self._include_private_tags) + + # Parse DICOM from the stream. + # Use defer_size="1 KB" so we don't load large pixel data arrays into memory. + # We attempt a strict read first (force=False) to ensure compliance and avoid false positives + # on non-DICOM streams. If that fails (e.g. for raw datasets lacking a file meta header), + # we reset and fall back to force=True. + cur_pos = file_stream.tell() + try: + ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=False) + if ds is None or len(ds) == 0: + raise ValueError("Parsed dataset has no elements.") + except (pydicom.errors.InvalidDicomError, TypeError): + file_stream.seek(cur_pos) + try: + ds = pydicom.dcmread(file_stream, defer_size="1 KB", force=True) + if ds is None or len(ds) == 0: + raise ValueError("Parsed dataset has no elements.") + except Exception as e: + raise ValueError(f"Failed to parse DICOM file: {e}") from e + except Exception as e: + raise ValueError(f"Failed to parse DICOM file: {e}") from e + + # Extracted elements + lines = ["# DICOM File", ""] + + # Date and Time Formatter helpers + def _format_date(val: Any) -> Optional[str]: + if not val: + return None + val_str = str(val).strip() + if len(val_str) == 8 and val_str.isdigit(): + return f"{val_str[0:4]}-{val_str[4:6]}-{val_str[6:8]}" + return val_str + + def _format_time(val: Any) -> Optional[str]: + if not val: + return None + val_str = str(val).strip() + if "." 
in val_str: + time_part, frac_part = val_str.split(".", 1) + else: + time_part, frac_part = val_str, "" + + if len(time_part) >= 6 and time_part.isdigit(): + formatted = f"{time_part[0:2]}:{time_part[2:4]}:{time_part[4:6]}" + if frac_part: + formatted += f".{frac_part}" + return formatted + elif len(time_part) >= 4 and time_part.isdigit(): + formatted = f"{time_part[0:2]}:{time_part[2:4]}" + if frac_part: + formatted += f".{frac_part}" + return formatted + return val_str + + def _get_val(keyword: str) -> Any: + val = getattr(ds, keyword, None) + if val is None: + return None + if isinstance(val, (list, tuple)) or type(val).__name__ == "MultiValue": + return ", ".join(str(x) for x in val) + return val + + # Define category structures + # 1. Patient Information + p_name = _get_val("PatientName") + p_id = _get_val("PatientID") + p_dob = _get_val("PatientBirthDate") + + if redact_pii: + p_name = "[REDACTED]" if p_name is not None else None + p_id = "[REDACTED]" if p_id is not None else None + p_dob = "[REDACTED]" if p_dob is not None else None + else: + if p_name: + p_name = str(p_name).replace("^", " ").strip() + + patient_fields = { + "Patient Name": p_name, + "Patient ID": p_id, + "Patient Birth Date": _format_date(p_dob), + "Patient Sex": _get_val("PatientSex"), + "Patient Age": _get_val("PatientAge"), + } + + # 2. Study Information + study_fields = { + "Study Instance UID": _get_val("StudyInstanceUID"), + "Study ID": _get_val("StudyID"), + "Study Date": _format_date(_get_val("StudyDate")), + "Study Time": _format_time(_get_val("StudyTime")), + "Study Description": _get_val("StudyDescription"), + "Accession Number": _get_val("AccessionNumber"), + } + + # 3. 
Series Information + series_fields = { + "Series Instance UID": _get_val("SeriesInstanceUID"), + "Series Number": _get_val("SeriesNumber"), + "Series Description": _get_val("SeriesDescription"), + "Series Date": _format_date(_get_val("SeriesDate")), + "Series Time": _format_time(_get_val("SeriesTime")), + } + + # 4. Acquisition Information + acquisition_fields = { + "Modality": _get_val("Modality"), + "Protocol Name": _get_val("ProtocolName"), + "Exposure": _get_val("Exposure"), + "Exposure Time": _get_val("ExposureTime"), + "KVP": _get_val("KVP"), + "Acquisition Date": _format_date(_get_val("AcquisitionDate")), + "Acquisition Time": _format_time(_get_val("AcquisitionTime")), + } + + # 5. Equipment Information + equipment_fields = { + "Manufacturer": _get_val("Manufacturer"), + "Manufacturer Model Name": _get_val("ManufacturerModelName"), + "Device Serial Number": _get_val("DeviceSerialNumber"), + "Software Versions": _get_val("SoftwareVersions"), + } + + # 6. Image Characteristics + pixel_data_present = "Yes" if (0x7FE0, 0x0010) in ds else "No" + + image_fields = { + "Rows": _get_val("Rows"), + "Columns": _get_val("Columns"), + "Samples Per Pixel": _get_val("SamplesPerPixel"), + "Bits Allocated": _get_val("BitsAllocated"), + "Bits Stored": _get_val("BitsStored"), + "High Bit": _get_val("HighBit"), + "Pixel Representation": _get_val("PixelRepresentation"), + "Photometric Interpretation": _get_val("PhotometricInterpretation"), + "Frame Count": _get_val("NumberOfFrames"), + "Instance Number": _get_val("InstanceNumber"), + "SOP Class UID": _get_val("SOPClassUID"), + "SOP Instance UID": _get_val("SOPInstanceUID"), + "Pixel Data Present": pixel_data_present, + } + + # 7. 
Other Useful Text Fields + other_fields = { + "Image Comments": _get_val("ImageComments"), + "Institution Name": _get_val("InstitutionName"), + "Station Name": _get_val("StationName"), + "Body Part Examined": _get_val("BodyPartExamined"), + } + + # Helper to render sections + def _render_section(title: str, fields: Dict[str, Any]) -> List[str]: + active = {k: v for k, v in fields.items() if v is not None and str(v).strip() != ""} + if not active: + return [] + sec_lines = [f"## {title}", ""] + for k, v in active.items(): + sec_lines.append(f"* **{k}**: {v}") + sec_lines.append("") + return sec_lines + + # Predefined sections + lines.extend(_render_section("Patient Information", patient_fields)) + lines.extend(_render_section("Study Information", study_fields)) + lines.extend(_render_section("Series Information", series_fields)) + lines.extend(_render_section("Acquisition Parameters", acquisition_fields)) + lines.extend(_render_section("Equipment", equipment_fields)) + lines.extend(_render_section("Image Properties", image_fields)) + lines.extend(_render_section("Other Information", other_fields)) + + # 8. 
Private / Custom textual tags when reasonable + EXCLUDED_KEYWORDS = { + # Study + "StudyInstanceUID", "StudyDate", "StudyTime", "StudyDescription", "AccessionNumber", "StudyID", + # Series + "SeriesInstanceUID", "SeriesNumber", "SeriesDescription", "SeriesDate", "SeriesTime", + # Acquisition + "Modality", "ProtocolName", "Exposure", "ExposureTime", "KVP", "AcquisitionDate", "AcquisitionTime", + # Equipment + "Manufacturer", "ManufacturerModelName", "DeviceSerialNumber", "SoftwareVersions", + # Image Characteristics + "Rows", "Columns", "SamplesPerPixel", "BitsAllocated", "BitsStored", "HighBit", "PixelRepresentation", "PhotometricInterpretation", "NumberOfFrames", "InstanceNumber", "SOPClassUID", "SOPInstanceUID", + # Other Useful Text Fields + "ImageComments", "InstitutionName", "StationName", "BodyPartExamined", + # Patient info + "PatientName", "PatientID", "PatientBirthDate", "PatientSex", "PatientAge" + } + EXCLUDED_VRS = {"OB", "OW", "OF", "OD", "SQ", "UN"} + + custom_fields: Dict[str, str] = {} + for elem in ds: + # Skip private tags unless explicitly requested + if elem.tag.is_private and not include_private_tags: + continue + + # Skip file meta or pixel group + if elem.tag.group in (0x0002, 0x7FE0) or elem.tag.element == 0: + continue + + # Skip binary, sequence, or unknown VRs + if elem.VR in EXCLUDED_VRS: + continue + + keyword = elem.keyword + if not keyword: + if elem.tag.is_private: + label = f"Private Tag ({elem.tag.group:04X},{elem.tag.element:04X})" + else: + label = f"Tag ({elem.tag.group:04X},{elem.tag.element:04X})" + else: + if keyword in EXCLUDED_KEYWORDS: + continue + # Split CamelCase to separate words + label = re.sub(r'(? None: + """ + Called during construction of MarkItDown instances to register converters provided by plugins. 
+ """ + markitdown.register_converter(DicomConverter(**kwargs)) diff --git a/packages/markitdown-dicom/src/markitdown_dicom/py.typed b/packages/markitdown-dicom/src/markitdown_dicom/py.typed new file mode 100644 index 000000000..7632ecf77 --- /dev/null +++ b/packages/markitdown-dicom/src/markitdown_dicom/py.typed @@ -0,0 +1 @@ +# Marker file for PEP 561 diff --git a/packages/markitdown-dicom/tests/__init__.py b/packages/markitdown-dicom/tests/__init__.py new file mode 100644 index 000000000..aa8931747 --- /dev/null +++ b/packages/markitdown-dicom/tests/__init__.py @@ -0,0 +1,3 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT diff --git a/packages/markitdown-dicom/tests/test_dicom_converter.py b/packages/markitdown-dicom/tests/test_dicom_converter.py new file mode 100644 index 000000000..931fa7a1b --- /dev/null +++ b/packages/markitdown-dicom/tests/test_dicom_converter.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: 2026-present Aryan Kaushik +# +# SPDX-License-Identifier: MIT + +import io +import pytest +from typing import Dict, Any, Optional + +import pydicom +from pydicom.dataset import FileDataset, FileMetaDataset +from pydicom.uid import ExplicitVRLittleEndian, generate_uid + +from markitdown import MarkItDown, StreamInfo +from markitdown_dicom import DicomConverter + + +def create_mock_dicom( + patient_name: Optional[str] = "Test^Patient", + patient_id: Optional[str] = "123456", + patient_dob: Optional[str] = "19800101", + modality: str = "CT", + study_description: str = "Mock Study", + rows: Optional[int] = 512, + cols: Optional[int] = 512, + has_pixel_data: bool = True, + extra_fields: Optional[Dict[str, Any]] = None, +) -> io.BytesIO: + """Helper to programmatically generate a valid DICOM file in memory.""" + file_meta = FileMetaDataset() + file_meta.MediaStorageSOPClassUID = "1.2.840.10008.5.1.4.1.1.2" # CT Image Storage + file_meta.MediaStorageSOPInstanceUID = generate_uid() + file_meta.TransferSyntaxUID = 
ExplicitVRLittleEndian + file_meta.ImplementationClassUID = pydicom.uid.PYDICOM_IMPLEMENTATION_UID + + ds = FileDataset("in_memory.dcm", {}, file_meta=file_meta, preamble=b"\0" * 128) + + if patient_name is not None: + ds.PatientName = patient_name + if patient_id is not None: + ds.PatientID = patient_id + if patient_dob is not None: + ds.PatientBirthDate = patient_dob + + ds.PatientSex = "M" + ds.PatientAge = "045Y" + ds.Modality = modality + ds.StudyInstanceUID = generate_uid() + ds.SeriesInstanceUID = generate_uid() + ds.SOPInstanceUID = file_meta.MediaStorageSOPInstanceUID + ds.SOPClassUID = file_meta.MediaStorageSOPClassUID + ds.StudyID = "STUDY-1" + ds.SeriesDate = "20260612" + ds.SeriesTime = "120500" + ds.InstanceNumber = 42 + ds.StudyDate = "20260612" + ds.StudyTime = "120000.123" + ds.StudyDescription = study_description + ds.AccessionNumber = "ACC-12345" + ds.SeriesNumber = 1 + ds.SeriesDescription = "PA View" + ds.Manufacturer = "GE Medical Systems" + + if rows is not None: + ds.Rows = rows + if cols is not None: + ds.Columns = cols + + ds.SamplesPerPixel = 1 + ds.BitsAllocated = 16 + ds.BitsStored = 16 + ds.HighBit = 15 + ds.PixelRepresentation = 0 + ds.PhotometricInterpretation = "MONOCHROME2" + + if has_pixel_data: + # Use simple dummy bytes for pixel data + ds.PixelData = b"\x00" * 100 + + if extra_fields: + for keyword, val in extra_fields.items(): + setattr(ds, keyword, val) + + buffer = io.BytesIO() + ds.save_as(buffer, enforce_file_format=False) + buffer.seek(0) + return buffer + + +def test_dicom_converter_accepts() -> None: + """Verifies that the DicomConverter accepts DICOM streams using metadata or signature checks.""" + converter = DicomConverter() + + # Case 1: Acceptance by extension + assert converter.accepts( + io.BytesIO(b""), + StreamInfo(extension=".dcm"), + ) + assert converter.accepts( + io.BytesIO(b""), + StreamInfo(extension=".dicom"), + ) + + # Case 2: Acceptance by MIME type + assert converter.accepts( + io.BytesIO(b""), + 
StreamInfo(mimetype="application/dicom"), + ) + + # Case 3: Acceptance by peeking at DICM signature at offset 128 + mock_dicom = create_mock_dicom() + assert converter.accepts( + mock_dicom, + StreamInfo(extension=".raw"), # Wrong extension, but valid stream + ) + + # Case 4: Rejection of non-DICOM content + assert not converter.accepts( + io.BytesIO(b"\x00" * 200), + StreamInfo(extension=".txt"), + ) + + +def test_dicom_converter_default_redaction() -> None: + """Tests that by default, patient identifying details are redacted but clinical demographics are kept.""" + converter = DicomConverter() + stream = create_mock_dicom( + patient_name="Doe^John", + patient_id="PID-999", + patient_dob="19750505", + ) + + result = converter.convert(stream, StreamInfo()) + + # PatientName, PatientID, and PatientBirthDate must be redacted + assert "Doe John" not in result.markdown + assert "PID-999" not in result.markdown + assert "1975-05-05" not in result.markdown + assert "Patient Name**: [REDACTED]" in result.markdown + assert "Patient ID**: [REDACTED]" in result.markdown + assert "Patient Birth Date**: [REDACTED]" in result.markdown + + # Patient Sex and Age should remain as clinical metadata + assert "Patient Sex**: M" in result.markdown + assert "Patient Age**: 045Y" in result.markdown + + # Verifying other standard sections are rendered properly + assert "Study Description**: Mock Study" in result.markdown + assert "Study ID**: STUDY-1" in result.markdown + assert "Series Date**: 2026-06-12" in result.markdown + assert "Series Time**: 12:05:00" in result.markdown + assert "Rows**: 512" in result.markdown + assert "Columns**: 512" in result.markdown + assert "Instance Number**: 42" in result.markdown + assert "Study Date**: 2026-06-12" in result.markdown + assert "Study Time**: 12:00:00.123" in result.markdown + assert "SOP Class UID**" in result.markdown + assert "SOP Instance UID**" in result.markdown + + +def test_dicom_converter_disabled_redaction() -> None: + """Tests 
that when redact_pii is set to False, identifiers are extracted normally.""" + converter = DicomConverter(redact_pii=False) + stream = create_mock_dicom( + patient_name="Doe^John", + patient_id="PID-999", + patient_dob="19750505", + ) + + result = converter.convert(stream, StreamInfo()) + + assert "Patient Name**: Doe John" in result.markdown + assert "Patient ID**: PID-999" in result.markdown + assert "Patient Birth Date**: 1975-05-05" in result.markdown + + +def test_dicom_converter_missing_fields() -> None: + """Verifies that missing optional tags do not raise exceptions and are simply omitted.""" + converter = DicomConverter() + # Create DICOM file with no Rows/Columns and an empty study description + stream = create_mock_dicom( + study_description="", + rows=None, + cols=None, + ) + + result = converter.convert(stream, StreamInfo()) + + # Ensure no empty field or error occurs + assert "Rows" not in result.markdown + assert "Columns" not in result.markdown + assert "Study Description" not in result.markdown + assert "Manufacturer**" in result.markdown # Manufacturer remains since it wasn't set to None + assert "DICOM File" in result.markdown + + +def test_dicom_converter_custom_and_private_tags() -> None: + """Verifies that extra textual/numeric tags and private tags are formatted correctly.""" + converter = DicomConverter(include_private_tags=True) + + # Add custom standard tags (e.g. 
BodyPartExamined, InstitutionName) and a private tag + # Private tags use odd group numbers, e.g., 0x0009 + extra_fields = { + "InstitutionName": "Central Hospital", + "BodyPartExamined": "CHEST", + "InstitutionAddress": "123 Clinic Rd", + } + stream = create_mock_dicom(extra_fields=extra_fields) + + # Let's add a raw private tag directly to the dataset + ds = pydicom.dcmread(stream, force=True) + # Register private creator block + ds.private_block(0x0009, "Mock Creator", create=True) + # Add a private element in group 0x0009 + ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value") + + # Save modified dataset to a new stream + new_stream = io.BytesIO() + ds.save_as(new_stream) + new_stream.seek(0) + + result = converter.convert(new_stream, StreamInfo()) + + # Verify standard custom fields + assert "Institution Name**: Central Hospital" in result.markdown + assert "Body Part Examined**: CHEST" in result.markdown + + # Verify additional standard fields split camelcase + assert "Institution Address**: 123 Clinic Rd" in result.markdown + + # Verify private tag rendering + assert "Private Tag (0009,1001)**: Mock Private Value" in result.markdown + + +def test_dicom_converter_exclude_private_tags_by_default() -> None: + """Verifies that private tags are excluded by default when include_private_tags is False.""" + converter = DicomConverter() # default is False + + extra_fields = { + "InstitutionName": "Central Hospital", + } + stream = create_mock_dicom(extra_fields=extra_fields) + + ds = pydicom.dcmread(stream, force=True) + ds.private_block(0x0009, "Mock Creator", create=True) + ds[0x0009, 0x1001] = pydicom.dataelem.DataElement(0x00091001, "LO", "Mock Private Value") + + new_stream = io.BytesIO() + ds.save_as(new_stream) + new_stream.seek(0) + + result = converter.convert(new_stream, StreamInfo()) + + # Standard custom fields should still be present + assert "Institution Name**: Central Hospital" in result.markdown + + # Private tags 
should be excluded + assert "Mock Private Value" not in result.markdown + assert "Private Tag" not in result.markdown + + +def test_markitdown_plugin_integration() -> None: + """Tests that MarkItDown loads and uses the DicomConverter when enable_plugins is True.""" + md = MarkItDown(enable_plugins=True) + stream = create_mock_dicom(study_description="Integration Test") + + # Convert using the file stream with hint + result = md.convert(stream, stream_info=StreamInfo(extension=".dcm")) + + assert "Study Description**: Integration Test" in result.markdown + assert "Patient Name**: [REDACTED]" in result.markdown + + +def test_corrupted_dicom() -> None: + """Verifies that a corrupted DICOM stream raises ValueError during conversion.""" + converter = DicomConverter() + corrupt_stream = io.BytesIO(b"DICM" + b"\xff" * 100) + + with pytest.raises(ValueError, match="Failed to parse DICOM file"): + converter.convert(corrupt_stream, StreamInfo())