From 113bc8b256bcee9936eda847bacb4465dd6e3b3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Damian=20Pomyka=C5=82a?= <168227269+dpomykala@users.noreply.github.com> Date: Mon, 10 Nov 2025 21:08:26 +0100 Subject: [PATCH] Support other file formats in the filecontent filter Use a specialized extractor function if registered for the given format (currently PDF and DOCX). For all other formats use the `extract_txt` function as a fallback. Resolves #464. --- CHANGELOG.md | 1 + organize/filters/filecontent.py | 9 ++------- tests/filters/test_filecontent.py | 21 +++++++++++++++++++-- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a9dc9371..35bf186d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## [Unreleased] +- Added support for other plain-text file formats in the `filecontent` filter (#464). - Fixed #438 (`filecontent` filter fails for PDFs when `pdftotext` isn't installed, instead of falling back to `pdfminer`) ## v3.3.0 (2024-11-25) diff --git a/organize/filters/filecontent.py b/organize/filters/filecontent.py index 6022c6d5..21bd79f1 100644 --- a/organize/filters/filecontent.py +++ b/organize/filters/filecontent.py @@ -79,17 +79,14 @@ def extract_docx(path: Path) -> str: return clean(result) -EXTRACTORS: Dict[str, Callable[[Path], str]] = { - ".md": extract_txt, - ".txt": extract_txt, - ".log": extract_txt, +SPECIALIZED_EXTRACTORS: Dict[str, Callable[[Path], str]] = { ".pdf": extract_pdf, ".docx": extract_docx, } def textract(path: Path) -> str: - extractor = EXTRACTORS[path.suffix.lower()] + extractor = SPECIALIZED_EXTRACTORS.get(path.suffix.lower(), extract_txt) return extractor(path) @@ -97,8 +94,6 @@ def textract(path: Path) -> str: class FileContent: """Matches file content with the given regular expression. - Supports .md, .txt, .log, .pdf and .docx files. - For PDF content extraction poppler should be installed for the `pdftotext` command. 
If this is not available `filecontent` will fall back to the `pdfminer` library. diff --git a/tests/filters/test_filecontent.py b/tests/filters/test_filecontent.py index 5a23625a..82b72960 100644 --- a/tests/filters/test_filecontent.py +++ b/tests/filters/test_filecontent.py @@ -1,14 +1,23 @@ from conftest import make_files, read_files from organize import Config +from organize.filters import filecontent -def test_filecontent(fs): +def test_filecontent(fs, monkeypatch): + # Mock extractor functions for PDF and DOCX - return fixed values + monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".pdf", lambda _: "PDF") + monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".docx", lambda _: "DOCX") + # inspired by https://github.com/tfeldmann/organize/issues/43 files = { - "Test1.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765", + "Test1": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765", "Test2.txt": "Tests", "Test3.txt": "My Homework ...", + "test4.xml": "XML", + # Content is not important as we mock extractors + "test5.pdf": "", + "test6.docx": "", } make_files(files, "test") Config.from_string( @@ -24,10 +33,18 @@ def test_filecontent(fs): - filecontent: '.*Homework.*' actions: - rename: "Homework.txt" + - locations: "/test" + filters: + - filecontent: '(?P<all>XML|PDF|DOCX)' + actions: + - rename: '{filecontent.all}' """ ).execute(simulate=False) assert read_files("test") == { "Homework.txt": "My Homework ...", "MegaCorp_Invoice_12345.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765", "Test2.txt": "Tests", + "XML": "XML", + "PDF": "", + "DOCX": "", }