From 113bc8b256bcee9936eda847bacb4465dd6e3b3d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Damian=20Pomyka=C5=82a?=
 <168227269+dpomykala@users.noreply.github.com>
Date: Mon, 10 Nov 2025 21:08:26 +0100
Subject: [PATCH] Support other file formats in the filecontent filter

Use a specialized extractor function if one is registered for the file's
extension (currently PDF and DOCX). For every other file, including files
without an extension, use the `extract_txt` function as a fallback.

Resolves #464.
---
 CHANGELOG.md                      |  1 +
 organize/filters/filecontent.py   |  9 ++-------
 tests/filters/test_filecontent.py | 21 +++++++++++++++++++--
 3 files changed, 22 insertions(+), 9 deletions(-)
diff --git a/CHANGELOG.md b/CHANGELOG.md
index a9dc9371..35bf186d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -2,6 +2,7 @@
 
 ## [Unreleased]
 
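+- Added support for arbitrary plain-text file formats in the `filecontent` filter: files with any extension other than `.pdf`/`.docx` (or with no extension) are now handled by the plain-text extractor (#464).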
 - Fixed #438 (`filecontent` filter fails for PDFs when `pdftotext` isn't installed, instead of falling back to `pdfminer`)
 
 ## v3.3.0 (2024-11-25)
diff --git a/organize/filters/filecontent.py b/organize/filters/filecontent.py
index 6022c6d5..21bd79f1 100644
--- a/organize/filters/filecontent.py
+++ b/organize/filters/filecontent.py
@@ -79,17 +79,14 @@ def extract_docx(path: Path) -> str:
     return clean(result)
 
 
-EXTRACTORS: Dict[str, Callable[[Path], str]] = {
-    ".md": extract_txt,
-    ".txt": extract_txt,
-    ".log": extract_txt,
+SPECIALIZED_EXTRACTORS: Dict[str, Callable[[Path], str]] = {
     ".pdf": extract_pdf,
     ".docx": extract_docx,
 }
 
 
 def textract(path: Path) -> str:
-    extractor = EXTRACTORS[path.suffix.lower()]
+    extractor = SPECIALIZED_EXTRACTORS.get(path.suffix.lower(), extract_txt)
     return extractor(path)
 
 
@@ -97,8 +94,6 @@ def textract(path: Path) -> str:
 class FileContent:
     """Matches file content with the given regular expression.
 
-    Supports .md, .txt, .log, .pdf and .docx files.
-
     For PDF content extraction poppler should be installed for the `pdftotext` command.
     If this is not available `filecontent` will fall back to the `pdfminer` library.
 
diff --git a/tests/filters/test_filecontent.py b/tests/filters/test_filecontent.py
index 5a23625a..82b72960 100644
--- a/tests/filters/test_filecontent.py
+++ b/tests/filters/test_filecontent.py
@@ -1,14 +1,23 @@
 from conftest import make_files, read_files
 
 from organize import Config
+from organize.filters import filecontent
 
 
-def test_filecontent(fs):
+def test_filecontent(fs, monkeypatch):
+    # Mock extractor functions for PDF and DOCX - return fixed values
+    monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".pdf", lambda _: "PDF")
+    monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".docx", lambda _: "DOCX")
+
     # inspired by https://github.com/tfeldmann/organize/issues/43
     files = {
-        "Test1.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
+        "Test1": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
         "Test2.txt": "Tests",
         "Test3.txt": "My Homework ...",
+        "test4.xml": "XML",
+        # Content is not important as we mock extractors
+        "test5.pdf": "",
+        "test6.docx": "",
     }
     make_files(files, "test")
     Config.from_string(
@@ -24,10 +33,18 @@ def test_filecontent(fs):
             - filecontent: '.*Homework.*'
           actions:
             - rename: "Homework.txt"
+        - locations: "/test"
+          filters:
+            - filecontent: '(?P<all>XML|PDF|DOCX)'
+          actions:
+            - rename: '{filecontent.all}'
         """
     ).execute(simulate=False)
     assert read_files("test") == {
         "Homework.txt": "My Homework ...",
         "MegaCorp_Invoice_12345.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
         "Test2.txt": "Tests",
+        "XML": "XML",
+        "PDF": "",
+        "DOCX": "",
     }