Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

## [Unreleased]

- Added support for other plain-text file formats in the `filecontent` filter (#464).
- Fixed the `filecontent` filter failing for PDFs when `pdftotext` isn't installed instead of falling back to `pdfminer` (#438).

## v3.3.0 (2024-11-25)
Expand Down
9 changes: 2 additions & 7 deletions organize/filters/filecontent.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,26 +79,21 @@ def extract_docx(path: Path) -> str:
return clean(result)


# Extractors for formats that need dedicated parsing, keyed by lower-case
# file suffix (including the leading dot). Suffixes that are not listed here
# are handled by `extract_txt` in `textract`.
SPECIALIZED_EXTRACTORS: Dict[str, Callable[[Path], str]] = {
    ".pdf": extract_pdf,
    ".docx": extract_docx,
}


def textract(path: Path) -> str:
    """Return the text extracted from the file at *path*.

    The extractor is chosen by the lower-cased suffix of *path*. A suffix
    without an entry in ``SPECIALIZED_EXTRACTORS`` falls back to
    ``extract_txt`` instead of raising ``KeyError``.
    """
    extractor = SPECIALIZED_EXTRACTORS.get(path.suffix.lower(), extract_txt)
    return extractor(path)


@dataclass(config=ConfigDict(coerce_numbers_to_str=True, extra="forbid"))
class FileContent:
"""Matches file content with the given regular expression.

Supports .md, .txt, .log, .pdf and .docx files.

For PDF content extraction poppler should be installed for the `pdftotext` command.
If this is not available `filecontent` will fall back to the `pdfminer` library.

Expand Down
21 changes: 19 additions & 2 deletions tests/filters/test_filecontent.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,23 @@
from conftest import make_files, read_files

from organize import Config
from organize.filters import filecontent


def test_filecontent(fs):
def test_filecontent(fs, monkeypatch):
# Mock extractor functions for PDF and DOCX - return fixed values
monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".pdf", lambda _: "PDF")
monkeypatch.setitem(filecontent.SPECIALIZED_EXTRACTORS, ".docx", lambda _: "DOCX")

# inspired by https://github.com/tfeldmann/organize/issues/43
files = {
"Test1.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
"Test1": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
"Test2.txt": "Tests",
"Test3.txt": "My Homework ...",
"test4.xml": "XML",
# Content is not important as we mock extractors
"test5.pdf": "",
"test6.docx": "",
}
make_files(files, "test")
Config.from_string(
Expand All @@ -24,10 +33,18 @@ def test_filecontent(fs):
- filecontent: '.*Homework.*'
actions:
- rename: "Homework.txt"
- locations: "/test"
filters:
- filecontent: '(?P<all>XML|PDF|DOCX)'
actions:
- rename: '{filecontent.all}'
"""
).execute(simulate=False)
assert read_files("test") == {
"Homework.txt": "My Homework ...",
"MegaCorp_Invoice_12345.txt": "Lorem MegaCorp Ltd. ipsum\nInvoice 12345\nMore text\nID: 98765",
"Test2.txt": "Tests",
"XML": "XML",
"PDF": "",
"DOCX": "",
}