Skip to content

Community Feature: PDF SlateParser #1306

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 6 commits into from
Apr 28, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 39 additions & 0 deletions pkgs/community/swarmauri_parser_slate/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png)

<div align="center">

![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate)
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate)
![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate)
![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green)

</div>

---

# Swarmauri SlateParser

A parser that extracts the text of each page of a PDF file using Slate (slate3k), returning one document per page.

## Installation

```bash
pip install swarmauri_parser_slate
```

## Usage
Basic usage example with code snippet:
```python
from swarmauri_parser_slate import SlateParser

parser = SlateParser()
file_path = "path/to/your/pdf_file.pdf"
documents = parser.parse(file_path)

for document in documents:
    print(document.content)
```

## Want to help?

If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
65 changes: 65 additions & 0 deletions pkgs/community/swarmauri_parser_slate/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
[project]
name = "swarmauri_parser_slate"
version = "0.1.0.dev1"
description = "A parser for extracting text from PDFs using Slate."
license = "Apache-2.0"
readme = "README.md"
repository = "http://github.com/swarmauri/swarmauri-sdk"
requires-python = ">=3.10,<3.13"
classifiers = [
"License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12",
]
authors = [{ name = "Vijay Vignesh", email = "[email protected]" }]
dependencies = [
"slate3k>=0.5",
"swarmauri_core",
"swarmauri_base",
"swarmauri_standard",
]

[tool.uv.sources]
swarmauri_core = { workspace = true }
swarmauri_base = { workspace = true }
swarmauri_standard = { workspace = true }

[tool.pytest.ini_options]
norecursedirs = ["combined", "scripts"]
markers = [
"test: standard test",
"unit: Unit tests",
"i9n: Integration tests",
"r8n: Regression tests",
"timeout: mark test to timeout after X seconds",
"xpass: Expected passes",
"xfail: Expected failures",
"acceptance: Acceptance tests",
]
timeout = 300
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
asyncio_default_fixture_loop_scope = "function"

[project.entry-points."swarmauri.parsers"]
SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser"

[build-system]
requires = ["poetry-core>=1.0.0"]
build-backend = "poetry.core.masonry.api"

[dependency-groups]
dev = [
"pytest>=8.0",
"pytest-asyncio>=0.24.0",
"pytest-xdist>=3.6.1",
"pytest-json-report>=1.5.0",
"python-dotenv",
"requests>=2.32.3",
"flake8>=7.0",
"pytest-timeout>=2.3.1",
"ruff>=0.9.9",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
import logging
from typing import List, Literal

import slate3k as slate
from swarmauri_base.ComponentBase import ComponentBase
from swarmauri_base.parsers.ParserBase import ParserBase
from swarmauri_standard.documents.Document import Document


@ComponentBase.register_type(ParserBase, "SlateParser")
class SlateParser(ParserBase):
    """
    Parser that extracts the text of each page of a PDF file using slate3k.
    """

    type: Literal["SlateParser"] = "SlateParser"

    def parse(self, source: str) -> List[Document]:
        """
        Parse a PDF file and return its text as one Document per page.

        Parameters:
        - source (str): The path to the PDF file.

        Returns:
        - List[Document]: One Document for every page that contains text.
          Each Document holds the stripped page text as its content, and its
          metadata records the 1-based ``page_number`` and the ``source``
          path. An empty list is returned if the file cannot be opened or
          parsed.

        Raises:
        - TypeError: If ``source`` is not a string.
        """
        if not isinstance(source, str):
            # Only str paths are accepted. The wording is kept unchanged
            # because callers and tests match on this exact message.
            raise TypeError("Source must be of type str (file path) or bytes.")

        documents: List[Document] = []
        try:
            with open(source, "rb") as file:
                pages = slate.PDF(file)
                for page_number, page in enumerate(pages, start=1):
                    # Strip before testing for emptiness so that pages made
                    # up only of whitespace or form feeds are skipped rather
                    # than emitted as Documents with empty content.
                    text = page.strip() if page else ""
                    if not text:
                        continue
                    documents.append(
                        Document(
                            content=text,
                            metadata={
                                "page_number": page_number,
                                "source": source,
                            },
                        )
                    )
        except Exception:
            # Best-effort contract: any failure while reading or parsing is
            # reported through logging and results in an empty list.
            logging.getLogger(__name__).exception(
                "An error occurred while parsing the PDF '%s'", source
            )
            return []

        return documents
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from .SlateParser import SlateParser


__all__ = ["SlateParser"]

# This package requires Python >= 3.10, where ``importlib.metadata`` is part
# of the standard library, so no ``importlib_metadata`` backport fallback is
# needed.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("swarmauri_parser_slate")
except PackageNotFoundError:
    # The distribution is not installed (for example, during development
    # from a source checkout), so fall back to a placeholder version.
    __version__ = "0.0.0"
132 changes: 132 additions & 0 deletions pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
from unittest import mock

import os
import pytest
from swarmauri_parser_slate.SlateParser import SlateParser as Parser
from swarmauri_core.documents.IDocument import IDocument


@pytest.mark.unit
def test_parser_resource():
    """
    Verify that a freshly constructed parser reports 'Parser' as its resource.
    """
    resource = Parser().resource
    assert resource == "Parser", "The resource attribute should be 'Parser'."


@pytest.mark.unit
def test_parser_type():
    """
    Verify that a freshly constructed parser reports 'SlateParser' as its type.
    """
    parser_type = Parser().type
    assert parser_type == "SlateParser", "The type attribute should be 'SlateParser'."


@pytest.mark.unit
def test_parser_serialization():
    """
    Round-trip the parser through JSON and check that its ID is preserved.
    """
    original = Parser()
    restored = Parser.model_validate_json(original.model_dump_json())
    assert original.id == restored.id, (
        "Serialization and deserialization should preserve the parser's ID."
    )


@pytest.mark.unit
def test_parser_success_mock_file_path():
    """
    Test that the parser turns the pages returned by slate3k into Documents.

    ``slate3k.PDF`` is patched so the outcome does not depend on the real
    content of the PDF. The file must still exist, because the parser opens
    it before handing the file object to slate3k.
    """
    # Resolve the fixture relative to this module instead of calling
    # os.chdir(), which would change the working directory for every test
    # that runs afterwards in the same process.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(tests_dir, "resources", "demo.pdf")

    parser = Parser()

    with mock.patch("slate3k.PDF") as mock_pdf_reader:
        # The parser only iterates over the reader, so a list of page texts
        # is a sufficient stand-in.
        mock_pdf_reader.return_value = ["Sample text from page 1."]

        documents = parser.parse(file_path)

        mock_pdf_reader.assert_called_once()

    assert len(documents) == 1, "Parser should return a list with one document."
    assert isinstance(documents[0], IDocument), (
        "Returned object should be an instance of IDocument."
    )
    assert documents[0].content == "Sample text from page 1.", (
        "Extracted content does not match expected."
    )
    assert documents[0].metadata["page_number"] == 1, (
        "Metadata 'page_number' should be 1."
    )
    assert documents[0].metadata["source"] == file_path, (
        "Metadata 'source' should match the file path."
    )

@pytest.mark.unit
def test_parser_success_file_path():
    """
    Test that the parser reads the real demo PDF and extracts its text.
    """
    # Resolve the fixture relative to this module instead of calling
    # os.chdir(), which would change the working directory for every test
    # that runs afterwards in the same process.
    tests_dir = os.path.dirname(os.path.abspath(__file__))
    file_path = os.path.join(tests_dir, "resources", "demo.pdf")

    parser = Parser()
    documents = parser.parse(file_path)

    assert len(documents) == 1, "Parser should return a list with one document."
    assert isinstance(documents[0], IDocument), (
        "Returned object should be an instance of IDocument."
    )
    assert documents[0].content == "This is a demo pdf", (
        "Extracted content does not match expected."
    )
    assert documents[0].metadata["page_number"] == 1, (
        "Metadata 'page_number' should be 1."
    )
    assert documents[0].metadata["source"] == file_path, (
        "Metadata 'source' should match the file path."
    )


@pytest.mark.unit
def test_parser_invalid_source():
    """
    Verify that a non-string source is rejected with a TypeError.
    """
    bad_source = 12345  # an int is neither a file path nor bytes
    parser = Parser()

    with pytest.raises(TypeError) as raised:
        parser.parse(bad_source)

    message = str(raised.value)
    assert "Source must be of type str (file path) or bytes." in message, (
        "TypeError not raised as expected."
    )


@pytest.mark.unit
def test_parser_exception_handling():
    """
    Verify that parsing a path that does not exist yields an empty list
    instead of raising.
    """
    missing_path = "non_existent_file.pdf"

    # The parser is expected to handle the failed open internally.
    result = Parser().parse(missing_path)

    assert len(result) == 0, (
        "Parser should return an empty list when an error occurs."
    )
Binary file not shown.
1 change: 1 addition & 0 deletions pkgs/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,7 @@ members = [
"community/swarmauri_toolkit_jupytertoolkit",
"community/swarmauri_tool_jupyterexportlatex",
"community/swarmauri_tool_jupytergetiopubmessage",
"community/swarmauri_parser_slate"

]

Expand Down