Skip to content

Commit eccf950

Browse files
authored
Merge pull request #1306 from VijayVignesh1/community/slatepdfparser
Community Feature: PDF SlateParser
2 parents 6bb6ba7 + a7f5db1 commit eccf950

File tree

7 files changed

+305
-0
lines changed

7 files changed

+305
-0
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png)
2+
3+
<div align="center">
4+
5+
![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate)
6+
![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate)
7+
![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate)
8+
![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green)
9+
10+
</div>
11+
12+
---
13+
14+
# Swarmauri SlateParser
15+
16+
A parser that extracts the text of each page of a PDF file using Slate (`slate3k`), returning one document per page.
17+
18+
## Installation
19+
20+
```bash
21+
pip install swarmauri_parser_slate
22+
```
23+
24+
## Usage
25+
Basic usage example with code snippet:
26+
```python
27+
from swarmauri_parser_slate import SlateParser
28+
29+
parser = SlateParser()
30+
file_path = "path/to/your/pdf_file.pdf"
31+
documents = parser.parse(file_path)
32+
33+
for document in documents:
34+
print(document.content)
35+
```
36+
37+
## Want to help?
38+
39+
If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,65 @@
1+
[project]
2+
name = "swarmauri_parser_slate"
3+
version = "0.1.0.dev1"
4+
description = "A parser for extracting text from PDFs using Slate."
5+
license = "Apache-2.0"
6+
readme = "README.md"
7+
repository = "http://github.com/swarmauri/swarmauri-sdk"
8+
requires-python = ">=3.10,<3.13"
9+
classifiers = [
10+
"License :: OSI Approved :: Apache Software License",
11+
"Programming Language :: Python :: 3.10",
12+
"Programming Language :: Python :: 3.11",
13+
"Programming Language :: Python :: 3.12",
14+
]
15+
authors = [{ name = "Vijay Vignesh", email = "[email protected]" }]
16+
dependencies = [
17+
"slate3k>=0.5",
18+
"swarmauri_core",
19+
"swarmauri_base",
20+
"swarmauri_standard",
21+
]
22+
23+
[tool.uv.sources]
24+
swarmauri_core = { workspace = true }
25+
swarmauri_base = { workspace = true }
26+
swarmauri_standard = { workspace = true }
27+
28+
[tool.pytest.ini_options]
29+
norecursedirs = ["combined", "scripts"]
30+
markers = [
31+
"test: standard test",
32+
"unit: Unit tests",
33+
"i9n: Integration tests",
34+
"r8n: Regression tests",
35+
"timeout: mark test to timeout after X seconds",
36+
"xpass: Expected passes",
37+
"xfail: Expected failures",
38+
"acceptance: Acceptance tests",
39+
]
40+
timeout = 300
41+
log_cli = true
42+
log_cli_level = "INFO"
43+
log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
44+
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
45+
asyncio_default_fixture_loop_scope = "function"
46+
47+
# Entry points are only registered when declared under the standard
# [project] table; a [tool.project] table is ignored by build backends.
[project.entry-points."swarmauri.parsers"]
SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser"
49+
50+
[build-system]
51+
requires = ["poetry-core>=1.0.0"]
52+
build-backend = "poetry.core.masonry.api"
53+
54+
[dependency-groups]
55+
dev = [
56+
"pytest>=8.0",
57+
"pytest-asyncio>=0.24.0",
58+
"pytest-xdist>=3.6.1",
59+
"pytest-json-report>=1.5.0",
60+
"python-dotenv",
61+
"requests>=2.32.3",
62+
"flake8>=7.0",
63+
"pytest-timeout>=2.3.1",
64+
"ruff>=0.9.9",
65+
]
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
from typing import List, Literal
2+
3+
import slate3k as slate
4+
from swarmauri_standard.documents.Document import Document
5+
from swarmauri_base.parsers.ParserBase import ParserBase
6+
from swarmauri_base.ComponentBase import ComponentBase
7+
8+
9+
@ComponentBase.register_type(ParserBase, "SlateParser")
class SlateParser(ParserBase):
    """
    Parser that extracts the text of each page of a PDF file using slate3k.
    """

    type: Literal["SlateParser"] = "SlateParser"

    def parse(self, source: str) -> List[Document]:
        """
        Parse a PDF file and return its text as one Document per page.

        Parameters:
        - source (str): The path to the PDF file.

        Returns:
        - List[Document]: One Document for every page that contains
          non-whitespace text, in page order. Each Document's metadata holds
          ``page_number`` (1-based) and ``source`` (the given path). An empty
          list is returned when the file cannot be opened or parsed.

        Raises:
        - TypeError: If ``source`` is not a ``str``.
        """
        # Imported locally so this block does not depend on the module's
        # top-level import list.
        import logging

        if not isinstance(source, str):
            # The message text is kept unchanged for backward compatibility
            # with callers that match on it; only ``str`` paths are accepted.
            raise TypeError("Source must be of type str (file path) or bytes.")

        documents = []
        try:
            with open(source, "rb") as file:
                # Each item yielded by the slate PDF object is treated as the
                # text of one page.
                for page_number, page in enumerate(slate.PDF(file), start=1):
                    # Strip before testing so that pages holding only
                    # whitespace do not become empty Documents.
                    text = page.strip() if page else ""
                    if not text:
                        continue
                    documents.append(
                        Document(
                            content=text,
                            metadata={
                                "page_number": page_number,
                                "source": source,
                            },
                        )
                    )
        except Exception:
            # Best-effort contract: report the failure (with traceback) and
            # hand back an empty result instead of propagating the error.
            logging.getLogger(__name__).exception(
                "An error occurred while parsing the PDF '%s'", source
            )
            return []

        return documents
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
from .SlateParser import SlateParser
2+
3+
4+
__all__ = ["SlateParser"]
5+
6+
# The package requires Python >=3.10, where importlib.metadata is always part
# of the standard library, so no fallback to the importlib_metadata backport
# is needed.
from importlib.metadata import PackageNotFoundError, version

try:
    __version__ = version("swarmauri_parser_slate")
except PackageNotFoundError:
    # The distribution metadata is unavailable, e.g. when running from a
    # source checkout that has not been installed.
    __version__ = "0.0.0"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,132 @@
1+
from unittest import mock
2+
3+
import os
4+
import pytest
5+
from swarmauri_parser_slate.SlateParser import SlateParser as Parser
6+
from swarmauri_core.documents.IDocument import IDocument
7+
8+
9+
@pytest.mark.unit
def test_parser_resource():
    """
    Check that a newly created parser reports 'Parser' as its resource.
    """
    expected_resource = "Parser"
    instance = Parser()
    assert instance.resource == expected_resource, (
        "The resource attribute should be 'Parser'."
    )
16+
17+
18+
@pytest.mark.unit
def test_parser_type():
    """
    Check that a newly created parser reports 'SlateParser' as its type.
    """
    expected_type = "SlateParser"
    instance = Parser()
    assert instance.type == expected_type, (
        "The type attribute should be 'SlateParser'."
    )
25+
26+
27+
@pytest.mark.unit
def test_parser_serialization():
    """
    Round-trip the parser through its JSON form and check the ID survives.
    """
    original = Parser()
    as_json = original.model_dump_json()
    restored = Parser.model_validate_json(as_json)
    assert original.id == restored.id, (
        "Serialization and deserialization should preserve the parser's ID."
    )
38+
39+
40+
@pytest.mark.unit
def test_parser_success_mock_file_path():
    """
    Test the parser's ability to successfully parse a PDF file and extract text.

    ``slate3k.PDF`` is mocked, so only the parser's own document-building
    logic is exercised; the demo PDF is still opened from disk.
    """
    # Resolve the fixture relative to this test file rather than calling
    # os.chdir(), which would change the working directory for every test
    # that runs afterwards in the same process.
    file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "resources", "demo.pdf"
    )
    parser = Parser()

    with mock.patch("slate3k.PDF") as mock_pdf_reader:
        # The parser iterates the PDF object, so a list of page strings is
        # a sufficient stand-in.
        mock_pdf_reader.return_value = ["Sample text from page 1."]

        # Call the parser's parse method
        documents = parser.parse(file_path)

        # Assertions
        mock_pdf_reader.assert_called_once()

        assert len(documents) == 1, "Parser should return a list with one document."
        assert isinstance(documents[0], IDocument), (
            "Returned object should be an instance of IDocument."
        )
        assert documents[0].content == "Sample text from page 1.", (
            "Extracted content does not match expected."
        )
        assert documents[0].metadata["page_number"] == 1, (
            "Metadata 'page_number' should be 1."
        )
        assert documents[0].metadata["source"] == file_path, (
            "Metadata 'source' should match the file path."
        )
73+
74+
@pytest.mark.unit
def test_parser_success_file_path():
    """
    Test the parser's ability to successfully read and parse a PDF file and extract text.

    Unlike the mocked variant, this reads the real ``resources/demo.pdf``
    fixture through slate3k.
    """
    # Resolve the fixture relative to this test file rather than calling
    # os.chdir(), which would change the working directory for every test
    # that runs afterwards in the same process.
    file_path = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "resources", "demo.pdf"
    )
    parser = Parser()

    # Call the parser's parse method
    documents = parser.parse(file_path)

    assert len(documents) == 1, "Parser should return a list with one document."
    assert isinstance(documents[0], IDocument), (
        "Returned object should be an instance of IDocument."
    )
    assert documents[0].content == "This is a demo pdf", (
        "Extracted content does not match expected."
    )
    assert documents[0].metadata["page_number"] == 1, (
        "Metadata 'page_number' should be 1."
    )
    assert documents[0].metadata["source"] == file_path, (
        "Metadata 'source' should match the file path."
    )
100+
101+
102+
@pytest.mark.unit
def test_parser_invalid_source():
    """
    Check that parsing a non-string source raises TypeError with the expected message.
    """
    expected_message = "Source must be of type str (file path) or bytes."
    not_a_path = 12345  # Not a str or bytes
    instance = Parser()

    with pytest.raises(TypeError) as raised:
        instance.parse(not_a_path)

    actual_message = str(raised.value)
    assert expected_message in actual_message, (
        "TypeError not raised as expected."
    )
116+
117+
118+
@pytest.mark.unit
def test_parser_exception_handling():
    """
    Check that parsing a path that does not exist yields an empty result.
    """
    missing_pdf = "non_existent_file.pdf"
    instance = Parser()

    # The file cannot be opened, so the parser is expected to return no documents.
    result = instance.parse(missing_pdf)

    assert len(result) == 0, (
        "Parser should return an empty list when an error occurs."
    )
Binary file not shown.

pkgs/pyproject.toml

+1
Original file line numberDiff line numberDiff line change
@@ -105,6 +105,7 @@ members = [
105105
"community/swarmauri_toolkit_jupytertoolkit",
106106
"community/swarmauri_tool_jupyterexportlatex",
107107
"community/swarmauri_tool_jupytergetiopubmessage",
108+
"community/swarmauri_parser_slate"
108109

109110
]
110111

0 commit comments

Comments
 (0)