diff --git a/pkgs/community/swarmauri_parser_slate/README.md b/pkgs/community/swarmauri_parser_slate/README.md new file mode 100644 index 000000000..200c9db7b --- /dev/null +++ b/pkgs/community/swarmauri_parser_slate/README.md @@ -0,0 +1,39 @@ +![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png) + +
+ +![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate) +![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate) +![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate) +![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green) + +
+ +--- + +# Swarmauri SlateParser + +A parser for reading and extracting data fields from PDF files using Slate. + +## Installation + +```bash +pip install swarmauri_parser_slate +``` + +## Usage +Basic usage example with code snippet: +```python +from swarmauri.parsers.SlateParser import SlateParser + +parser = SlateParser() +file_path = "path/to/your/pdf_file.pdf" +documents = parser.parse(file_path) + +for document in documents: + print(document.content) +``` + +## Want to help? + +If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started. diff --git a/pkgs/community/swarmauri_parser_slate/pyproject.toml b/pkgs/community/swarmauri_parser_slate/pyproject.toml new file mode 100644 index 000000000..c56abb935 --- /dev/null +++ b/pkgs/community/swarmauri_parser_slate/pyproject.toml @@ -0,0 +1,65 @@ +[project] +name = "swarmauri_parser_slate" +version = "0.1.0.dev1" +description = "A parser for extracting text from PDFs using Slate." 
+license = "Apache-2.0" +readme = "README.md" +repository = "http://github.com/swarmauri/swarmauri-sdk" +requires-python = ">=3.10,<3.13" +classifiers = [ + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", +] +authors = [{ name = "Vijay Vignesh", email = "vijayvigneshp02@gmail.com" }] +dependencies = [ + "slate3k>=0.5", + "swarmauri_core", + "swarmauri_base", + "swarmauri_standard", +] + +[tool.uv.sources] +swarmauri_core = { workspace = true } +swarmauri_base = { workspace = true } +swarmauri_standard = { workspace = true } + +[tool.pytest.ini_options] +norecursedirs = ["combined", "scripts"] +markers = [ + "test: standard test", + "unit: Unit tests", + "i9n: Integration tests", + "r8n: Regression tests", + "timeout: mark test to timeout after X seconds", + "xpass: Expected passes", + "xfail: Expected failures", + "acceptance: Acceptance tests", +] +timeout = 300 +log_cli = true +log_cli_level = "INFO" +log_cli_format = "%(asctime)s [%(levelname)s] %(message)s" +log_cli_date_format = "%Y-%m-%d %H:%M:%S" +asyncio_default_fixture_loop_scope = "function" + +[project.entry-points."swarmauri.parsers"] +SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser" + +[build-system] +requires = ["poetry-core>=1.0.0"] +build-backend = "poetry.core.masonry.api" + +[dependency-groups] +dev = [ + "pytest>=8.0", + "pytest-asyncio>=0.24.0", + "pytest-xdist>=3.6.1", + "pytest-json-report>=1.5.0", + "python-dotenv", + "requests>=2.32.3", + "flake8>=7.0", + "pytest-timeout>=2.3.1", + "ruff>=0.9.9", +] diff --git a/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/SlateParser.py b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/SlateParser.py new file mode 100644 index 000000000..5f2d22787 --- /dev/null +++ b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/SlateParser.py @@ -0,0 +1,51 @@ +from typing 
import List, Literal +import logging + +import slate3k as slate +from swarmauri_standard.documents.Document import Document +from swarmauri_base.parsers.ParserBase import ParserBase +from swarmauri_base.ComponentBase import ComponentBase + +logger = logging.getLogger(__name__) + + +@ComponentBase.register_type(ParserBase, "SlateParser") +class SlateParser(ParserBase): + """ + Parser that extracts the text of each page of a PDF file using Slate3k. + """ + + type: Literal["SlateParser"] = "SlateParser" + + def parse(self, source: str) -> List[Document]: + """ + Parses a PDF file and returns one Document per page that has text. + + Parameters: + - source (str): The path to the PDF file. + + Returns: + - List[Document]: One Document per non-empty page. Each holds the + stripped page text as content and, as metadata, the 1-based + "page_number" and the "source" path. An empty list is returned + if the file cannot be opened or parsed. + + Raises: + - TypeError: If source is not a str. + """ + # Guard clause: reject unsupported input before touching the filesystem. + if not isinstance(source, str): + raise TypeError("Source must be of type str (file path) or bytes.") + + try: + with open(source, "rb") as file: + pages = slate.PDF(file) + return [ + Document( + content=text.strip(), + metadata={"page_number": page_num, "source": source}, + ) + for page_num, text in enumerate(pages, start=1) + if text + ] + except Exception as e: + # Best-effort contract: any read/parse failure is reported and + # yields an empty result instead of propagating to the caller. + logger.error( + "An error occurred while parsing the PDF '%s': %s", source, e + ) + return [] diff --git a/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/__init__.py b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/__init__.py new file mode 100644 index 000000000..c7cfecc78 --- /dev/null +++ b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/__init__.py @@ -0,0 +1,17 @@ +from .SlateParser import SlateParser + + +__all__ = ["SlateParser"] + +try: + # For Python 3.8 and newer + from importlib.metadata import version, PackageNotFoundError +except ImportError: + # For older Python versions, use the backport + from importlib_metadata import version, PackageNotFoundError + +try: + __version__ = version("swarmauri_parser_slate") 
+except PackageNotFoundError: + # If the package is not installed (for example, during development) + __version__ = "0.0.0" diff --git a/pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py b/pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py new file mode 100644 index 000000000..2d6b9d668 --- /dev/null +++ b/pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py @@ -0,0 +1,132 @@ +from unittest import mock + +import os +import pytest +from swarmauri_parser_slate.SlateParser import SlateParser as Parser +from swarmauri_core.documents.IDocument import IDocument + + +@pytest.mark.unit +def test_parser_resource(): + """ + Test to ensure the parser's resource attribute is correctly set. + """ + parser = Parser() + assert parser.resource == "Parser", "The resource attribute should be 'Parser'." + + +@pytest.mark.unit +def test_parser_type(): + """ + Test to ensure the parser's type attribute is correctly set. + """ + parser = Parser() + assert parser.type == "SlateParser", "The type attribute should be 'SlateParser'." + + +@pytest.mark.unit +def test_parser_serialization(): + """ + Test to ensure the parser can be serialized and deserialized correctly. + """ + parser = Parser() + serialized = parser.model_dump_json() + deserialized = Parser.model_validate_json(serialized) + assert parser.id == deserialized.id, ( + "Serialization and deserialization should preserve the parser's ID." + ) + + +@pytest.mark.unit +def test_parser_success_mock_file_path(): + """ + Test the parser's ability to successfully parse a PDF file and extract text. 
+ """ + # Resolve the fixture relative to this file rather than calling + # os.chdir(), which would change the working directory for every + # other test running in the same process. + file_path = os.path.join(os.path.dirname(__file__), "resources", "demo.pdf") + + parser = Parser() + + with mock.patch("slate3k.PDF") as mock_pdf_reader: + + mock_pdf_reader.return_value = ['Sample text from page 1.'] + + # Call the parser's parse method + documents = parser.parse(file_path) + + # Assertions + mock_pdf_reader.assert_called_once() + + assert len(documents) == 1, "Parser should return a list with one document." + assert isinstance(documents[0], IDocument), ( + "Returned object should be an instance of IDocument." + ) + assert documents[0].content == 'Sample text from page 1.', ( + "Extracted content does not match expected." + ) + assert documents[0].metadata["page_number"] == 1, ( + "Metadata 'page_number' should be 1." + ) + assert documents[0].metadata["source"] == file_path, ( + "Metadata 'source' should match the file path." + ) + + +@pytest.mark.unit +def test_parser_success_file_path(): + """ + Test the parser's ability to successfully read and parse a PDF file and extract text. + """ + # Resolve the fixture relative to this file rather than calling + # os.chdir(), which would change the working directory for every + # other test running in the same process. + file_path = os.path.join(os.path.dirname(__file__), "resources", "demo.pdf") + + parser = Parser() + + # Call the parser's parse method + documents = parser.parse(file_path) + + assert len(documents) == 1, "Parser should return a list with one document." + assert isinstance(documents[0], IDocument), ( + "Returned object should be an instance of IDocument." + ) + assert documents[0].content == "This is a demo pdf", ( + "Extracted content does not match expected." + ) + assert documents[0].metadata["page_number"] == 1, ( + "Metadata 'page_number' should be 1." + ) + assert documents[0].metadata["source"] == file_path, ( + "Metadata 'source' should match the file path." + ) + + +@pytest.mark.unit +def test_parser_invalid_source(): + """ + Test that the parser raises a TypeError when given an invalid source type. 
+ """ + parser = Parser() + invalid_source = 12345 # Not a str or bytes + + with pytest.raises(TypeError) as exc_info: + parser.parse(invalid_source) + + assert "Source must be of type str (file path) or bytes." in str(exc_info.value), ( + "TypeError not raised as expected." + ) + + +@pytest.mark.unit +def test_parser_exception_handling(): + """ + Test the parser's exception handling when an error occurs during parsing. + """ + parser = Parser() + file_path = "non_existent_file.pdf" + + # Call the parser's parse method with a non-existent file + documents = parser.parse(file_path) + + # Assertions + assert len(documents) == 0, ( + "Parser should return an empty list when an error occurs." + ) diff --git a/pkgs/community/swarmauri_parser_slate/tests/resources/demo.pdf b/pkgs/community/swarmauri_parser_slate/tests/resources/demo.pdf new file mode 100644 index 000000000..eedb59082 Binary files /dev/null and b/pkgs/community/swarmauri_parser_slate/tests/resources/demo.pdf differ diff --git a/pkgs/pyproject.toml b/pkgs/pyproject.toml index badef4732..8e6865756 100644 --- a/pkgs/pyproject.toml +++ b/pkgs/pyproject.toml @@ -105,6 +105,7 @@ members = [ "community/swarmauri_toolkit_jupytertoolkit", "community/swarmauri_tool_jupyterexportlatex", "community/swarmauri_tool_jupytergetiopubmessage", + "community/swarmauri_parser_slate" ]