Merge pull request #1306 from VijayVignesh1/community/slatepdfparser

cobycloud · web-flow · commit eccf950a98e7 · 2025-04-28T02:57:09.000-05:00
Community Feature: PDF SlateParser
diff --git a/pkgs/community/swarmauri_parser_slate/README.md b/pkgs/community/swarmauri_parser_slate/README.md
@@ -0,0 +1,39 @@
+![Swarmauri Logo](https://res.cloudinary.com/dbjmpekvl/image/upload/v1730099724/Swarmauri-logo-lockup-2048x757_hww01w.png)
+
+<div align="center">
+
+![PyPI - Downloads](https://img.shields.io/pypi/dm/swarmauri_parser_slate)
+![PyPI - Python Version](https://img.shields.io/pypi/pyversions/swarmauri_parser_slate)
+![PyPI - License](https://img.shields.io/pypi/l/swarmauri_parser_slate)
+![PyPI - Version](https://img.shields.io/pypi/v/swarmauri_parser_slate?label=swarmauri_parser_slate&color=green)
+
+</div>
+
+---
+
+# Swarmauri SlateParser
+
+A parser for extracting text from PDF files, page by page, using Slate (slate3k).
+
+## Installation
+
+```bash
+pip install swarmauri_parser_slate
+```
+
+## Usage
+Basic usage example with code snippet:
+```python
+from swarmauri_parser_slate import SlateParser
+
+parser = SlateParser()
+file_path = "path/to/your/pdf_file.pdf"
+documents = parser.parse(file_path)
+
+for document in documents:
+    print(document.content)
+```
+
+## Want to help?
+
+If you want to contribute to swarmauri-sdk, read up on our [guidelines for contributing](https://github.com/swarmauri/swarmauri-sdk/blob/master/contributing.md) that will help you get started.
diff --git a/pkgs/community/swarmauri_parser_slate/pyproject.toml b/pkgs/community/swarmauri_parser_slate/pyproject.toml
@@ -0,0 +1,65 @@
+[project]
+name = "swarmauri_parser_slate"
+version = "0.1.0.dev1"
+description = "A parser for extracting text from PDFs using Slate."
+license = "Apache-2.0"
+readme = "README.md"
+repository = "http://github.com/swarmauri/swarmauri-sdk"
+requires-python = ">=3.10,<3.13"
+classifiers = [
+    "License :: OSI Approved :: Apache Software License",
+    "Programming Language :: Python :: 3.10",
+    "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
+]
+authors = [{ name = "Vijay Vignesh", email = "vijayvigneshp02@gmail.com" }]
+dependencies = [
+    "slate3k>=0.5",
+    "swarmauri_core",
+    "swarmauri_base",
+    "swarmauri_standard",
+]
+
+[tool.uv.sources]
+swarmauri_core = { workspace = true }
+swarmauri_base = { workspace = true }
+swarmauri_standard = { workspace = true }
+
+[tool.pytest.ini_options]
+norecursedirs = ["combined", "scripts"]
+markers = [
+    "test: standard test",
+    "unit: Unit tests",
+    "i9n: Integration tests",
+    "r8n: Regression tests",
+    "timeout: mark test to timeout after X seconds",
+    "xpass: Expected passes",
+    "xfail: Expected failures",
+    "acceptance: Acceptance tests",
+]
+timeout = 300
+log_cli = true
+log_cli_level = "INFO"
+log_cli_format = "%(asctime)s [%(levelname)s] %(message)s"
+log_cli_date_format = "%Y-%m-%d %H:%M:%S"
+asyncio_default_fixture_loop_scope = "function"
+
+[project.entry-points."swarmauri.parsers"]
+SlateParser = "swarmauri_parser_slate.SlateParser:SlateParser"
+
+[build-system]
+requires = ["poetry-core>=1.0.0"]
+build-backend = "poetry.core.masonry.api"
+
+[dependency-groups]
+dev = [
+    "pytest>=8.0",
+    "pytest-asyncio>=0.24.0",
+    "pytest-xdist>=3.6.1",
+    "pytest-json-report>=1.5.0",
+    "python-dotenv",
+    "requests>=2.32.3",
+    "flake8>=7.0",
+    "pytest-timeout>=2.3.1",
+    "ruff>=0.9.9",
+]
diff --git a/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/SlateParser.py b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/SlateParser.py
@@ -0,0 +1,51 @@
+from typing import List, Literal
+
+import slate3k as slate
+from swarmauri_standard.documents.Document import Document
+from swarmauri_base.parsers.ParserBase import ParserBase
+from swarmauri_base.ComponentBase import ComponentBase
+
+
+@ComponentBase.register_type(ParserBase, "SlateParser")
+class SlateParser(ParserBase):
+    """
+    Parser for reading and extracting data fields from PDF files using Slate3k.
+    """
+
+    type: Literal["SlateParser"] = "SlateParser"
+
+    def parse(self, source: str) -> List[Document]:
+        """
+        Parses a PDF file and extracts its data fields as Document instances.
+
+        Parameters:
+        - source (str): The path to the PDF file.
+
+        Returns:
+        - List[IDocument]: A list containing a single Document instance with the extracted data fields.
+        """
+
+        documents = []
+        if isinstance(source, str):
+            try:
+                with open(source, "rb") as file:
+                    reader = slate.PDF(file)
+                    print(reader)
+                    for page_num, page in enumerate(reader):
+                        text = page
+                        if text:
+                            document = Document(
+                                content=text.strip(),
+                                metadata={
+                                    "page_number": page_num + 1,
+                                    "source": source,
+                                },
+                            )
+                            documents.append(document)
+            except Exception as e:
+                print(f"An error occurred while parsing the PDF '{source}': {e}")
+                return []
+        else:
+            raise TypeError("Source must be of type str (file path) or bytes.")
+        
+        return documents
diff --git a/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/__init__.py b/pkgs/community/swarmauri_parser_slate/swarmauri_parser_slate/__init__.py
@@ -0,0 +1,17 @@
+from .SlateParser import SlateParser
+
+
+__all__ = ["SlateParser"]
+
+# Resolve the metadata helpers, preferring the standard-library module.
+try:
+    # importlib.metadata is part of the standard library from Python 3.8 onward
+    from importlib.metadata import version, PackageNotFoundError
+except ImportError:
+    # Older interpreters need the importlib_metadata backport package instead
+    from importlib_metadata import version, PackageNotFoundError
+
+# Expose the installed distribution's version as __version__.
+try:
+    __version__ = version("swarmauri_parser_slate")
+except PackageNotFoundError:
+    # No installed distribution metadata was found (for example, when running
+    # from a source checkout during development), so use a placeholder version.
+    __version__ = "0.0.0"
diff --git a/pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py b/pkgs/community/swarmauri_parser_slate/tests/SlateParser_test.py
@@ -0,0 +1,132 @@
+from unittest import mock
+
+import os
+import pytest
+from swarmauri_parser_slate.SlateParser import SlateParser as Parser
+from swarmauri_core.documents.IDocument import IDocument
+
+
+@pytest.mark.unit
+def test_parser_resource():
+    """
+    Test to ensure the parser's resource attribute is correctly set.
+    """
+    parser = Parser()
+    assert parser.resource == "Parser", "The resource attribute should be 'Parser'."
+
+
+@pytest.mark.unit
+def test_parser_type():
+    """
+    Test to ensure the parser's type attribute is correctly set.
+    """
+    parser = Parser()
+    assert parser.type == "SlateParser", "The type attribute should be 'SlateParser'."
+
+
+@pytest.mark.unit
+def test_parser_serialization():
+    """
+    Test to ensure the parser can be serialized and deserialized correctly.
+    """
+    parser = Parser()
+    serialized = parser.model_dump_json()
+    deserialized = Parser.model_validate_json(serialized)
+    assert parser.id == deserialized.id, (
+        "Serialization and deserialization should preserve the parser's ID."
+    )
+
+
+@pytest.mark.unit
+def test_parser_success_mock_file_path():
+    """
+    Test the parser's ability to successfully parse a PDF file and extract text.
+    """
+    os.chdir(os.path.dirname(__file__))
+
+    parser = Parser()
+    file_path = "resources/demo.pdf"
+
+    with mock.patch("slate3k.PDF") as mock_pdf_reader:
+
+        mock_pdf_reader.return_value = ['Sample text from page 1.']
+
+        # Call the parser's parse method
+        documents = parser.parse(file_path)
+        
+        # Assertions
+        mock_pdf_reader.assert_called_once()
+
+        assert len(documents) == 1, "Parser should return a list with one document."
+        assert isinstance(documents[0], IDocument), (
+            "Returned object should be an instance of IDocument."
+        )
+        assert documents[0].content == 'Sample text from page 1.', (
+            "Extracted content does not match expected."
+        )
+        assert documents[0].metadata["page_number"] == 1, (
+            "Metadata 'page_number' should be 1."
+        )
+        assert documents[0].metadata["source"] == file_path, (
+            "Metadata 'source' should match the file path."
+        )
+
+@pytest.mark.unit
+def test_parser_success_file_path():
+    """
+    Test the parser's ability to successfully read and parse a PDF file and extract text.
+    """
+    os.chdir(os.path.dirname(__file__))
+
+    parser = Parser()
+    file_path = "resources/demo.pdf"
+
+    # Call the parser's parse method
+    documents = parser.parse(file_path)
+
+    assert len(documents) == 1, "Parser should return a list with one document."
+    assert isinstance(documents[0], IDocument), (
+        "Returned object should be an instance of IDocument."
+    )
+    assert documents[0].content == "This is a demo pdf", (
+        "Extracted content does not match expected."
+    )
+    assert documents[0].metadata["page_number"] == 1, (
+        "Metadata 'page_number' should be 1."
+    )
+    assert documents[0].metadata["source"] == file_path, (
+        "Metadata 'source' should match the file path."
+    )    
+
+
+@pytest.mark.unit
+def test_parser_invalid_source():
+    """
+    Test that the parser raises a TypeError when given an invalid source type.
+    """
+    parser = Parser()
+    invalid_source = 12345  # Not a str or bytes
+
+    with pytest.raises(TypeError) as exc_info:
+        parser.parse(invalid_source)
+
+    assert "Source must be of type str (file path) or bytes." in str(exc_info.value), (
+        "TypeError not raised as expected."
+    )
+
+
+@pytest.mark.unit
+def test_parser_exception_handling():
+    """
+    Test the parser's exception handling when an error occurs during parsing.
+    """
+    parser = Parser()
+    file_path = "non_existent_file.pdf"
+
+    # Call the parser's parse method with a non-existent file
+    documents = parser.parse(file_path)
+
+    # Assertions
+    assert len(documents) == 0, (
+        "Parser should return an empty list when an error occurs."
+    )
diff --git a/pkgs/community/swarmauri_parser_slate/tests/resources/demo.pdf b/pkgs/community/swarmauri_parser_slate/tests/resources/demo.pdf
diff --git a/pkgs/pyproject.toml b/pkgs/pyproject.toml
@@ -105,6 +105,7 @@ members = [
     "community/swarmauri_toolkit_jupytertoolkit",
     "community/swarmauri_tool_jupyterexportlatex",
     "community/swarmauri_tool_jupytergetiopubmessage",
+    "community/swarmauri_parser_slate"
 
 ]
 

Original file line number	Diff line number	Diff line change
`@@ -105,6 +105,7 @@ members = [`
`105`	`105`	`"community/swarmauri_toolkit_jupytertoolkit",`
`106`	`106`	`"community/swarmauri_tool_jupyterexportlatex",`
`107`	`107`	`"community/swarmauri_tool_jupytergetiopubmessage",`
	`108`	`+ "community/swarmauri_parser_slate"`
`108`	`109`
`109`	`110`	`]`
`110`	`111`