Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
01d62ca
feat: replace pdfminer with paves.miner
Aug 1, 2025
1df8a60
deps: update deps
Aug 1, 2025
a57f753
feat!: remove dependency on pypdf by not extracting pages
Aug 1, 2025
36e383c
fix: nope gotta use those laparams
Aug 1, 2025
9fd1992
chore: foo
Aug 1, 2025
db45d7b
fix: support parallel like before
Aug 1, 2025
1b3eb72
fix: make error messages match
Aug 1, 2025
85ee346
fix: raise from
Aug 1, 2025
9fe7ce7
fix!: allow converting other than page 1
Aug 1, 2025
ff6e1ac
fix: update error message
Aug 1, 2025
f4a9dca
fix: rely on new playa that lets you set rotation
Aug 1, 2025
3b7a290
fix: apply rotation in image processing and plotting
Aug 2, 2025
241245e
chore: lock
Aug 2, 2025
69eb340
fix: apply rotation to threshold too
Aug 2, 2025
6028b12
fix(tests): ensure ultimate error message compatibility
Aug 2, 2025
aed3c7b
fix(types): fix types
Aug 2, 2025
44d3bb1
chore: isort
Aug 2, 2025
3c500b2
chore: blacken
Aug 2, 2025
0a5557d
revert: go back to master for ntoebook
Aug 2, 2025
2fa6884
fix: is_extractable works in strange ways...
Aug 2, 2025
02d3fd0
Revert "fix: is_extractable works in strange ways..."
Aug 2, 2025
059a05c
fix(tests): verify that no-extraction is respected
Aug 2, 2025
af23ad9
fix: remove unused import
dhdaines Aug 16, 2025
def20ac
fix(types): remove test that cannot possibly work and has bad types
dhdaines Aug 16, 2025
94aa235
fix: render the correct page and do not save images in lattice parser
dhdaines Aug 17, 2025
1caec0c
fix(deps): restore python 3.8 compatibility with latest playa
Aug 17, 2025
3d73d25
fix(tests): add setuptools dependency to hopefully fix py3.8 tests
Aug 20, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion camelot/backends/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,6 @@ def installed(self) -> bool: # noqa D102
raise NotImplementedError

def convert( # noqa D102
self, pdf_path: str, png_path: str, resolution: int = 300
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None: # noqa D102
raise NotImplementedError
8 changes: 7 additions & 1 deletion camelot/backends/ghostscript_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,9 @@
class GhostscriptBackend(ConversionBackend):
"""Conversion backend that renders a PDF page to a PNG image using Ghostscript."""

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert one page of a PDF to a PNG image using Ghostscript.

Parameters
Expand All @@ -17,6 +19,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
[description]
resolution : int, optional
[description], by default 300
page: int, optional
Single page to convert.

Raises
------
Expand All @@ -35,6 +39,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
"gs",
"-q",
"-sDEVICE=png16m",
f"-dFirstPage={page}",
f"-dLastPage={page}",
"-o",
png_path,
f"-r{resolution}",
Expand Down
8 changes: 5 additions & 3 deletions camelot/backends/image_conversion.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,7 @@ def implements_convert():

return backend

def convert(self, pdf_path: str, png_path: str) -> None:
def convert(self, pdf_path: str, png_path: str, page: int = 1) -> None:
"""Convert PDF to png_path.

Parameters
Expand All @@ -113,6 +113,8 @@ def convert(self, pdf_path: str, png_path: str) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page: int, optional
Single page to convert.

Raises
------
Expand All @@ -122,13 +124,13 @@ def convert(self, pdf_path: str, png_path: str) -> None:
[description]
"""
try:
self.backend.convert(pdf_path, png_path)
self.backend.convert(pdf_path, png_path, page=page)
except Exception as f:
if self.use_fallback:
for fallback in self.fallbacks:
try:
converter = BACKENDS[fallback]()
converter.convert(pdf_path, png_path)
converter.convert(pdf_path, png_path, page=page)
except Exception as e:
msg = f"Image conversion failed with image conversion backend {fallback!r}\n error: {e}"
raise ImageConversionError(msg) from e
Expand Down
8 changes: 6 additions & 2 deletions camelot/backends/pdfium_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ def installed(self) -> bool: # noqa D102
return True
return False

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert PDF to png.

Parameters
Expand All @@ -29,6 +31,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page: int, optional
Single page to convert.

Raises
------
Expand All @@ -39,7 +43,7 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
raise OSError(f"pypdfium2 is not available: {PDFIUM_EXC!r}")
doc = pdfium.PdfDocument(pdf_path)
doc.init_forms()
image = doc[0].render(scale=resolution / 72).to_pil()
image = doc[page - 1].render(scale=resolution / 72).to_pil()
image.save(png_path)
image.close()
doc.close()
23 changes: 19 additions & 4 deletions camelot/backends/poppler_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,9 @@
class PopplerBackend(ConversionBackend):
"""Conversion backend that renders a PDF page to a PNG image using Poppler."""

def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
def convert(
self, pdf_path: str, png_path: str, resolution: int = 300, page: int = 1
) -> None:
"""Convert PDF to png.

Parameters
Expand All @@ -31,6 +33,8 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
Path where to read the pdf file.
png_path : str
Path where to save png file.
page: int, optional
Single page to convert.

Raises
------
Expand All @@ -39,13 +43,24 @@ def convert(self, pdf_path: str, png_path: str, resolution: int = 300) -> None:
ValueError
[description]
"""
pdftopng_executable = shutil.which("pdftopng", path=path)
pdftopng_executable = shutil.which("pdftocairo", path=path)
if pdftopng_executable is None:
raise OSError(
"pdftopng is not installed. You can install it using the 'pip install pdftopng' command."
"pdftocairo is not installed. Please install `poppler-utils`."
)

pdftopng_command = [pdftopng_executable, pdf_path, png_path]
png_stem, _ = os.path.splitext(png_path)
pdftopng_command = [
pdftopng_executable,
"-png",
"-singlefile",
"-f",
str(page),
"-l",
str(page),
pdf_path,
png_stem,
]

try:
subprocess.check_output(
Expand Down
6 changes: 4 additions & 2 deletions camelot/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from typing_extensions import TypedDict, Unpack

from .backends import ImageConversionBackend
from .image_processing import undo_rotation
from .utils import build_file_path_in_temp_dir
from .utils import get_index_closest_point
from .utils import get_textline_coords
Expand Down Expand Up @@ -548,6 +549,7 @@ def __init__(self, cols, rows):
self.filename = None
self.order = None
self.page = None
self.rotation = ""
self.flavor = None # Flavor of the parser that generated the table
self.pdf_size = None # Dimensions of the original PDF page
self._bbox = None # Bounding box in original document
Expand Down Expand Up @@ -618,8 +620,8 @@ def get_pdf_image(self):
os.path.basename(self.filename), ".png"
)
backend = ImageConversionBackend(use_fallback=True)
backend.convert(self.filename, self._image_path)
self._image = cv2.imread(self._image_path)
backend.convert(self.filename, self._image_path, page=self.page)
self._image = undo_rotation(cv2.imread(self._image_path), self.rotation)
return self._image

def set_all_edges(self):
Expand Down
Loading
Loading