Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 9 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/location.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,15 @@
from functools import total_ordering
from typing import Any, Dict
from typing import Any, Dict, Optional

from dedocutils.data_structures import BBox


@total_ordering
class Location:
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0, page_width: int = None, page_height: int = None) -> None:
self.page_number = page_number
self.page_width = page_width
self.page_height = page_height
self.bbox = bbox
self.name = name
# TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
Expand All @@ -16,6 +18,11 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
def shift(self, shift_x: int, shift_y: int) -> None:
    """
    Move this location's bounding box by the given offsets.

    The call is delegated to ``self.bbox.shift``; nothing is returned.

    :param shift_x: offset passed as the first argument to ``bbox.shift``
    :param shift_y: offset passed as the second argument to ``bbox.shift``
    """
    box = self.bbox
    box.shift(shift_x, shift_y)

def to_relative_bbox_dict(self) -> Optional[Dict]:
    """
    Build the page-relative dictionary form of this location's bounding box.

    :return: the result of ``self.bbox.to_relative_dict(page_width, page_height)``,
        or ``None`` when either page dimension is missing (``None``) or zero
    """
    # Both dimensions must be known and non-zero before the conversion is attempted.
    if self.page_height and self.page_width:
        return self.bbox.to_relative_dict(self.page_width, self.page_height)
    return None

def to_dict(self) -> Dict[str, Any]:
from collections import OrderedDict

Expand Down
4 changes: 2 additions & 2 deletions dedoc/readers/pdf_reader/data_classes/tables/scantable.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,11 @@ class ScanTable(Table):
Utility class for storing recognized tables from document images. The class
:class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
"""
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1, page_width: int = None, page_height: int = None) -> None:

super().__init__(cells, TableMetadata(page_id=page_number))
self.order = order
self.locations = [Location(page_number, bbox)]
self.locations = [Location(page_number, bbox, page_width=page_width, page_height=page_height)]

def extended(self, table: "ScanTable") -> None:
# extend locations
Expand Down
24 changes: 16 additions & 8 deletions dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
lines, headers, footers = footer_header_analysis(lines)
all_lines = list(flatten(lines))
if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
self._shift_all_contents(lines=all_lines, onepage_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)

Expand All @@ -156,27 +156,35 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
page_range = range(first_page, first_page + len(gost_analyzed_images))
gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))

if isinstance(self, PdfTxtlayerReader):
self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))

result = Parallel(n_jobs=self.config["n_jobs"])(
delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
gost_analyzed_images.items()
)
return result, gost_analyzed_images

def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
def _shift_all_contents(self, lines: List[LineWithMeta], onepage_tables: List[ScanTable], attachments: List[PdfImageAttachment],
gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
"""
Shift all recognized content relative to the original source image
"""
# shift one-page tables (their locations and cells)
for scan_table in unref_tables:
for scan_table in onepage_tables:
for location in scan_table.locations:
table_page_number = location.page_number
location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
page_number = location.page_number
location.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, shift_y=gost_analyzed_images[page_number][1].y_top_left)
location.page_width, location.page_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]

page_number = scan_table.locations[0].page_number
for row in scan_table.cells:
for cell in row:
image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
orig_image_width, orig_image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
gost_frame_bbox = gost_analyzed_images[page_number][1]
shift_x, shift_y = gost_frame_bbox.x_top_left, gost_frame_bbox.y_top_left
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=orig_image_width, image_height=orig_image_height)

# shift attachments
for attachment in attachments:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me

self.single_tables = single_tables
multipages_tables = []
list_page_with_tables = []
table_pages = list(map(lambda t: t.location.page_number, single_tables))
max_page_with_table = max(table_pages, default=0)
min_page_with_table = min(table_pages, default=max_page_with_table)
Expand All @@ -36,10 +35,8 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me
}

total_cur_page = min_page_with_table
if max_page_with_table == 1: # check on unnecessary this block
for tbls in list_page_with_tables:
multipages_tables.extend(tbls)
return multipages_tables
if max_page_with_table == 0: # check on unnecessary this block
return single_tables

while total_cur_page < max_page_with_table + 1:
begin_page = total_cur_page
Expand Down Expand Up @@ -180,5 +177,14 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
self.logger.debug("Different width columns")
return False

# condition 5. Check table layout
t1_relative_bb = t1.locations[-1].to_relative_bbox_dict()
t2_relative_bb = t2.locations[0].to_relative_bbox_dict()
if t1_relative_bb and t2_relative_bb:
t1_bottom = t1_relative_bb["y_top_left"] + t1_relative_bb["height"] # the end of the table should be at the end of the page
t2_top = t2_relative_bb["y_top_left"] # the beginning of the table should be in the beginning of the page
if t1_bottom < 0.7 or t2_top > 0.3:
return False

t2.cells = copy.deepcopy(t2_update.cells) # save changes
return True
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
for i, row in enumerate(matrix):
matrix[i] = sorted(row, key=lambda cell: cell.bbox.x_top_left, reverse=False)

matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number)
page_height, page_width = self.image.shape[:2]
matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number, page_width=page_width, page_height=page_height)

return matrix_table

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,8 @@ def __get_tables(self, page: dict) -> List[ScanTable]:

try:
cells = self.table_extractor.handle_cells(cells)
scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order))
scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order,
page_width=page_width, page_height=page_height))
except Exception as ex:
self.logger.warning(f"Warning: unrecognized table on page {page_number}. {ex}")
if self.config.get("debug_mode", False):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tup
shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left # shift tables to original coordinates
for location in table.locations:
location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
location.page_height, location.page_width = image_height, image_width
for row in table.cells:
for cell in row:
cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
Expand Down
7 changes: 6 additions & 1 deletion tests/api_tests/test_api_module_table_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def test_detect_small_table(self) -> None:

def test_multipage_gost_table_image(self) -> None:
file_name = "gost_multipage_table.pdf"
result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"}) # don't pass pdf_with_text_layer to check condition in PDFBaseReader
result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "false"})
self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35)
target_bbox_dict = {
"x_top_left": 0.14,
Expand Down Expand Up @@ -311,6 +311,11 @@ def test_multipage_tables_1(self) -> None:
self.assertEqual(2, len(result["content"]["tables"]))

def test_multipage_tables_2(self) -> None:
    """
    Request pages "78:79" of MIPS64.pdf with ``pdf_with_text_layer="false"``
    and expect exactly two tables in the response content.
    """
    request_data = {"language": "rus+eng", "pages": "78:79", "pdf_with_text_layer": "false"}
    response = self._send_request("MIPS64.pdf", data=request_data)

    tables = response["content"]["tables"]
    self.assertEqual(2, len(tables))

def test_multipage_tables_3(self) -> None:
    """
    Request pages "394:395" of MIPS64.pdf and expect exactly two tables
    in the response content.
    """
    request_data = {"language": "rus+eng", "pages": "394:395"}
    response = self._send_request("MIPS64.pdf", data=request_data)

    tables = response["content"]["tables"]
    self.assertEqual(2, len(tables))