ispras · oksidgy · Sep 8, 2025 · Sep 4, 2025 · Sep 4, 2025 · Sep 4, 2025
diff --git a/dedoc/readers/pdf_reader/data_classes/tables/location.py b/dedoc/readers/pdf_reader/data_classes/tables/location.py
@@ -1,13 +1,15 @@
 from functools import total_ordering
-from typing import Any, Dict
+from typing import Any, Dict, Optional
 
 from dedocutils.data_structures import BBox
 
 
 @total_ordering
 class Location:
-    def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0) -> None:
+    def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle: float = 0.0, page_width: int = None, page_height: int = None) -> None:
         self.page_number = page_number
+        self.page_width = page_width
+        self.page_height = page_height
         self.bbox = bbox
         self.name = name
         # TODO put self.order (change LineWithLocation, PdfImageAttachment, ScanTable)
@@ -16,6 +18,11 @@ def __init__(self, page_number: int, bbox: BBox, name: str = "", rotated_angle:
     def shift(self, shift_x: int, shift_y: int) -> None:
         self.bbox.shift(shift_x, shift_y)
 
+    def to_relative_bbox_dict(self) -> Optional[Dict]:
+        if not self.page_height or not self.page_width:
+            return None
+        return self.bbox.to_relative_dict(self.page_width, self.page_height)
+
     def to_dict(self) -> Dict[str, Any]:
         from collections import OrderedDict
 

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py
@@ -14,11 +14,11 @@ class ScanTable(Table):
     Utility class for storing recognized tables from document images. The class
     :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class.
     """
-    def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None:
+    def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1, page_width: int = None, page_height: int = None) -> None:
 
         super().__init__(cells, TableMetadata(page_id=page_number))
         self.order = order
-        self.locations = [Location(page_number, bbox)]
+        self.locations = [Location(page_number, bbox, page_width=page_width, page_height=page_height)]
 
     def extended(self, table: "ScanTable") -> None:
         # extend locations

diff --git a/dedoc/readers/pdf_reader/pdf_base_reader.py b/dedoc/readers/pdf_reader/pdf_base_reader.py
@@ -136,7 +136,7 @@ def _parse_document(self, path: str, parameters: ParametersForParseDoc) -> (
             lines, headers, footers = footer_header_analysis(lines)
             all_lines = list(flatten(lines))
         if parameters.need_gost_frame_analysis and isinstance(self, PdfImageReader):
-            self._shift_all_contents(lines=all_lines, unref_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
+            self._shift_all_contents(lines=all_lines, onepage_tables=unref_tables, attachments=attachments, gost_analyzed_images=gost_analyzed_images)
         mp_tables = self.table_recognizer.convert_to_multipages_tables(unref_tables, lines_with_meta=all_lines)
         all_lines_with_links = self.linker.link_objects(lines=all_lines, tables=mp_tables, images=attachments)
 
@@ -156,27 +156,35 @@ def _process_document_with_gost_frame(self, images: Iterator[ndarray], first_pag
         gost_analyzed_images = Parallel(n_jobs=self.config["n_jobs"])(delayed(self.gost_frame_recognizer.rec_and_clean_frame)(image) for image in images)
         page_range = range(first_page, first_page + len(gost_analyzed_images))
         gost_analyzed_images = dict(zip(page_range, gost_analyzed_images))
+
         if isinstance(self, PdfTxtlayerReader):
             self.gost_frame_boxes = dict(zip(page_range, [(item[1], item[2]) for item in gost_analyzed_images.values()]))
+
         result = Parallel(n_jobs=self.config["n_jobs"])(
             delayed(self._process_one_page)(image, parameters, page_number, path) for page_number, (image, box, original_image_shape) in
             gost_analyzed_images.items()
         )
         return result, gost_analyzed_images
 
-    def _shift_all_contents(self, lines: List[LineWithMeta], unref_tables: List[ScanTable], attachments: List[PdfImageAttachment],
+    def _shift_all_contents(self, lines: List[LineWithMeta], onepage_tables: List[ScanTable], attachments: List[PdfImageAttachment],
                             gost_analyzed_images: Dict[int, Tuple[ndarray, BBox, Tuple[int, ...]]]) -> None:
+        """
+        Shift all recognized content into the coordinates of the original source image.
+        """
-        # shift unref_tables
+        # shift onepage_tables
-        for scan_table in unref_tables:
+        for scan_table in onepage_tables:
             for location in scan_table.locations:
-                table_page_number = location.page_number
-                location.shift(shift_x=gost_analyzed_images[table_page_number][1].x_top_left, shift_y=gost_analyzed_images[table_page_number][1].y_top_left)
+                page_number = location.page_number
+                location.shift(shift_x=gost_analyzed_images[page_number][1].x_top_left, shift_y=gost_analyzed_images[page_number][1].y_top_left)
+                location.page_width, location.page_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
+
             page_number = scan_table.locations[0].page_number
             for row in scan_table.cells:
                 for cell in row:
-                    image_width, image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
-                    shift_x, shift_y = (gost_analyzed_images[page_number][1].x_top_left, gost_analyzed_images[page_number][1].y_top_left)
-                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)
+                    orig_image_width, orig_image_height = gost_analyzed_images[page_number][2][1], gost_analyzed_images[page_number][2][0]
+                    gost_frame_bbox = gost_analyzed_images[page_number][1]
+                    shift_x, shift_y = gost_frame_bbox.x_top_left, gost_frame_bbox.y_top_left
+                    cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=orig_image_width, image_height=orig_image_height)
 
         # shift attachments
         for attachment in attachments:

diff --git a/...reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py b/...reader/table_recognizer/table_extractors/concrete_extractors/multipage_table_extractor.py
@@ -23,7 +23,6 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me
 
         self.single_tables = single_tables
         multipages_tables = []
-        list_page_with_tables = []
         table_pages = list(map(lambda t: t.location.page_number, single_tables))
         max_page_with_table = max(table_pages, default=0)
         min_page_with_table = min(table_pages, default=max_page_with_table)
@@ -36,10 +35,8 @@ def extract_multipage_tables(self, single_tables: List[ScanTable], lines_with_me
         }
 
         total_cur_page = min_page_with_table
-        if max_page_with_table == 1:  # check on unnecessary this block
-            for tbls in list_page_with_tables:
-                multipages_tables.extend(tbls)
-            return multipages_tables
+        if max_page_with_table == 0:  # TODO: check whether this block is unnecessary
+            return single_tables
 
         while total_cur_page < max_page_with_table + 1:
             begin_page = total_cur_page
@@ -180,5 +177,14 @@ def __is_one_table(self, t1: ScanTable, t2: ScanTable) -> bool:
                 self.logger.debug("Different width columns")
             return False
 
+        # condition 5. Check table layout
+        t1_relative_bb = t1.locations[-1].to_relative_bbox_dict()
+        t2_relative_bb = t2.locations[0].to_relative_bbox_dict()
+        if t1_relative_bb and t2_relative_bb:
+            t1_bottom = t1_relative_bb["y_top_left"] + t1_relative_bb["height"]  # the first table should end near the bottom of its page
+            t2_top = t2_relative_bb["y_top_left"]                                # the second table should start near the top of its page
+            if t1_bottom < 0.7 or t2_top > 0.3:
+                return False
+
         t2.cells = copy.deepcopy(t2_update.cells)  # save changes
         return True
diff --git a/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py b/...e_reader/table_recognizer/table_extractors/concrete_extractors/onepage_table_extractor.py
@@ -73,7 +73,8 @@ def __get_matrix_table_from_tree(self, table_tree: TableTree) -> ScanTable:
         for i, row in enumerate(matrix):
             matrix[i] = sorted(row, key=lambda cell: cell.bbox.x_top_left, reverse=False)
 
-        matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number)
+        page_height, page_width = self.image.shape[:2]
+        matrix_table = ScanTable(cells=matrix, bbox=table_tree.cell_box, page_number=self.page_number, page_width=page_width, page_height=page_height)
 
         return matrix_table
 

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_tabby_reader.py
@@ -211,7 +211,8 @@ def __get_tables(self, page: dict) -> List[ScanTable]:
 
             try:
                 cells = self.table_extractor.handle_cells(cells)
-                scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order))
+                scan_tables.append(ScanTable(page_number=page_number, cells=cells, bbox=table_bbox, order=order,
+                                             page_width=page_width, page_height=page_height))
             except Exception as ex:
                 self.logger.warning(f"Warning: unrecognized table on page {page_number}. {ex}")
                 if self.config.get("debug_mode", False):

diff --git a/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py b/dedoc/readers/pdf_reader/pdf_txtlayer_reader/pdf_txtlayer_reader.py
@@ -86,6 +86,7 @@ def _move_table_cells(self, tables: List[ScanTable], page_shift: BBox, page: Tup
             shift_x, shift_y = page_shift.x_top_left, page_shift.y_top_left  # shift tables to original coordinates
             for location in table.locations:
                 location.bbox.shift(shift_x=shift_x, shift_y=shift_y)
+                location.page_height, location.page_width = image_height, image_width
             for row in table.cells:
                 for cell in row:
                     cell.shift(shift_x=shift_x, shift_y=shift_y, image_width=image_width, image_height=image_height)

diff --git a/tests/api_tests/test_api_module_table_recognizer.py b/tests/api_tests/test_api_module_table_recognizer.py
@@ -207,7 +207,7 @@ def test_detect_small_table(self) -> None:
 
     def test_multipage_gost_table_image(self) -> None:
         file_name = "gost_multipage_table.pdf"
-        result = self._send_request(file_name, data={"need_gost_frame_analysis": "True"})  # don't pass pdf_with_text_layer to check condition in PDFBaseReader
+        result = self._send_request(file_name, data={"need_gost_frame_analysis": "True", "pdf_with_text_layer": "false"})
         self.assertTrue(len(result["content"]["tables"][0]["cells"]) > 35)
         target_bbox_dict = {
             "x_top_left": 0.14,
@@ -311,6 +311,11 @@ def test_multipage_tables_1(self) -> None:
         self.assertEqual(2, len(result["content"]["tables"]))
 
     def test_multipage_tables_2(self) -> None:
+        result = self._send_request("MIPS64.pdf", data=dict(language="rus+eng", pages="78:79", pdf_with_text_layer="false"))
+
+        self.assertEqual(2, len(result["content"]["tables"]))
+
+    def test_multipage_tables_3(self) -> None:
         result = self._send_request("MIPS64.pdf", data=dict(language="rus+eng", pages="394:395"))
 
         self.assertEqual(2, len(result["content"]["tables"]))