diff --git a/dedoc/api/web/index.html b/dedoc/api/web/index.html index 7e18eff2..85473f06 100644 --- a/dedoc/api/web/index.html +++ b/dedoc/api/web/index.html @@ -13,6 +13,7 @@ details > summary {font-style: italic; cursor: pointer; display: list-item;} .child.max {padding-left: 5px; flex: 1} .parent {display: flex} + details { padding-left: 24px;} @@ -142,15 +143,18 @@

PDF handling

-
need_pdf_table_analysis +
need_pdf_table_analysis, table_type

+

+ +

- +

diff --git a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py index 9ae91c18..1dbc7d99 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/scantable.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/scantable.py @@ -10,6 +10,10 @@ class ScanTable(Table): + """ + Utility class for storing recognized tables from document images. The class + :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer` works with this class. + """ def __init__(self, page_number: int, cells: List[List[CellWithMeta]], bbox: BBox, order: int = -1) -> None: super().__init__(cells, TableMetadata(page_id=page_number)) diff --git a/dedoc/readers/pdf_reader/data_classes/tables/table_type.py b/dedoc/readers/pdf_reader/data_classes/tables/table_type.py index 31ceb4f0..99c64b27 100644 --- a/dedoc/readers/pdf_reader/data_classes/tables/table_type.py +++ b/dedoc/readers/pdf_reader/data_classes/tables/table_type.py @@ -1,4 +1,61 @@ class TableTypeAdditionalOptions: + """ + Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by + class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`. 
+ + * Parameter `table_type=wo_external_bounds` - recognize tables without external bounds; + + Example of a table of type `wo_external_bounds`:: + + text | text | text + --------+------+------ + text | text | text + --------+------+------ + text | text | text + --------+------+------ + text | text | text + + + * Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table; + + Example of a page with a table of type `one_cell_table`:: + + _________________________ + Header of document + text text text +------+ + text | text | <--- it is a table + +------+ + ________________________ + + * Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table; + + Example of a table of type `split_last_column`:: + + +--------+------+-------+ + | text | text | text1 | + +--------+------+ | + | text0 | text | text2 | + | | -----| | + | | text | text3 | + +--------+------+ | + | text | text | text4 | + +--------+------+-------+ + | + Recognition + | + V + +--------+------+-------+ + | text | text | text1 | + +--------+------+-------| + | text0 | text | text2 | + |--------+ -----+------ | + | text0 | text | text3 | + +--------+------+------ | + | text | text | text4 | + +--------+------+-------+ + + """ + def __init__(self) -> None: self.table_wo_external_bounds = "wo_external_bounds" self.detect_one_cell_table = "one_cell_table" diff --git a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py index e60fa141..7bcc7cc6 100644 --- a/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py +++ b/dedoc/readers/pdf_reader/pdf_image_reader/table_recognizer/table_recognizer.py @@ -19,7 +19,18 @@ """-------------------------------------entry class of Table Recognizer Module---------------------------------------""" -class TableRecognizer(object): +class 
TableRecognizer: + """ + The class recognizes tables from document images. This class is internal to the system. It is called from readers such as :class:`dedoc.readers.PdfImageReader` and :class:`dedoc.readers.PdfTxtlayerReader`. + + * The class recognizes tables with borders from the document image and returns them as :class:`~dedoc.readers.pdf_reader.data_classes.tables.scantable.ScanTable` objects + (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`); + + + * The class also analyzes recognized single-page tables and combines them into multi-page ones + (function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`); + + """ def __init__(self, *, config: dict = None) -> None: self.logger = config.get("logger", logging.getLogger()) @@ -29,10 +40,20 @@ def __init__(self, *, config: dict = None) -> None: self.table_type = TableTypeAdditionalOptions() def convert_to_multipages_tables(self, all_single_tables: List[ScanTable], lines_with_meta: List[LineWithMeta]) -> List[ScanTable]: + """ + The function analyzes recognized tables from the entire document (all pages) to see if they are multi-page. + If single-page tables are part of one multi-page table, they are combined into one multi-page table. + """ multipage_tables = self.multipage_tables_extractor.extract_multipage_tables(single_tables=all_single_tables, lines_with_meta=lines_with_meta) return multipage_tables def recognize_tables_from_image(self, image: np.ndarray, page_number: int, language: str, table_type: str = "") -> Tuple[np.ndarray, List[ScanTable]]: + """ + The function recognizes tables with borders from a scanned document image. + Here, the contour analysis method is used to determine the boundaries of table cells. + Then, a set of heuristics is used to detect tables, and finally, + the detected table cells are converted to a matrix form (merged cells are detected and separated). 
+ """ self.logger.debug(f"Page {page_number}") try: cleaned_image, scan_tables = self.__rec_tables_from_img(image, page_num=page_number, language=language, table_type=table_type) diff --git a/docs/source/modules/data_structures.rst b/docs/source/modules/data_structures.rst index d6785ac7..d846c281 100644 --- a/docs/source/modules/data_structures.rst +++ b/docs/source/modules/data_structures.rst @@ -76,6 +76,11 @@ Helper classes .. autoclass:: dedoc.data_structures.AttachedFile :members: + +.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.scantable.ScanTable + :show-inheritance: + :members: + .. _annotations: Annotations of the text lines diff --git a/docs/source/modules/manager.rst b/docs/source/modules/manager.rst index 53a5cb26..68e5a43c 100644 --- a/docs/source/modules/manager.rst +++ b/docs/source/modules/manager.rst @@ -10,3 +10,11 @@ Dedoc pipeline .. autoclass:: dedoc.attachments_handler.AttachmentsHandler :special-members: __init__ :members: + +.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer + :show-inheritance: + :members: + +.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions + :show-inheritance: + :members: \ No newline at end of file diff --git a/docs/source/parameters/pdf_handling.rst b/docs/source/parameters/pdf_handling.rst index 11bf1348..c3f73396 100644 --- a/docs/source/parameters/pdf_handling.rst +++ b/docs/source/parameters/pdf_handling.rst @@ -169,6 +169,18 @@ PDF and images handling If the document has a textual layer, it is recommended to use :class:`dedoc.readers.PdfTabbyReader`, in this case tables will be parsed much easier and faster. 
+ * - table_type + - "", wo_external_bounds, one_cell_table, split_last_column and their combinations + - "" + - * :meth:`dedoc.DedocManager.parse` + * :meth:`dedoc.readers.PdfAutoReader.read`, :meth:`dedoc.readers.PdfTxtlayerReader.read`, :meth:`dedoc.readers.PdfImageReader.read` + * :meth:`dedoc.readers.ReaderComposition.read` + - Setting up the table recognition method. The table recognition method is used in :class:`dedoc.readers.PdfImageReader` and + :class:`dedoc.readers.PdfTxtlayerReader`. The value of the parameter specifies the type of tables recognized when processed by + class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`. More details about each parameter value + are disclosed in the class :class:`dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions` description. + You can use a combination of values (for example, `wo_external_bounds+one_cell_table`). + * - need_gost_frame_analysis - True, False - False