Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ dedoc/version.py

# Distribution / packaging
.Python
etc/
env/
build/
develop-eggs/
Expand Down
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@ ENV RESOURCES_PATH "/dedoc_root/resources"
COPY requirements.txt .
RUN pip3 install --no-cache-dir -r requirements.txt
RUN apt-get update && apt-get install -y --fix-missing --no-install-recommends fontforge
RUN apt install -y libutf8proc-dev
RUN ln -s /usr/lib/x86_64-linux-gnu/libutf8proc.so /usr/lib/libutf8proc.so.1

RUN mkdir /dedoc_root
RUN mkdir /dedoc_root/dedoc
Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
2.6
2.6.1
2 changes: 1 addition & 1 deletion dedoc/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,7 @@
eml_like_format={".eml"},
mhtml_like_format={".mhtml", ".mht", ".mhtml.gz", ".mht.gz"},
archive_like_format={".zip", ".tar", ".tar.gz", ".rar", ".7z"},
image_like_format={".png"},
image_like_format={".png", ".jpg", ".jpeg", ".tiff", ".tif"},
pdf_like_format={".pdf"},
csv_like_format={".csv", ".tsv"},
txt_like_format={".txt", ".txt.gz"},
Expand Down
21 changes: 13 additions & 8 deletions dedoc/readers/archive_reader/archive_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,14 +85,19 @@ def __read_rar_archive(self, path: str, tmp_dir: str, need_content_analysis: boo
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)

def __read_7z_archive(self, path: str, tmp_dir: str, need_content_analysis: bool) -> Iterator[AttachedFile]:
import py7zlib

with open(path, "rb") as content:
arch_file = py7zlib.Archive7z(content)
names = arch_file.getnames()
for name in names:
file = arch_file.getmember(name)
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=name, file=file, need_content_analysis=need_content_analysis)
import os
import py7zr
import tempfile

with tempfile.TemporaryDirectory() as tmpdir:
with py7zr.SevenZipFile(path, "r") as arch_file:
arch_file.extractall(tmpdir)

for dir_path, _, file_names in os.walk(tmpdir):
for file_name in file_names:
file_path = os.path.join(dir_path, file_name)
with open(file_path, "rb") as file:
yield self.__save_archive_file(tmp_dir=tmp_dir, file_name=file_name, file=file, need_content_analysis=need_content_analysis)

def __save_archive_file(self, tmp_dir: str, file_name: str, file: IO[bytes], need_content_analysis: bool) -> AttachedFile:
import os
Expand Down
5 changes: 4 additions & 1 deletion dedoc/readers/docx_reader/numbering_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,10 @@ def parse(self, xml: Tag, paragraph_properties: BaseProperties, run_properties:
else:
ilvl = ilvl["w:val"]

lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
try:
lvl_info: LevelInfo = self.num_dict[num_id].level_number2level_info[ilvl]
except KeyError:
return
text = self.__get_list_item_text(ilvl, num_id)

# change style of the paragraph/run: style -> pPr -> rPr
Expand Down
2 changes: 1 addition & 1 deletion dedoc/readers/docx_reader/styles_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ def __get_styles_hierarchy(self, style: Tag, style_id: str, style_type: StyleTyp

styles = [style]
current_style = style
while current_style.basedOn:
while current_style and current_style.basedOn:
try:
parent_style_id = current_style.basedOn["w:val"]
current_style = self.__find_style(parent_style_id, style_type)
Expand Down
9 changes: 5 additions & 4 deletions dedoc/readers/pdf_reader/data_classes/tables/table_type.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
class TableTypeAdditionalOptions:
"""
Setting up the table recognizer. The value of the parameter specifies the type of tables recognized when processed by
Enum of table types for the table recognizer.
The value of the parameter specifies the type of tables recognized when processed by
class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer`.

* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds;
* Parameter `table_type=wo_external_bounds` - recognize tables without external bounds.

Example of a table of type `wo_external_bounds`::

Expand All @@ -16,7 +17,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
text | text | text


* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table;
* Parameter `table_type=one_cell_table` - if a document contains a bounding box with text, it will be considered a table.

Example of a page with a table of type `one_cell_table`::

Expand All @@ -27,7 +28,7 @@ class :class:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_
+------+
________________________

* Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table;
* Parameter `table_type=split_last_column` - specified parameter for the merged last column of the table.

Example of a table of type `split_last_column`::

Expand Down
3 changes: 2 additions & 1 deletion dedoc/readers/pdf_reader/pdf_base_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from dedoc.readers.pdf_reader.data_classes.line_with_location import LineWithLocation
from dedoc.readers.pdf_reader.data_classes.pdf_image_attachment import PdfImageAttachment
from dedoc.readers.pdf_reader.data_classes.tables.scantable import ScanTable
from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector


ParametersForParseDoc = namedtuple("ParametersForParseDoc", [
"is_one_column_document",
Expand Down Expand Up @@ -44,6 +44,7 @@ def __init__(self, *, config: Optional[dict] = None, recognized_extensions: Opti
from dedoc.readers.pdf_reader.pdf_image_reader.paragraph_extractor.scan_paragraph_classifier_extractor import ScanParagraphClassifierExtractor
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.gost_frame_recognizer import GOSTFrameRecognizer
from dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer import TableRecognizer
from dedoc.readers.pdf_reader.utils.header_footers_analysis import HeaderFooterDetector
from dedoc.readers.pdf_reader.utils.line_object_linker import LineObjectLinker
from dedoc.attachments_extractors.concrete_attachments_extractors.pdf_attachments_extractor import PDFAttachmentsExtractor

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,15 +21,13 @@

class TableRecognizer:
"""
The class recognizes tables from document images. This class is internal to the system. It is called from readers such as .

* The class recognizes tables with borders from the document image and returns the class
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`);


* The class also analyzes recognized single-page tables and combines them into multi-page ones
(function :meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`);
The class recognizes tables from document images. This class is internal to the system.
It is called from readers such as :class:`dedoc.readers.PdfTxtlayerReader` or :class:`dedoc.readers.PdfImageReader`.

* The class recognizes tables with borders from the document image using
:meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.recognize_tables_from_image`;
* The class also analyzes recognized single-page tables and combines them into multi-page ones using
:meth:`~dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer.convert_to_multipages_tables`
"""

def __init__(self, *, config: dict = None) -> None:
Expand Down
16 changes: 8 additions & 8 deletions dedoc/readers/pdf_reader/utils/header_footers_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,18 +16,18 @@ class HeaderFooterDetector:
`Lin X. Header and footer extraction by page association //Document Recognition and Retrieval X. – SPIE, 2003. – Т. 5010. – С. 164-171.`

Algorithm's notes:
1) For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers.
For documents of less than 6 pages, lines between adjacent pages (between even or odd pages) are compared.
Therefore, alternating footers-headers will not be detected on documents of less than 6 pages.

2) The algorithm analyzes the first 4 and last 4 lines on each page of the document and,
by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity.
1. For documents of 6 pages or more, lines on even and odd pages of the document are compared to detect alternating footers-headers.
For documents of fewer than 6 pages, lines between adjacent pages (between even or odd pages) are compared.
Therefore, alternating footers-headers will not be detected on documents of fewer than 6 pages.

3) For the algorithm to work, the document must have at least two pages of text.
It is not an ML algorithm it cannot work with just one page.
2. The algorithm analyzes the first 4 and last 4 lines on each page of the document and,
by comparing lines across pages, identifies common footer-header patterns using Levenshtein similarity.

4) The more pages the better. Remember the parameter `pages` limits the number of pages in a document.
3. For the algorithm to work, the document must have at least two pages of text.
It is not an ML algorithm so it cannot work with just one page.

4. The more pages, the better. Remember that the parameter `pages` limits the number of pages in a document.
"""

def __init__(self) -> None:
Expand Down
29 changes: 17 additions & 12 deletions docs/source/changelog.rst
Original file line number Diff line number Diff line change
@@ -1,28 +1,33 @@
Changelog
=========

v2.6.1 (2025-12-16)
-------------------
Release note: `v2.6.1 <https://github.com/ispras/dedoc/releases/tag/v2.6.1>`_

* Fixed some bugs in `DocxReader`.
* Replaced the outdated `pylzma` dependency with `py7zr`.

v2.6 (2025-09-19)
-----------------
Release note: `v2.6 <https://github.com/ispras/dedoc/releases/tag/v2.6>`_

* improve table merge algorithm (added check on table layout) `MultiPageTableExtractor`.
* refactoring table merge `MultiPageTableExtractor`.
* improve header footer analysis `HeaderFooterDetector`.
* added header footer analysis support in Tabby.
* added header footer analysis info (parameter `need_header_footer_analysis`) in documentation (readthedocs).
* update to python3.10.
* update to ubuntu22.04.
* added `Contributing Information` (project rules, how build, how develop) in documentation (readthedocs).

* Improved table merge algorithm (added check on table layout) `MultiPageTableExtractor`.
* Improved header footer analysis `HeaderFooterDetector`.
* Added header footer analysis support in `PdfTabbyReader`.
* Added header footer analysis info (parameter `need_header_footer_analysis`) in documentation.
* Updated to python3.10.
* Updated to ubuntu22.04.
* Added `Support and Contributing` (project rules, how to build, how to develop) in documentation.

v2.5 (2025-09-05)
-----------------
Release note: `v2.5 <https://github.com/ispras/dedoc/releases/tag/v2.5>`_

* Added simple multilingual textual layer correctness classification based on letter percentage calculation (`textual_layer_classifier=letter`).
* Added a new parameter `textual_layer_classifier = [simple, ml (default), letter]`.
* Remove parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`.
* Fix bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect).
* Some refactoring `TableRecognition`.
* Removed parameter `fast_textual_layer_detection`. Now it is a `textual_layer_classifier=simple`.
* Fixed bug with `table_type=table_wo_external_bounds` (fixed cv2.BoundingRect).
* Added parameter `table_type` and `TableRecognition` info into documentation.

v2.4 (2025-07-28)
Expand Down
9 changes: 4 additions & 5 deletions docs/source/contributing/check_documentation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,11 @@ Check documentation

pip install .[docs]

2. Documentation files should be located in the `docs/ <https://github.com/ispras/dedoc/blob/master/docs>`_ directory,
which must contain the `docs/source/conf.py <https://github.com/ispras/dedoc/blob/master/docs/source/conf.py>`_ (build settings)
and `docs/source/index.rst <https://github.com/ispras/dedoc/blob/master/docs/source/index.rst>`_ (documentation main page) files.

3. Build documentation into HTML pages is done as follows:
2. Documentation files should be located in the `docs/ <https://github.com/ispras/dedoc/blob/master/docs>`_ directory.
Building the documentation into HTML pages is done as follows:

.. code-block:: bash

python -m sphinx -T -E -W -b html -d docs/_build/doctrees -D language=en docs/source docs/_build

3. After building, the documentation can be checked locally: open the main built page ``docs/_build/index.html`` in a browser.
24 changes: 12 additions & 12 deletions docs/source/contributing/contributing.rst
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@ Support and Contributing

Support
-------
If you are stuck with a problem using Dedoc, please do get in touch at our `Issues <https://github.com/ispras/dedoc/issues>`_ (recommend)
If you are stuck with a problem using Dedoc, please use our `Issues <https://github.com/ispras/dedoc/issues>`_ (recommended)
or `Dedoc Chat <https://t.me/dedoc_chat>`_. The developers are willing to help.

You can save time by following this procedure when reporting a problem:

* Do try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation.
* Try to solve the problem on your own first. Read the documentation, including using the search feature, index and reference documentation.

* Search the issue archives to see if someone else already had the same problem.

Expand All @@ -23,7 +23,9 @@ Contributing Rules

* To add new features to the project repository yourself, you should follow
the `general contributing rules of github <https://github.com/firstcontributions/first-contributions>`_.
In your Pull Request, set `develop` as the target branch.

.. note::
In your Pull Request, set `develop` as the target branch.

* We recommend using `Pycharm IDE` and `virtualenv` package for development.

Expand All @@ -34,28 +36,26 @@ Contributing Rules
* We strongly recommend using the already used ML library `torch` in development. For example,
using `tensorflow` library instead of `torch` is justified only in case of extreme necessity.

* If you add new functionality to dedoc, be sure to add python `unitests` to test the added functionality
(you can add api tests in `tests/api_tests <https://github.com/ispras/dedoc/blob/master/tests/api_tests>`_,
you can add unit tests in `tests/unit_tests <https://github.com/ispras/dedoc/blob/master/tests/unit_tests>`_).
* If you add new functionality to dedoc, be sure to add python `unittest` to test the added functionality
(you can add api tests in `tests/api_tests <https://github.com/ispras/dedoc/blob/master/tests/api_tests>`_
or unit tests in `tests/unit_tests <https://github.com/ispras/dedoc/blob/master/tests/unit_tests>`_).
These tests are run automatically in the Continuous Integration pipeline.
To run tests locally, you can use docker as described in the `README <https://github.com/ispras/dedoc/blob/master/README.md#4-run-container-with-tests>`_.

* Before each commit, check the code style using the automatic checker using the `flake8` library.
Instructions for using flake8 are provided here :ref:using_flake8`.
Instructions for using flake8 are provided in :ref:`using_flake8`.

* We recommend setting up pre-commit for convenience and speeding up development according to the instructions in :ref:`using_precommit`.
This will run a style check of the changed code with each commit.
This will run a style check of the changed code before each commit.

* In case of any change in the online documentation of the project (for example, when adding a new api parameter),
be sure to check locally that the changed documentation is successfully built and looks as expected.
Building online documentation using `sphinx` is described in :ref:`check_documentation`.

.. toctree::
:maxdepth: 1
:hidden:

using_flake8
using_precommit
check_documentation




10 changes: 10 additions & 0 deletions docs/source/dedoc_api_usage/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,16 @@ Api parameters description
- false
- This option is used to **remove** headers and footers of PDF documents from the output result.
If ``need_header_footer_analysis=false``, header and footer lines will be present in the output as well as all other document lines.
The algorithm is implemented and described in the class :class:`~dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector`.

* - table_type
- "", wo_external_bounds, one_cell_table, split_last_column and their combinations
- ""
- Setting up the table recognition method. This option is used for PDF documents which are images with text (PDF without a textual layer).
It is also used for PDF documents when ``pdf_with_text_layer`` is ``true``, ``false``, ``auto`` or ``auto_tabby``.
The value of the parameter specifies the type of tables for recognition,
supported table types are described in :class:`~dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions`.
You can use a combination of values (for example, ``wo_external_bounds+one_cell_table``).

* - need_binarization
- true, false
Expand Down
2 changes: 1 addition & 1 deletion docs/source/getting_started/installation.rst
Original file line number Diff line number Diff line change
Expand Up @@ -182,4 +182,4 @@ For ``python3.9``:
.. code-block:: bash

pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torch-1.11.0a0+git137096a-cp39-cp39-linux_x86_64.whl
pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl
pip install https://github.com/ispras/dedockerfiles/raw/master/wheels/torchvision-0.12.0a0%2B9b5a3fe-cp39-cp39-linux_x86_64.whl
1 change: 1 addition & 0 deletions docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -278,6 +278,7 @@ This type of structure is configurable (see :ref:`using_patterns`).
modules/metadata_extractors
modules/structure_extractors
modules/structure_constructors
modules/pdf_parsing


.. toctree::
Expand Down
12 changes: 0 additions & 12 deletions docs/source/modules/manager.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,15 +10,3 @@ Dedoc pipeline
.. autoclass:: dedoc.attachments_handler.AttachmentsHandler
:special-members: __init__
:members:

.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer
:show-inheritance:
:members:

.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions
:show-inheritance:
:members:

.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector
:show-inheritance:
:members:
14 changes: 14 additions & 0 deletions docs/source/modules/pdf_parsing.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
.. _pdf_parsing:

Auxiliary data structures for PDF and images parsing
====================================================


.. autoclass:: dedoc.readers.pdf_reader.pdf_image_reader.table_recognizer.table_recognizer.TableRecognizer
:members:

.. autoclass:: dedoc.readers.pdf_reader.data_classes.tables.table_type.TableTypeAdditionalOptions
:members:

.. autoclass:: dedoc.readers.pdf_reader.utils.header_footers_analysis.HeaderFooterDetector
:members:
Loading