Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions dedoc/readers/docx_reader/data_structures/docx_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,8 @@
from dedoc.data_structures.line_with_meta import LineWithMeta
from dedoc.readers.docx_reader.data_structures.table import DocxTable
from dedoc.readers.docx_reader.data_structures.utils import Counter, ParagraphMaker
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
from dedoc.readers.docx_reader.note_extractor import NoteExtractor
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
from dedoc.utils.office_utils import get_bs_from_zip
Expand Down Expand Up @@ -47,8 +47,9 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
path_hash=calculate_file_hash(path=self.path),
styles_extractor=styles_extractor,
numbering_extractor=numbering_extractor,
footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
footnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
endnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote"),
comment_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/comments.xml"), key="comment")
)

def __get_lines(self) -> List[LineWithMeta]:
Expand Down
25 changes: 13 additions & 12 deletions dedoc/readers/docx_reader/data_structures/paragraph.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
from dedoc.readers.docx_reader.data_structures.run import Run
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
from dedoc.readers.docx_reader.note_extractor import NoteExtractor
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties
from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor
Expand All @@ -16,8 +16,9 @@ def __init__(self,
xml: Tag,
styles_extractor: StylesExtractor,
numbering_extractor: NumberingExtractor,
footnote_extractor: FootnoteExtractor,
endnote_extractor: FootnoteExtractor,
footnote_extractor: NoteExtractor,
endnote_extractor: NoteExtractor,
comment_extractor: NoteExtractor,
uid: str) -> None:
"""
Contains information about paragraph properties.
Expand All @@ -30,9 +31,10 @@ def __init__(self,
self.xml = xml
self.footnote_extractor = footnote_extractor
self.endnote_extractor = endnote_extractor
self.comment_extractor = comment_extractor
self.numbering_extractor = numbering_extractor
self.styles_extractor = styles_extractor
self.footnotes = []
self.notes = []
self.runs = []
self.runs_ids = [] # list of (start, end) inside the paragraph text
self.text = ""
Expand Down Expand Up @@ -85,12 +87,8 @@ def __parse(self) -> None:
if hasattr(self, "caps") and self.caps:
self.text = self.text.upper()

for key, extractor in [("w:footnoteReference", self.footnote_extractor), ("w:endnoteReference", self.endnote_extractor)]:
notes = self.xml.find_all(key)
for footnote in notes:
note_id = footnote.get("w:id")
if note_id in extractor.id2footnote:
self.footnotes.append(extractor.id2footnote[note_id])
for extractor in [self.footnote_extractor, self.endnote_extractor]:
self.notes.extend(extractor.get_notes(self.xml))

def __get_numbering_formatting(self) -> Optional[Run]:
"""
Expand All @@ -99,7 +97,7 @@ def __get_numbering_formatting(self) -> Optional[Run]:
:returns: numbering run if there is the text in numbering else None
"""
if self.xml.numPr and self.numbering_extractor:
numbering_run = Run(self, self.styles_extractor)
numbering_run = Run(self, self.styles_extractor, self.comment_extractor)
self.numbering_extractor.parse(self.xml.numPr, self, numbering_run)

if numbering_run.text:
Expand All @@ -115,7 +113,7 @@ def __make_run_list(self) -> None:
run_list = self.xml.find_all("w:r")

for run_tree in run_list:
new_run = Run(self, self.styles_extractor)
new_run = Run(self, self.styles_extractor, self.comment_extractor)

if run_tree.rStyle:
self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER)
Expand All @@ -126,6 +124,9 @@ def __make_run_list(self) -> None:
change_run_properties(new_run, run_tree.rPr)
new_run.get_text(run_tree)
if not new_run.text:
if new_run.linked_text and self.runs:
prev_linked_text = self.runs[-1].linked_text
self.runs[-1].linked_text = new_run.linked_text if not prev_linked_text else f"{prev_linked_text}; {new_run.linked_text}"
continue

if self.runs and self.runs[-1] == new_run:
Expand Down
13 changes: 11 additions & 2 deletions dedoc/readers/docx_reader/data_structures/run.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,36 @@
from bs4 import Tag

from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
from dedoc.readers.docx_reader.note_extractor import NoteExtractor
from dedoc.readers.docx_reader.properties_extractor import change_caps


class Run(BaseProperties):

def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor") -> None: # noqa
def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor", comment_extractor: Optional[NoteExtractor] = None) -> None: # noqa
    """
    Create a run with empty text and empty linked text.

    :param properties: Paragraph or Run for copying its properties (passed to the BaseProperties constructor)
    :param styles_extractor: StylesExtractor
    :param comment_extractor: NoteExtractor for comments, may be omitted
    """
    self.name2char = {"tab": "\t", "br": "\n", "cr": "\r"}
    self.text, self.linked_text = "", ""
    self.styles_extractor = styles_extractor
    self.comment_extractor = comment_extractor
    # the base constructor is called last, after the run's own attributes are set
    super().__init__(properties)

def get_text(self, xml: Tag) -> None:
"""
Makes the text of run.
:param xml: BeautifulSoup tree with run properties
"""
notes = self.comment_extractor.get_notes(xml) if self.comment_extractor else None
if notes:
self.linked_text = "; ".join(notes)

for tag in xml:
tag_name = tag.name

Expand Down Expand Up @@ -56,4 +64,5 @@ def __eq__(self, other: "Run") -> bool:
size_eq = self.size == other.size
font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined
script_eq = self.superscript == other.superscript and self.subscript == other.subscript
return size_eq and font_eq and script_eq
linked_text_eq = self.linked_text == other.linked_text
return size_eq and font_eq and script_eq and linked_text_eq
9 changes: 6 additions & 3 deletions dedoc/readers/docx_reader/data_structures/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from bs4 import Tag

from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
from dedoc.readers.docx_reader.note_extractor import NoteExtractor
from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
from dedoc.readers.docx_reader.styles_extractor import StylesExtractor

Expand Down Expand Up @@ -35,14 +35,16 @@ def __init__(self,
counter: Counter,
styles_extractor: StylesExtractor,
numbering_extractor: NumberingExtractor,
footnote_extractor: FootnoteExtractor,
endnote_extractor: FootnoteExtractor) -> None:
footnote_extractor: NoteExtractor,
endnote_extractor: NoteExtractor,
comment_extractor: NoteExtractor) -> None:
self.counter = counter
self.path_hash = path_hash
self.styles_extractor = styles_extractor
self.numbering_extractor = numbering_extractor
self.footnote_extractor = footnote_extractor
self.endnote_extractor = endnote_extractor
self.comment_extractor = comment_extractor
self.uids_set = set()

def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> Paragraph:
Expand All @@ -52,6 +54,7 @@ def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) ->
numbering_extractor=self.numbering_extractor,
footnote_extractor=self.footnote_extractor,
endnote_extractor=self.endnote_extractor,
comment_extractor=self.comment_extractor,
uid=uid)
prev_paragraph = None if len(paragraph_list) == 0 else paragraph_list[-1]
paragraph.spacing = paragraph.spacing_before if prev_paragraph is None else max(prev_paragraph.spacing_after, paragraph.spacing_before)
Expand Down
21 changes: 0 additions & 21 deletions dedoc/readers/docx_reader/footnote_extractor.py

This file was deleted.

10 changes: 6 additions & 4 deletions dedoc/readers/docx_reader/line_with_meta_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ def __init__(self, paragraph: Paragraph, paragraph_id: int) -> None:
Converts custom DOCX Paragraph to LineWithMeta class.
:param paragraph: Paragraph for converting its properties to the unified representation.
"""
annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
annotations = [
BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation, LinkedTextAnnotation
]
self.dict2annotation = {annotation.name: annotation for annotation in annotations}
self.annotation_merger = AnnotationMerger()

Expand All @@ -37,8 +39,8 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
AlignmentAnnotation(start=0, end=len(paragraph.text), value=paragraph.jc),
SpacingAnnotation(start=0, end=len(paragraph.text), value=str(paragraph.spacing))
]
for footnote in paragraph.footnotes:
annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=footnote))
for note in paragraph.notes:
annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=note))

if paragraph.style_name is not None:
annotations.append(StyleAnnotation(start=0, end=len(paragraph.text), value=paragraph.style_name))
Expand All @@ -47,7 +49,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:

for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids):
annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2)))
for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]:
for property_name in self.dict2annotation:
property_value = getattr(run, property_name)
if property_value:
annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value)))
Expand Down
33 changes: 33 additions & 0 deletions dedoc/readers/docx_reader/note_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from typing import Dict, List, Optional

from bs4 import BeautifulSoup, Tag


class NoteExtractor:
    """
    Stores the texts of DOCX notes (footnotes, endnotes or comments) by their identifiers
    and finds the notes that are referenced from a given XML fragment.
    """

    def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None:
        """
        :param xml: BeautifulSoup tree with notes (e.g. the content of word/footnotes.xml), may be None - then no notes are stored
        :param key: footnote, endnote or comment
        """
        self.key = key
        self.id2note: Dict[str, str] = {}
        if not xml:
            return

        for note in xml.find_all(f"w:{key}"):
            note_id = note.get("w:id")
            note_text = " ".join(t.text for t in note.find_all("w:t") if t.text)
            # The emptiness check goes before the author is prepended:
            # otherwise a note with an author but without any text would be saved as "author: "
            if not note_id or not note_text:
                continue

            author = note.get("w:author")
            self.id2note[note_id] = f"{author}: {note_text}" if author else note_text

    def get_notes(self, xml: Tag) -> List[str]:
        """
        :param xml: BeautifulSoup tag that may contain references to notes (tags w:<key>Reference)
        :return: texts of the stored notes referenced inside xml, in the order of the references; unknown identifiers are skipped
        """
        note_ids = (reference.get("w:id") for reference in xml.find_all(f"w:{self.key}Reference"))
        return [self.id2note[note_id] for note_id in note_ids if note_id in self.id2note]
10 changes: 8 additions & 2 deletions dedoc/readers/excel_reader/excel_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from xlrd.sheet import Sheet

from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
from dedoc.data_structures.table import Table
from dedoc.data_structures.unstructured_document import UnstructuredDocument
from dedoc.readers.base_reader import BaseReader
Expand Down Expand Up @@ -54,8 +55,13 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
for row_id in range(n_rows):
row = []
for col_id in range(n_cols):
value = str(sheet.cell_value(rowx=row_id, colx=col_id))
row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
cell_text = str(sheet.cell_value(rowx=row_id, colx=col_id))
if (row_id, col_id) in sheet.cell_note_map:
note_text = sheet.cell_note_map[(row_id, col_id)].text.replace("\n", " ")
annotations = [LinkedTextAnnotation(start=0, end=len(cell_text), value=note_text)]
else:
annotations = []
row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=sheet_id, line_id=0), annotations=annotations)]))
res.append(row)
metadata = TableMetadata(page_id=sheet_id)
return Table(cells=res, metadata=metadata)
2 changes: 1 addition & 1 deletion docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,6 @@ services:
is_test: $test

grobid:
image: "lfoppiano/grobid:0.8.0"
image: "grobid/grobid:0.8.2"
ports:
- 8070:8070
12 changes: 6 additions & 6 deletions tests/api_tests/test_api_doctype_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,12 @@ def test_article(self) -> None:

# check bibliography list
self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"])
self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))
self.assertEqual(64, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))

# check bib_item 1 recognizing
self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"])
self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"])
self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
self.assertEqual("title_journal", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"])
self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"]) # author 1
self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"])
Expand All @@ -55,12 +55,12 @@ def test_article(self) -> None:
self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"])

# check cite on bib_item
bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"] # checking on [58] references
bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"]
section = self._get_by_tree_path(tree, "0.4.0")
bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid]
# We must found two refs [58] in Introduction section
self.assertEqual(len(bibliography_refs_in_text), 2)
self.assertEqual(["58,", "58,"], [section["text"][bibliography_refs_in_text[n]["start"]:bibliography_refs_in_text[n]["end"]] for n in range(2)])
# We must find ref [59] in the Introduction section
self.assertEqual(len(bibliography_refs_in_text), 1)
self.assertEqual("59]", section["text"][bibliography_refs_in_text[0]["start"]:bibliography_refs_in_text[0]["end"]])

# check tables
self.assertEqual(len(result["content"]["tables"]), 2)
Expand Down
20 changes: 19 additions & 1 deletion tests/api_tests/test_api_format_docx.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,25 @@ def test_not_stripped_xml(self) -> None:
self._send_request("not_stripped_xml.docx", expected_code=200)

def test_docx_with_comments(self) -> None:
_ = self._send_request("with_comments.docx", expected_code=200)
content = self._send_request("with_comments.docx")["content"]
structure = content["structure"]

node = get_by_tree_path(structure, "0.0.0")
annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"]
self.assertEqual(len(annotations), 2)
self.assertIn("Interesting entity type", annotations[0]["value"])
self.assertIn("Some reply", annotations[0]["value"])
self.assertIn("New comment", annotations[1]["value"])

node = get_by_tree_path(structure, "0.0.1.6")
annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"]
self.assertEqual(len(annotations), 1)
self.assertIn("Примечание об организации", annotations[0]["value"])

cell_node = content["tables"][1]["cells"][1][0]["lines"][0]
annotations = [ann for ann in cell_node["annotations"] if ann["name"] == "linked_text"]
self.assertEqual(len(annotations), 1)
self.assertIn("Примечание о методе LSTM", annotations[0]["value"])

def test_return_html(self) -> None:
file_name = "example.doc"
Expand Down
16 changes: 16 additions & 0 deletions tests/api_tests/test_api_format_excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,22 @@ def test_xls(self) -> None:
tables = result["content"]["tables"]
self.__check_content(tables)

def test_xlsx_comments(self) -> None:
    """
    Check that cell comments of with_comments.xlsx are returned as linked_text annotations of the table cells.
    """
    tables = self._send_request("with_comments.xlsx")["content"]["tables"]

    # (table_id, row_id, col_id, expected part of the comment text)
    expected_comments = [
        (0, 1, 2, "Примечание что телефон указан не верно"),
        (0, 4, 0, "Заметка об организации"),
        (1, 0, 1, "Неточное название столбца"),
        (1, 5, 0, "Примечание о персоне Иванов Сергей"),
        (1, 9, 2, "Номер телефона под вопросом"),
    ]
    for table_id, row_id, col_id, comment_text in expected_comments:
        self.__check_cell_comment(tables, table_id, row_id, col_id, comment_text)

def __check_cell_comment(self, tables: dict, table_id: int, row_id: int, col_id: int, text: str) -> None:
    """
    Assert that the first line of the given cell has exactly one linked_text annotation and its value contains text.

    :param tables: the "tables" part of the API response content
    :param table_id: index of the table
    :param row_id: index of the row inside the table
    :param col_id: index of the cell inside the row
    :param text: text that should be a part of the annotation value
    """
    first_line = tables[table_id]["cells"][row_id][col_id]["lines"][0]
    linked_text_annotations = list(filter(lambda annotation: annotation["name"] == "linked_text", first_line["annotations"]))
    self.assertEqual(len(linked_text_annotations), 1)
    self.assertIn(text, linked_text_annotations[0]["value"])

def test_ods_formulas(self) -> None:
file_name = "example_formulas.ods"
result = self._send_request(file_name)
Expand Down
Binary file modified tests/data/docx/with_comments.docx
Binary file not shown.
Binary file added tests/data/xlsx/with_comments.xlsx
Binary file not shown.