ispras · NastyBoget · Mar 4, 2026 · Feb 27, 2026 · Mar 2, 2026
diff --git a/dedoc/readers/docx_reader/data_structures/docx_document.py b/dedoc/readers/docx_reader/data_structures/docx_document.py
@@ -12,8 +12,8 @@
 from dedoc.data_structures.line_with_meta import LineWithMeta
 from dedoc.readers.docx_reader.data_structures.table import DocxTable
 from dedoc.readers.docx_reader.data_structures.utils import Counter, ParagraphMaker
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
 from dedoc.readers.docx_reader.line_with_meta_converter import LineWithMetaConverter
+from dedoc.readers.docx_reader.note_extractor import NoteExtractor
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
 from dedoc.utils.office_utils import get_bs_from_zip
@@ -47,8 +47,9 @@ def __get_paragraph_maker(self) -> ParagraphMaker:
             path_hash=calculate_file_hash(path=self.path),
             styles_extractor=styles_extractor,
             numbering_extractor=numbering_extractor,
-            footnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
-            endnote_extractor=FootnoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote")
+            footnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/footnotes.xml")),
+            endnote_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/endnotes.xml"), key="endnote"),
+            comment_extractor=NoteExtractor(get_bs_from_zip(self.path, "word/comments.xml"), key="comment")
         )
 
     def __get_lines(self) -> List[LineWithMeta]:

diff --git a/dedoc/readers/docx_reader/data_structures/paragraph.py b/dedoc/readers/docx_reader/data_structures/paragraph.py
@@ -4,7 +4,7 @@
 
 from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
 from dedoc.readers.docx_reader.data_structures.run import Run
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
+from dedoc.readers.docx_reader.note_extractor import NoteExtractor
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.properties_extractor import change_paragraph_properties, change_run_properties
 from dedoc.readers.docx_reader.styles_extractor import StyleType, StylesExtractor
@@ -16,8 +16,9 @@ def __init__(self,
                  xml: Tag,
                  styles_extractor: StylesExtractor,
                  numbering_extractor: NumberingExtractor,
-                 footnote_extractor: FootnoteExtractor,
-                 endnote_extractor: FootnoteExtractor,
+                 footnote_extractor: NoteExtractor,
+                 endnote_extractor: NoteExtractor,
+                 comment_extractor: NoteExtractor,
                  uid: str) -> None:
         """
         Contains information about paragraph properties.
@@ -30,9 +31,10 @@ def __init__(self,
         self.xml = xml
         self.footnote_extractor = footnote_extractor
         self.endnote_extractor = endnote_extractor
+        self.comment_extractor = comment_extractor
         self.numbering_extractor = numbering_extractor
         self.styles_extractor = styles_extractor
-        self.footnotes = []
+        self.notes = []
         self.runs = []
         self.runs_ids = []  # list of (start, end) inside the paragraph text
         self.text = ""
@@ -85,12 +87,8 @@ def __parse(self) -> None:
         if hasattr(self, "caps") and self.caps:
             self.text = self.text.upper()
 
-        for key, extractor in [("w:footnoteReference", self.footnote_extractor), ("w:endnoteReference", self.endnote_extractor)]:
-            notes = self.xml.find_all(key)
-            for footnote in notes:
-                note_id = footnote.get("w:id")
-                if note_id in extractor.id2footnote:
-                    self.footnotes.append(extractor.id2footnote[note_id])
+        for extractor in [self.footnote_extractor, self.endnote_extractor]:
+            self.notes.extend(extractor.get_notes(self.xml))
 
     def __get_numbering_formatting(self) -> Optional[Run]:
         """
@@ -99,7 +97,7 @@ def __get_numbering_formatting(self) -> Optional[Run]:
         :returns: numbering run if there is the text in numbering else None
         """
         if self.xml.numPr and self.numbering_extractor:
-            numbering_run = Run(self, self.styles_extractor)
+            numbering_run = Run(self, self.styles_extractor, self.comment_extractor)
             self.numbering_extractor.parse(self.xml.numPr, self, numbering_run)
 
             if numbering_run.text:
@@ -115,7 +113,7 @@ def __make_run_list(self) -> None:
         run_list = self.xml.find_all("w:r")
 
         for run_tree in run_list:
-            new_run = Run(self, self.styles_extractor)
+            new_run = Run(self, self.styles_extractor, self.comment_extractor)
 
             if run_tree.rStyle:
                 self.styles_extractor.parse(run_tree.rStyle["w:val"], new_run, StyleType.CHARACTER)
@@ -126,6 +124,9 @@ def __make_run_list(self) -> None:
                 change_run_properties(new_run, run_tree.rPr)
             new_run.get_text(run_tree)
             if not new_run.text:
+                if new_run.linked_text and self.runs:
+                    prev_linked_text = self.runs[-1].linked_text
+                    self.runs[-1].linked_text = new_run.linked_text if not prev_linked_text else f"{prev_linked_text}; {new_run.linked_text}"
                 continue
 
             if self.runs and self.runs[-1] == new_run:

diff --git a/dedoc/readers/docx_reader/data_structures/run.py b/dedoc/readers/docx_reader/data_structures/run.py
@@ -3,28 +3,36 @@
 from bs4 import Tag
 
 from dedoc.readers.docx_reader.data_structures.base_props import BaseProperties
+from dedoc.readers.docx_reader.note_extractor import NoteExtractor
 from dedoc.readers.docx_reader.properties_extractor import change_caps
 
 
 class Run(BaseProperties):
 
-    def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor") -> None:  # noqa
+    def __init__(self, properties: Optional[BaseProperties], styles_extractor: "StylesExtractor", comment_extractor: Optional[NoteExtractor] = None) -> None:  # noqa
         """
         Contains information about run properties.
         :param properties: Paragraph or Run for copying its properties
         :param styles_extractor: StylesExtractor
+        :param comment_extractor: NoteExtractor for comments, if None the comments referenced from the run are not extracted
         """
 
         self.name2char = dict(tab="\t", br="\n", cr="\r")
         self.text = ""
+        self.linked_text = ""  # texts of the comments referenced from the run joined with "; ", filled in get_text
         self.styles_extractor = styles_extractor
+        self.comment_extractor = comment_extractor
         super().__init__(properties)
 
     def get_text(self, xml: Tag) -> None:
         """
         Makes the text of run.
         :param xml: BeautifulSoup tree with run properties
         """
+        notes = self.comment_extractor.get_notes(xml) if self.comment_extractor else None
+        if notes:
+            self.linked_text = "; ".join(notes)
+
         for tag in xml:
             tag_name = tag.name
 
@@ -56,4 +64,5 @@ def __eq__(self, other: "Run") -> bool:
         size_eq = self.size == other.size
         font_eq = self.bold == other.bold and self.italic == other.italic and self.underlined == other.underlined
         script_eq = self.superscript == other.superscript and self.subscript == other.subscript
-        return size_eq and font_eq and script_eq
+        linked_text_eq = self.linked_text == other.linked_text
+        return size_eq and font_eq and script_eq and linked_text_eq
diff --git a/dedoc/readers/docx_reader/data_structures/utils.py b/dedoc/readers/docx_reader/data_structures/utils.py
@@ -6,7 +6,7 @@
 from bs4 import Tag
 
 from dedoc.readers.docx_reader.data_structures.paragraph import Paragraph
-from dedoc.readers.docx_reader.footnote_extractor import FootnoteExtractor
+from dedoc.readers.docx_reader.note_extractor import NoteExtractor
 from dedoc.readers.docx_reader.numbering_extractor import NumberingExtractor
 from dedoc.readers.docx_reader.styles_extractor import StylesExtractor
 
@@ -35,14 +35,16 @@ def __init__(self,
                  counter: Counter,
                  styles_extractor: StylesExtractor,
                  numbering_extractor: NumberingExtractor,
-                 footnote_extractor: FootnoteExtractor,
-                 endnote_extractor: FootnoteExtractor) -> None:
+                 footnote_extractor: NoteExtractor,
+                 endnote_extractor: NoteExtractor,
+                 comment_extractor: NoteExtractor) -> None:
         self.counter = counter
         self.path_hash = path_hash
         self.styles_extractor = styles_extractor
         self.numbering_extractor = numbering_extractor
         self.footnote_extractor = footnote_extractor
         self.endnote_extractor = endnote_extractor
+        self.comment_extractor = comment_extractor
         self.uids_set = set()
 
     def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) -> Paragraph:
@@ -52,6 +54,7 @@ def make_paragraph(self, paragraph_xml: Tag, paragraph_list: List[Paragraph]) ->
                               numbering_extractor=self.numbering_extractor,
                               footnote_extractor=self.footnote_extractor,
                               endnote_extractor=self.endnote_extractor,
+                              comment_extractor=self.comment_extractor,
                               uid=uid)
         prev_paragraph = None if len(paragraph_list) == 0 else paragraph_list[-1]
         paragraph.spacing = paragraph.spacing_before if prev_paragraph is None else max(prev_paragraph.spacing_after, paragraph.spacing_before)

diff --git a/dedoc/readers/docx_reader/footnote_extractor.py b/dedoc/readers/docx_reader/footnote_extractor.py
diff --git a/dedoc/readers/docx_reader/line_with_meta_converter.py b/dedoc/readers/docx_reader/line_with_meta_converter.py
@@ -24,7 +24,9 @@ def __init__(self, paragraph: Paragraph, paragraph_id: int) -> None:
         Converts custom DOCX Paragraph to LineWithMeta class.
         :param paragraph: Paragraph for converting its properties to the unified representation.
         """
-        annotations = [BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation]
+        annotations = [
+            BoldAnnotation, ItalicAnnotation, UnderlinedAnnotation, StrikeAnnotation, SuperscriptAnnotation, SubscriptAnnotation, LinkedTextAnnotation
+        ]
         self.dict2annotation = {annotation.name: annotation for annotation in annotations}
         self.annotation_merger = AnnotationMerger()
 
@@ -37,8 +39,8 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
             AlignmentAnnotation(start=0, end=len(paragraph.text), value=paragraph.jc),
             SpacingAnnotation(start=0, end=len(paragraph.text), value=str(paragraph.spacing))
         ]
-        for footnote in paragraph.footnotes:
-            annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=footnote))
+        for note in paragraph.notes:
+            annotations.append(LinkedTextAnnotation(start=0, end=len(paragraph.text), value=note))
 
         if paragraph.style_name is not None:
             annotations.append(StyleAnnotation(start=0, end=len(paragraph.text), value=paragraph.style_name))
@@ -47,7 +49,7 @@ def __parse(self, paragraph: Paragraph, paragraph_id: int) -> LineWithMeta:
 
         for run, (start, end) in zip(paragraph.runs, paragraph.runs_ids):
             annotations.append(SizeAnnotation(start=start, end=end, value=str(run.size / 2)))
-            for property_name in ["bold", "italic", "underlined", "strike", "superscript", "subscript"]:
+            for property_name in self.dict2annotation:
                 property_value = getattr(run, property_name)
                 if property_value:
                     annotations.append(self.dict2annotation[property_name](start=start, end=end, value=str(property_value)))

diff --git a/dedoc/readers/docx_reader/note_extractor.py b/dedoc/readers/docx_reader/note_extractor.py
@@ -0,0 +1,33 @@
+from typing import Dict, List, Optional
+
+from bs4 import BeautifulSoup, Tag
+
+
+class NoteExtractor:
+    """Stores texts of ``w:<key>`` notes (footnote, endnote or comment) by their ``w:id`` and finds the notes referenced from a tag."""
+
+    def __init__(self, xml: Optional[BeautifulSoup], key: str = "footnote") -> None:
+        """
+        :param xml: BeautifulSoup tree with the ``w:<key>`` tags of the notes, no notes are stored if it is None
+        :param key: footnote, endnote or comment
+        """
+        self.key = key
+        self.id2note: Dict[str, str] = {}
+        if not xml:
+            return
+
+        for note in xml.find_all(f"w:{key}"):
+            note_id = note.get("w:id")
+            note_text = " ".join(t.text for t in note.find_all("w:t") if t.text)
+            if not note_id or not note_text:
+                continue  # check the text before adding the author: a note without text must not turn into a bare "author: "
+            author = note.get("w:author")
+            self.id2note[note_id] = f"{author}: {note_text}" if author else note_text
+
+    def get_notes(self, xml: Tag) -> List[str]:
+        """
+        :param xml: BeautifulSoup tag that may contain ``w:<key>Reference`` tags
+        :returns: texts of the referenced notes in the order find_all returns the references, unknown ids are skipped
+        """
+        note_ids = (reference.get("w:id") for reference in xml.find_all(f"w:{self.key}Reference"))
+        return [self.id2note[note_id] for note_id in note_ids if note_id in self.id2note]
diff --git a/dedoc/readers/excel_reader/excel_reader.py b/dedoc/readers/excel_reader/excel_reader.py
@@ -2,6 +2,7 @@
 
 from xlrd.sheet import Sheet
 
+from dedoc.data_structures.concrete_annotations.linked_text_annotation import LinkedTextAnnotation
 from dedoc.data_structures.table import Table
 from dedoc.data_structures.unstructured_document import UnstructuredDocument
 from dedoc.readers.base_reader import BaseReader
@@ -54,8 +55,13 @@ def __parse_sheet(self, sheet_id: int, sheet: Sheet) -> Table:
         for row_id in range(n_rows):
             row = []
             for col_id in range(n_cols):
-                value = str(sheet.cell_value(rowx=row_id, colx=col_id))
-                row.append(CellWithMeta(lines=[LineWithMeta(line=value, metadata=LineMetadata(page_id=sheet_id, line_id=0))]))
+                cell_text = str(sheet.cell_value(rowx=row_id, colx=col_id))
+                if (row_id, col_id) in sheet.cell_note_map:
+                    note_text = sheet.cell_note_map[(row_id, col_id)].text.replace("\n", " ")
+                    annotations = [LinkedTextAnnotation(start=0, end=len(cell_text), value=note_text)]
+                else:
+                    annotations = []
+                row.append(CellWithMeta(lines=[LineWithMeta(line=cell_text, metadata=LineMetadata(page_id=sheet_id, line_id=0), annotations=annotations)]))
             res.append(row)
         metadata = TableMetadata(page_id=sheet_id)
         return Table(cells=res, metadata=metadata)
diff --git a/docker-compose.yml b/docker-compose.yml
@@ -29,6 +29,6 @@ services:
       is_test: $test
 
   grobid:
-      image: "lfoppiano/grobid:0.8.0"
+      image: "grobid/grobid:0.8.2"
       ports:
         - 8070:8070
diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py
@@ -40,12 +40,12 @@ def test_article(self) -> None:
 
         # check bibliography list
         self.assertEqual("bibliography", self._get_by_tree_path(tree, "0.12")["metadata"]["paragraph_type"])
-        self.assertEqual(65, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))
+        self.assertEqual(64, len(self._get_by_tree_path(tree, "0.12")["subparagraphs"]))
 
         # check bib_item 1 recognizing
         self.assertEqual("title", self._get_by_tree_path(tree, "0.12.0.0")["metadata"]["paragraph_type"])
         self.assertEqual("Leakage-resilient symmetric encryption via re-keying", self._get_by_tree_path(tree, "0.12.0.0")["text"])
-        self.assertEqual("title_conference_proceedings", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
+        self.assertEqual("title_journal", self._get_by_tree_path(tree, "0.12.0.1")["metadata"]["paragraph_type"])
         self.assertEqual("Bertoni and Coron", self._get_by_tree_path(tree, "0.12.0.1")["text"])
         self.assertEqual("author", self._get_by_tree_path(tree, "0.12.0.2")["metadata"]["paragraph_type"])  # author 1
         self.assertEqual("Michel Abdalla", self._get_by_tree_path(tree, "0.12.0.2")["text"])
@@ -55,12 +55,12 @@ def test_article(self) -> None:
         self.assertEqual("471-488", self._get_by_tree_path(tree, "0.12.0.6")["text"])
 
         # check cite on bib_item
-        bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"]  # checking on [58] references
+        bibliography_item_uuid = self._get_by_tree_path(tree, "0.12.57")["metadata"]["uid"]
         section = self._get_by_tree_path(tree, "0.4.0")
         bibliography_refs_in_text = [ann for ann in section["annotations"] if ann["name"] == "reference" and ann["value"] == bibliography_item_uuid]
-        # We must found two refs [58] in Introduction section
-        self.assertEqual(len(bibliography_refs_in_text), 2)
-        self.assertEqual(["58,", "58,"], [section["text"][bibliography_refs_in_text[n]["start"]:bibliography_refs_in_text[n]["end"]] for n in range(2)])
+        # We must find ref [59] in the Introduction section
+        self.assertEqual(len(bibliography_refs_in_text), 1)
+        self.assertEqual("59]", section["text"][bibliography_refs_in_text[0]["start"]:bibliography_refs_in_text[0]["end"]])
 
         # check tables
         self.assertEqual(len(result["content"]["tables"]), 2)

diff --git a/tests/api_tests/test_api_format_docx.py b/tests/api_tests/test_api_format_docx.py
@@ -92,7 +92,25 @@ def test_not_stripped_xml(self) -> None:
         self._send_request("not_stripped_xml.docx", expected_code=200)
 
     def test_docx_with_comments(self) -> None:
-        _ = self._send_request("with_comments.docx", expected_code=200)
+        content = self._send_request("with_comments.docx")["content"]  # comments are expected as linked_text annotations
+        structure = content["structure"]
+
+        node = get_by_tree_path(structure, "0.0.0")  # two annotations expected, the first one must contain both checked texts
+        annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"]
+        self.assertEqual(len(annotations), 2)
+        self.assertIn("Interesting   entity   type", annotations[0]["value"])
+        self.assertIn("Some reply", annotations[0]["value"])
+        self.assertIn("New comment", annotations[1]["value"])
+
+        node = get_by_tree_path(structure, "0.0.1.6")  # a single annotation expected on this node
+        annotations = [ann for ann in node["annotations"] if ann["name"] == "linked_text"]
+        self.assertEqual(len(annotations), 1)
+        self.assertIn("Примечание об организации", annotations[0]["value"])
+
+        cell_node = content["tables"][1]["cells"][1][0]["lines"][0]  # first line of a table cell: cell comments are checked too
+        annotations = [ann for ann in cell_node["annotations"] if ann["name"] == "linked_text"]
+        self.assertEqual(len(annotations), 1)
+        self.assertIn("Примечание о методе  LSTM", annotations[0]["value"])
 
     def test_return_html(self) -> None:
         file_name = "example.doc"

diff --git a/tests/api_tests/test_api_format_excel.py b/tests/api_tests/test_api_format_excel.py
@@ -37,6 +37,22 @@ def test_xls(self) -> None:
         tables = result["content"]["tables"]
         self.__check_content(tables)
 
+    def test_xlsx_comments(self) -> None:
+        """Cell comments of with_comments.xlsx must be returned as linked_text annotations of the corresponding table cells."""
+        tables = self._send_request("with_comments.xlsx")["content"]["tables"]
+        # (table_id, row_id, col_id, expected comment text)
+        expected_comments = [(0, 1, 2, "Примечание что телефон указан не верно"), (0, 4, 0, "Заметка об организации"),
+                             (1, 0, 1, "Неточное название столбца"), (1, 5, 0, "Примечание о персоне Иванов Сергей"),
+                             (1, 9, 2, "Номер телефона под вопросом")]
+        for table_id, row_id, col_id, comment_text in expected_comments:
+            self.__check_cell_comment(tables, table_id, row_id, col_id, comment_text)
+
+    def __check_cell_comment(self, tables: dict, table_id: int, row_id: int, col_id: int, text: str) -> None:
+        """Check that the first line of the given table cell has exactly one linked_text annotation whose value contains `text`."""
+        matched = [ann for ann in tables[table_id]["cells"][row_id][col_id]["lines"][0]["annotations"] if ann["name"] == "linked_text"]
+        self.assertEqual(len(matched), 1)
+        self.assertIn(text, matched[0]["value"])
+
     def test_ods_formulas(self) -> None:
         file_name = "example_formulas.ods"
         result = self._send_request(file_name)

diff --git a/tests/data/docx/with_comments.docx b/tests/data/docx/with_comments.docx
diff --git a/tests/data/xlsx/with_comments.xlsx b/tests/data/xlsx/with_comments.xlsx