weblyzard · AlbertWeichselbraun · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024 · Jan 31, 2024
diff --git a/README.rst b/README.rst
@@ -185,7 +185,7 @@ HTML to annotated text conversion
 ---------------------------------
 convert and annotate HTML from a Web page using the provided annotation rules. 
 
-Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
+Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
 
   $ inscript https://www.fhgr.ch -r annotation-profile.json
 
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
 specified with the ``-p`` or ``--postprocessor`` command line argument::
 
   $ inscript https://www.fhgr.ch \
-          -r ./examples/annotation-profile.json \
+          -r ./examples/annotation/annotation-profile.json \
           -p surface
 
 
@@ -474,7 +474,8 @@ be used within a program:
 .. code-block:: python
 
   import urllib.request
-  from inscriptis import get_annotated_text, ParserConfig
+  from inscriptis import get_annotated_text
+  from inscriptis.model.config import ParserConfig
 
   url = "https://www.fhgr.ch"
   html = urllib.request.urlopen(url).read().decode('utf-8')
@@ -533,16 +534,21 @@ If the fine-tuning options discussed above are not sufficient, you may even over
 
 .. code-block:: python
 
-    inscriptis = Inscriptis(html, config)
+    from inscriptis.html_engine import Inscriptis
+    from functools import partial
 
-    inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
-    inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
+    inscriptis = Inscriptis(html_tree, config)
+
+    inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis)
+    inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis)
     text = inscriptis.get_text()
 
 
 In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
 You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``. 
 
+Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
+
 Optimizing memory consumption
 -----------------------------
 

diff --git a/examples/annotation-profile.json → examples/annotation/annotation-profile.json b/examples/annotation-profile.json → examples/annotation/annotation-profile.json
diff --git a/examples/stackoverflow.json → examples/annotation/stackoverflow.json b/examples/stackoverflow.json → examples/annotation/stackoverflow.json
diff --git a/examples/table-annotation-profile.json → .../annotation/table-annotation-profile.json b/examples/table-annotation-profile.json → .../annotation/table-annotation-profile.json
diff --git a/examples/unittest.json → examples/annotation/unittest.json b/examples/unittest.json → examples/annotation/unittest.json
diff --git a/...les/wikipedia-entities-and-citations.json → ...ion/wikipedia-entities-and-citations.json b/...les/wikipedia-entities-and-citations.json → ...ion/wikipedia-entities-and-citations.json
diff --git a/examples/wikipedia.json → examples/annotation/wikipedia.json b/examples/wikipedia.json → examples/annotation/wikipedia.json
diff --git a/examples/xda-developers.json → examples/annotation/xda-developers.json b/examples/xda-developers.json → examples/annotation/xda-developers.json
diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py
@@ -0,0 +1,35 @@
+#!/usr/bin/env python3
+
+"""
+Custom HTML tag handling example.
+
+Add a custom HTML handler for the bold <b> tag which encloses
+bold text with "**".
+
+Example:
+    "Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
+"""
+
+
+from inscriptis.html_engine import Inscriptis
+from functools import partial
+from lxml.html import fromstring
+
+
+def my_handle_start_b(self, attrs):
+    """Write the opening "**" marker when a <b> tag starts.
+
+    ``self`` is the object whose ``tags`` stack receives the output;
+    ``attrs`` (the attributes of the tag) is accepted but not used.
+    """
+    current_element = self.tags[-1]
+    current_element.write("**")
+
+
+def my_handle_end_b(self):
+    """Write the closing "**" marker when a </b> tag ends.
+
+    ``self`` is the object whose ``tags`` stack receives the output.
+    """
+    current_element = self.tags[-1]
+    current_element.write("**")
+
+
+# Input snippet used by this example.
+HTML = "Welcome to <b>Chur</b>"
+
+# Inscriptis operates on an lxml HTML tree rather than on the raw string.
+html_tree = fromstring(HTML)
+inscriptis = Inscriptis(html_tree)
+# The handlers are plain functions taking ``self`` as their first parameter;
+# ``partial`` binds the Inscriptis instance to it, so the start handler is then
+# invoked with the tag attributes only and the end handler with no arguments.
+inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis)
+inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis)
+# The tree is parsed inside get_text(), so the handlers must be registered
+# before this call.
+print(inscriptis.get_text())
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "inscriptis"
-version = "2.4.0.1"
+version = "2.5.0"
 authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
 description = "inscriptis - HTML to text converter."
 keywords = ["HTML", "converter", "text"]
@@ -59,5 +59,5 @@ line-length = 88
 target-version = ["py38", "py39", "py310", "py311", "py312"]
 extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
 include = '''
-  ^/src/|^/tests/|^/benchmarking/
+  ^/src/|^/tests/|^/benchmarking/|^/examples/
 '''
diff --git a/src/inscriptis/__init__.py b/src/inscriptis/__init__.py
@@ -132,5 +132,6 @@ def get_annotated_text(
         return {}
 
     inscriptis = Inscriptis(html_tree, config)
+    text = inscriptis.get_text()
     labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
-    return {"text": inscriptis.get_text(), "label": labels}
+    return {"text": text, "label": labels}
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -40,6 +40,7 @@ class Inscriptis:
 
     def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
         # use the default configuration, if no config object is provided
+        self.html_tree = html_tree
         self.config = config or ParserConfig()
 
         # setup start and end tag call tables
@@ -66,19 +67,16 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
 
         # instance variables
         self.canvas = Canvas()
-        self.css = self.config.css
-        self.apply_attributes = self.config.attribute_handler.apply_attributes
+        self._css = self.config.css
+        self._apply_attributes = self.config.attribute_handler.apply_attributes
 
-        self.tags = [self.css["body"].set_canvas(self.canvas)]
+        self.tags = [self._css["body"].set_canvas(self.canvas)]
         self.current_table = []
-        self.li_counter = []
-        self.last_caption = None
+        self._li_counter = []
+        self._last_caption = None
 
         # used if display_links is enabled
-        self.link_target = ""
-
-        # crawl the html tree
-        self._parse_html_tree(html_tree)
+        self._link_target = ""
 
     def _parse_html_tree(self, tree):
         """Parse the HTML tree.
@@ -108,10 +106,16 @@ def _parse_html_tree(self, tree):
 
     def get_text(self) -> str:
         """Return the text extracted from the HTML page."""
+        self._parse_html_tree(self.html_tree)
+        # Remember that the tree has been parsed. The flag is created here
+        # (and read with getattr below) so it does not rely on __init__.
+        self._parsed = True
         return self.canvas.get_text()
 
     def get_annotations(self) -> List[Annotation]:
         """Return the annotations extracted from the HTML page."""
+        # Check the parse flag rather than the rendered text: a parsed page
+        # may legitimately yield an empty text, which must not raise here.
+        # Raises ValueError only if get_text has not been called yet.
+        if not getattr(self, "_parsed", False):
+            raise ValueError(
+                "No text to annotate available yet. "
+                "Have you already parsed the page with get_text?"
+            )
         return self.canvas.annotations
 
     def handle_starttag(self, tag, attrs):
@@ -132,9 +136,9 @@ def handle_starttag(self, tag, attrs):
         """
         # use the css to handle tags known to it :)
         cur = self.tags[-1].get_refined_html_element(
-            self.apply_attributes(
+            self._apply_attributes(
                 attrs,
-                html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT)
+                html_element=self._css.get(tag, DEFAULT_HTML_ELEMENT)
                 .__copy__()
                 .set_tag(tag),
             )
@@ -159,43 +163,43 @@ def handle_endtag(self, tag):
             handler()
 
     def _start_ul(self, _):
-        self.li_counter.append(self.get_bullet())
+        self._li_counter.append(self.get_bullet())
 
     def _end_ul(self):
-        self.li_counter.pop()
+        self._li_counter.pop()
 
     def _start_img(self, attrs):
         image_text = attrs.get("alt", "") or attrs.get("title", "")
         if image_text and not (
-            self.config.deduplicate_captions and image_text == self.last_caption
+            self.config.deduplicate_captions and image_text == self._last_caption
         ):
             self.tags[-1].write(f"[{image_text}]")
-            self.last_caption = image_text
+            self._last_caption = image_text
 
     def _start_a(self, attrs):
-        self.link_target = ""
+        self._link_target = ""
         if self.config.display_links:
-            self.link_target = attrs.get("href", "")
+            self._link_target = attrs.get("href", "")
         if self.config.display_anchors:
-            self.link_target = self.link_target or attrs.get("name", "")
+            self._link_target = self._link_target or attrs.get("name", "")
 
-        if self.link_target:
+        if self._link_target:
             self.tags[-1].write("[")
 
     def _end_a(self):
-        if self.link_target:
-            self.tags[-1].write(f"]({self.link_target})")
+        if self._link_target:
+            self.tags[-1].write(f"]({self._link_target})")
 
     def _start_ol(self, _):
-        self.li_counter.append(1)
+        self._li_counter.append(1)
 
     def _end_ol(self):
-        self.li_counter.pop()
+        self._li_counter.pop()
 
     def _start_li(self, _):
-        bullet = self.li_counter[-1] if self.li_counter else "* "
+        bullet = self._li_counter[-1] if self._li_counter else "* "
         if isinstance(bullet, int):
-            self.li_counter[-1] += 1
+            self._li_counter[-1] += 1
             self.tags[-1].list_bullet = f"{bullet}. "
         else:
             self.tags[-1].list_bullet = bullet
@@ -262,4 +266,4 @@ def _newline(self, _):
 
     def get_bullet(self) -> str:
         """Return the bullet that correspond to the given index."""
-        return Inscriptis.UL_COUNTER[len(self.li_counter) % Inscriptis.UL_COUNTER_LEN]
+        return Inscriptis.UL_COUNTER[len(self._li_counter) % Inscriptis.UL_COUNTER_LEN]
diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py
@@ -0,0 +1,31 @@
+# test the annotation handling
+
+import pytest
+
+from inscriptis.annotation import Annotation
+from inscriptis.html_engine import Inscriptis
+from inscriptis.model.config import ParserConfig
+from lxml.html import fromstring
+
+
+def test_get_annotation():
+    """Test get_annotations from the Inscriptis class."""
+    html = "<b>Chur</b> is a City in <b>Switzerland</b>"
+    rules = {"b": ["bold"]}
+    inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
+
+    # get_text needs to be called prior to get_annotations, since
+    # otherwise no text to annotate is available.
+    with pytest.raises(ValueError):
+        # the return value is irrelevant here; only the exception matters
+        inscriptis.get_annotations()
+
+    # correct order
+    text = inscriptis.get_text()
+    annotations = inscriptis.get_annotations()
+
+    assert text == "Chur is a City in Switzerland"
+    assert annotations == [
+        Annotation(start=0, end=4, metadata="bold"),
+        Annotation(start=18, end=29, metadata="bold"),
+    ]
+
+
+
+