Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix/bug 81 custom html handling #82

Closed
wants to merge 8 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 12 additions & 6 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ HTML to annotated text conversion
---------------------------------
Convert and annotate HTML from a Web page using the provided annotation rules.

Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::

$ inscript https://www.fhgr.ch -r annotation-profile.json

Expand Down Expand Up @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
specified with the ``-p`` or ``--postprocessor`` command line argument::

$ inscript https://www.fhgr.ch \
-r ./examples/annotation-profile.json \
-r ./examples/annotation/annotation-profile.json \
-p surface


Expand Down Expand Up @@ -474,7 +474,8 @@ be used within a program:
.. code-block:: python

import urllib.request
from inscriptis import get_annotated_text, ParserConfig
from inscriptis import get_annotated_text
from inscriptis.model.config import ParserConfig

url = "https://www.fhgr.ch"
html = urllib.request.urlopen(url).read().decode('utf-8')
Expand Down Expand Up @@ -533,16 +534,21 @@ If the fine-tuning options discussed above are not sufficient, you may even over

.. code-block:: python

inscriptis = Inscriptis(html, config)
from inscriptis.html_engine import Inscriptis
from functools import partial

inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
inscriptis = Inscriptis(html_tree, config)

inscriptis.start_tag_handler_dict['a'] = partial(my_handle_start_a, inscriptis)
inscriptis.end_tag_handler_dict['a'] = partial(my_handle_end_a, inscriptis)
text = inscriptis.get_text()


In the example the standard HTML handlers for the ``a`` tag are overwritten with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.

Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.

Optimizing memory consumption
-----------------------------

Expand Down
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
35 changes: 35 additions & 0 deletions examples/custom-html-handling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env python3

"""
Custom HTML tag handling example.

Add a custom HTML handler for the bold <b> tag which encloses
bold text with "**".

Example:
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
"""


from inscriptis.html_engine import Inscriptis
from functools import partial
from lxml.html import fromstring


def my_handle_start_b(self, attrs):
    """Write the opening "**" marker for a <b> tag.

    Args:
        self: Object whose ``tags`` sequence ends with the element that is
            currently being written to.
        attrs: Attributes of the tag; accepted but not used here.
    """
    current_tag = self.tags[-1]
    current_tag.write("**")


def my_handle_end_b(self):
    """Write the closing "**" marker for a </b> tag.

    Args:
        self: Object whose ``tags`` sequence ends with the element that is
            currently being written to.
    """
    current_tag = self.tags[-1]
    current_tag.write("**")


# Sample input: a sentence that contains a single bold element.
HTML = "Welcome to <b>Chur</b>"

# Parse the markup with lxml and pass the resulting tree to Inscriptis.
html_tree = fromstring(HTML)
inscriptis = Inscriptis(html_tree)
# Register the custom <b> handlers. partial() binds the Inscriptis instance
# as the first argument (``self``) of the plain functions defined above.
inscriptis.start_tag_handler_dict["b"] = partial(my_handle_start_b, inscriptis)
inscriptis.end_tag_handler_dict["b"] = partial(my_handle_end_b, inscriptis)
# Convert the tree to text and print the result.
print(inscriptis.get_text())
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "inscriptis"
version = "2.4.0.1"
version = "2.5.0"
authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
description = "inscriptis - HTML to text converter."
keywords = ["HTML", "converter", "text"]
Expand Down Expand Up @@ -59,5 +59,5 @@ line-length = 88
target-version = ["py38", "py39", "py310", "py311", "py312"]
extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
include = '''
^/src/|^/tests/|^/benchmarking/
^/src/|^/tests/|^/benchmarking/|^/examples/
'''
3 changes: 2 additions & 1 deletion src/inscriptis/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,5 +132,6 @@ def get_annotated_text(
return {}

inscriptis = Inscriptis(html_tree, config)
text = inscriptis.get_text()
labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
return {"text": inscriptis.get_text(), "label": labels}
return {"text": text, "label": labels}
56 changes: 30 additions & 26 deletions src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class Inscriptis:

def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
# use the default configuration, if no config object is provided
self.html_tree = html_tree
self.config = config or ParserConfig()

# setup start and end tag call tables
Expand All @@ -66,19 +67,16 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None

# instance variables
self.canvas = Canvas()
self.css = self.config.css
self.apply_attributes = self.config.attribute_handler.apply_attributes
self._css = self.config.css
self._apply_attributes = self.config.attribute_handler.apply_attributes

self.tags = [self.css["body"].set_canvas(self.canvas)]
self.tags = [self._css["body"].set_canvas(self.canvas)]
self.current_table = []
self.li_counter = []
self.last_caption = None
self._li_counter = []
self._last_caption = None

# used if display_links is enabled
self.link_target = ""

# crawl the html tree
self._parse_html_tree(html_tree)
self._link_target = ""

def _parse_html_tree(self, tree):
"""Parse the HTML tree.
Expand Down Expand Up @@ -108,10 +106,16 @@ def _parse_html_tree(self, tree):

def get_text(self) -> str:
    """Return the text extracted from the HTML page.

    The HTML tree is parsed on the first call only. Later calls return the
    text of the already populated canvas, so the parser is never run twice
    against the same canvas.
    """
    # ``_parsed`` is created lazily here, hence the ``getattr`` lookup.
    if not getattr(self, "_parsed", False):
        self._parse_html_tree(self.html_tree)
        self._parsed = True
    return self.canvas.get_text()

def get_annotations(self) -> List[Annotation]:
    """Return the annotations extracted from the HTML page.

    Raises:
        ValueError: If the page has not been parsed with ``get_text`` yet.
    """
    # Check the parse flag rather than the extracted text: a page that
    # yields empty text has still been parsed and must not raise.
    if not getattr(self, "_parsed", False):
        raise ValueError(
            "No text to annotate available yet. "
            "Have you already parsed the page with get_text?"
        )
    return self.canvas.annotations

def handle_starttag(self, tag, attrs):
Expand All @@ -132,9 +136,9 @@ def handle_starttag(self, tag, attrs):
"""
# use the css to handle tags known to it :)
cur = self.tags[-1].get_refined_html_element(
self.apply_attributes(
self._apply_attributes(
attrs,
html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT)
html_element=self._css.get(tag, DEFAULT_HTML_ELEMENT)
.__copy__()
.set_tag(tag),
)
Expand All @@ -159,43 +163,43 @@ def handle_endtag(self, tag):
handler()

def _start_ul(self, _):
self.li_counter.append(self.get_bullet())
self._li_counter.append(self.get_bullet())

def _end_ul(self):
self.li_counter.pop()
self._li_counter.pop()

def _start_img(self, attrs):
image_text = attrs.get("alt", "") or attrs.get("title", "")
if image_text and not (
self.config.deduplicate_captions and image_text == self.last_caption
self.config.deduplicate_captions and image_text == self._last_caption
):
self.tags[-1].write(f"[{image_text}]")
self.last_caption = image_text
self._last_caption = image_text

def _start_a(self, attrs):
self.link_target = ""
self._link_target = ""
if self.config.display_links:
self.link_target = attrs.get("href", "")
self._link_target = attrs.get("href", "")
if self.config.display_anchors:
self.link_target = self.link_target or attrs.get("name", "")
self._link_target = self._link_target or attrs.get("name", "")

if self.link_target:
if self._link_target:
self.tags[-1].write("[")

def _end_a(self):
if self.link_target:
self.tags[-1].write(f"]({self.link_target})")
if self._link_target:
self.tags[-1].write(f"]({self._link_target})")

def _start_ol(self, _):
self.li_counter.append(1)
self._li_counter.append(1)

def _end_ol(self):
self.li_counter.pop()
self._li_counter.pop()

def _start_li(self, _):
bullet = self.li_counter[-1] if self.li_counter else "* "
bullet = self._li_counter[-1] if self._li_counter else "* "
if isinstance(bullet, int):
self.li_counter[-1] += 1
self._li_counter[-1] += 1
self.tags[-1].list_bullet = f"{bullet}. "
else:
self.tags[-1].list_bullet = bullet
Expand Down Expand Up @@ -262,4 +266,4 @@ def _newline(self, _):

def get_bullet(self) -> str:
"""Return the bullet that correspond to the given index."""
return Inscriptis.UL_COUNTER[len(self.li_counter) % Inscriptis.UL_COUNTER_LEN]
return Inscriptis.UL_COUNTER[len(self._li_counter) % Inscriptis.UL_COUNTER_LEN]
31 changes: 31 additions & 0 deletions tests/test_annotation_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# test the annotation handling

import pytest

from inscriptis.annotation import Annotation
from inscriptis.html_engine import Inscriptis
from inscriptis.model.config import ParserConfig
from lxml.html import fromstring


def test_get_annotation():
    """Test get_annotations from the Inscriptis class."""
    html = "<b>Chur</b> is a City in <b>Switzerland</b>"
    rules = {"b": ["bold"]}
    inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))

    # get_text needs to be called prior to get_annotations, since
    # otherwise no text to annotate is available.
    with pytest.raises(ValueError):
        inscriptis.get_annotations()

    # correct order
    text = inscriptis.get_text()
    annotations = inscriptis.get_annotations()

    assert text == "Chur is a City in Switzerland"
    assert annotations == [
        Annotation(start=0, end=4, metadata="bold"),
        Annotation(start=18, end=29, metadata="bold"),
    ]




Loading