Skip to content

Commit

Permalink
chg: new style application logic.
Browse files Browse the repository at this point in the history
  • Loading branch information
AlbertWeichselbraun committed Feb 10, 2024
1 parent b4ce0ae commit a5a1f81
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 42 deletions.
47 changes: 9 additions & 38 deletions src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@
from inscriptis.model.canvas import Canvas
from inscriptis.model.config import ParserConfig
from inscriptis.model.html_document_state import HtmlDocumentState
from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT
from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler
from inscriptis.model.tag.br_tag import br_start_handler
from inscriptis.model.tag.img_tag import img_start_handler
Expand Down Expand Up @@ -54,7 +53,7 @@ class Inscriptis:

def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
# use the default configuration, if no config object is provided
self.config = config or ParserConfig()
config = config or ParserConfig()

# setup start and end tag call tables
self.start_tag_handler_dict = {
Expand All @@ -66,21 +65,20 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
"ol": ol_start_handler,
"li": li_start_handler,
"br": br_start_handler,
"a": a_start_handler if self.config.parse_a() else None,
"img": img_start_handler if self.config.display_images else None,
"a": a_start_handler if config.parse_a() else None,
"img": img_start_handler if config.display_images else None,
}
self.end_tag_handler_dict = {
"table": table_end_handler,
"ul": ul_end_handler,
"ol": ol_end_handler,
"td": td_end_handler,
"th": td_end_handler,
"a": a_end_handler if self.config.parse_a() else None,
"a": a_end_handler if config.parse_a() else None,
}

# parse the HTML tree
state = HtmlDocumentState(config)
self.canvas = self._parse_html_tree(state, html_tree)
self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree)

def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
"""Parse the HTML tree.
Expand All @@ -89,7 +87,10 @@ def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas:
tree: the HTML tree to parse.
"""
if isinstance(tree.tag, str):
self.handle_starttag(state, tree.tag, tree.attrib)
state.apply_starttag_layout(tree.tag, tree.attrib)

if handler := self.start_tag_handler_dict.get(tree.tag):
handler(state, tree.attrib)
cur = state.tags[-1]
cur.canvas.open_tag(cur)

Expand Down Expand Up @@ -119,33 +120,3 @@ def get_text(self) -> str:
def get_annotations(self) -> List[Annotation]:
    """Return the annotations collected on the canvas.

    Returns:
        The ``annotations`` attribute of :attr:`canvas`, passed through
        unchanged.
    """
    canvas = self.canvas
    return canvas.annotations

def handle_starttag(self, state, tag, attrs, handler=None):
    """Handle an HTML start tag.

    Compute the style of the current :class:`HtmlElement`, based on

    1. the used :attr:`css`,
    2. apply attributes and css with :meth:`~Attribute.apply_attributes`
    3. add the `HtmlElement` to the list of open tags.

    Afterwards look up and apply any tag-specific start tag handler
    registered in :attr:`start_tag_handler_dict`.

    Args:
        state: the document state; its ``tags``, ``css`` and
            ``apply_attributes`` members are used to build the element.
        tag: the HTML start tag to process.
        attrs: a dictionary of HTML attributes and their respective values.
        handler: optional start tag handler. When omitted, the handler
            registered for ``tag`` in :attr:`start_tag_handler_dict` is
            used, if there is one.
    """
    # use the css to handle tags known to it :)
    cur = state.tags[-1].get_refined_html_element(
        state.apply_attributes(
            attrs,
            html_element=state.css.get(tag, DEFAULT_HTML_ELEMENT)
            .__copy__()
            .set_tag(tag),
        )
    )
    state.tags.append(cur)

    # Callers may omit the handler; fall back to the call table so that the
    # lookup promised by the docstring actually happens.
    if handler is None:
        handler = self.start_tag_handler_dict.get(tag)

    # Entries in the call table may be None (e.g. "a"/"img" when disabled).
    if handler:
        # Handlers from the call table receive the document state as well
        # as the tag attributes.
        handler(state, attrs)
33 changes: 32 additions & 1 deletion src/inscriptis/model/html_document_state.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,18 @@
"""Represents the state of an HTML document.
The provided `HtmlDocumentState` class contains and exposes all fields required for
representing the current state of the HTML to text conversion.
"""

from inscriptis import ParserConfig
from inscriptis.model.canvas import Canvas
from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT


class HtmlDocumentState:
"""Represents the state of the parsed html document."""

def __init__(self, config: ParserConfig = None):
def __init__(self, config: ParserConfig):
# instance variables
self.canvas = Canvas()
self.config = config
Expand All @@ -19,3 +26,27 @@ def __init__(self, config: ParserConfig = None):

# used if display_links is enabled
self.link_target = ""

def apply_starttag_layout(self, tag, attrs):
    """Push the layout element for an opening tag onto the open-tag stack.

    The element is derived in three steps:

    1. take the entry registered for ``tag`` in :attr:`css` (falling back
       to ``DEFAULT_HTML_ELEMENT``), copy it and call ``set_tag(tag)`` on
       the copy,
    2. pass the result together with ``attrs`` through
       :meth:`apply_attributes`,
    3. let the innermost open element (``self.tags[-1]``) refine it via
       ``get_refined_html_element``.

    The refined element is then appended to :attr:`tags`.

    Args:
        tag: the HTML start tag to process.
        attrs: a dictionary of HTML attributes and their respective values.
    """
    # the innermost open element refines the new element in the last step
    parent = self.tags[-1]

    # start from the css entry for tags known to it, else the default
    template = self.css.get(tag, DEFAULT_HTML_ELEMENT)
    element = template.__copy__().set_tag(tag)

    styled = self.apply_attributes(attrs, html_element=element)
    self.tags.append(parent.get_refined_html_element(styled))
5 changes: 2 additions & 3 deletions tests/test_annotation_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@ def test_get_annotation():
rules = {'b': ['bold']}

inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
annotations = inscriptis.get_annotations()

assert text == "Chur is a City in Switzerland"
assert annotations == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]
assert inscriptis.get_text() == "Chur is a City in Switzerland"
assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]

0 comments on commit a5a1f81

Please sign in to comment.