From a5a1f81c1027129dae5d85f8bcaa8a1e5fb6beac Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Sat, 10 Feb 2024 07:42:26 +0100 Subject: [PATCH] chg: new style application logic. --- src/inscriptis/html_engine.py | 47 ++++----------------- src/inscriptis/model/html_document_state.py | 33 ++++++++++++++- tests/test_annotation_engine.py | 5 +-- 3 files changed, 43 insertions(+), 42 deletions(-) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 99fc746..684cfd8 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -10,7 +10,6 @@ from inscriptis.model.canvas import Canvas from inscriptis.model.config import ParserConfig from inscriptis.model.html_document_state import HtmlDocumentState -from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT from inscriptis.model.tag.a_tag import a_start_handler, a_end_handler from inscriptis.model.tag.br_tag import br_start_handler from inscriptis.model.tag.img_tag import img_start_handler @@ -54,7 +53,7 @@ class Inscriptis: def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None): # use the default configuration, if no config object is provided - self.config = config or ParserConfig() + config = config or ParserConfig() # setup start and end tag call tables self.start_tag_handler_dict = { @@ -66,8 +65,8 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None "ol": ol_start_handler, "li": li_start_handler, "br": br_start_handler, - "a": a_start_handler if self.config.parse_a() else None, - "img": img_start_handler if self.config.display_images else None, + "a": a_start_handler if config.parse_a() else None, + "img": img_start_handler if config.display_images else None, } self.end_tag_handler_dict = { "table": table_end_handler, @@ -75,12 +74,11 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None "ol": ol_end_handler, "td": td_end_handler, "th": td_end_handler, - "a": a_end_handler if self.config.parse_a() else None, + "a": a_end_handler if config.parse_a() else None, } # parse the HTML tree - state = HtmlDocumentState(config) - self.canvas = self._parse_html_tree(state, html_tree) + self.canvas = self._parse_html_tree(HtmlDocumentState(config), html_tree) def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas: """Parse the HTML tree. @@ -89,7 +87,10 @@ def _parse_html_tree(self, state: HtmlDocumentState, tree) -> Canvas: tree: the HTML tree to parse. """ if isinstance(tree.tag, str): - self.handle_starttag(state, tree.tag, tree.attrib) + state.apply_starttag_layout(tree.tag, tree.attrib) + + if handler := self.start_tag_handler_dict.get(tree.tag): + handler(state, tree.attrib) cur = state.tags[-1] cur.canvas.open_tag(cur) @@ -119,33 +120,3 @@ def get_text(self) -> str: def get_annotations(self) -> List[Annotation]: """Return the annotations extracted from the HTML page.""" return self.canvas.annotations - - def handle_starttag(self, state, tag, attrs, handler): - """Handle HTML start tags. - - Compute the style of the current :class:`HtmlElement`, based on - - 1. the used :attr:`css`, - 2. apply attributes and css with :meth:`~Attribute.apply_attributes` - 3. add the `HtmlElement` to the list of open tags. - - Lookup and apply and tag-specific start tag handler in - :attr:`start_tag_handler_dict`. - - Args: - tag: the HTML start tag to process. - attrs: a dictionary of HTML attributes and their respective values. - """ - # use the css to handle tags known to it :) - cur = state.tags[-1].get_refined_html_element( - state.apply_attributes( - attrs, - html_element=state.css.get(tag, DEFAULT_HTML_ELEMENT) - .__copy__() - .set_tag(tag), - ) - ) - state.tags.append(cur) - - if handler: - handler(attrs) diff --git a/src/inscriptis/model/html_document_state.py b/src/inscriptis/model/html_document_state.py index a5affa5..528e628 100644 --- a/src/inscriptis/model/html_document_state.py +++ b/src/inscriptis/model/html_document_state.py @@ -1,11 +1,18 @@ +"""Represents the state of an HTML document. + +The provided `HtmlDocumentState` class contains and exposes all fields required for +representing the current state of the HTML to text conversion. +""" + from inscriptis import ParserConfig from inscriptis.model.canvas import Canvas +from inscriptis.model.html_element import DEFAULT_HTML_ELEMENT class HtmlDocumentState: """Represents the state of the parsed html document.""" - def __init__(self, config: ParserConfig = None): + def __init__(self, config: ParserConfig): # instance variables self.canvas = Canvas() self.config = config @@ -19,3 +26,27 @@ def __init__(self, config: ParserConfig = None): # used if display_links is enabled self.link_target = "" + + def apply_starttag_layout(self, tag, attrs): + """Compute the layout of the tag. + + Compute the style of the current :class:`HtmlElement`, based on + + 1. the used :attr:`css`, + 2. apply attributes and css with :meth:`~Attribute.apply_attributes` + 3. add the `HtmlElement` to the list of open tags. + + Args: + tag: the HTML start tag to process. + attrs: a dictionary of HTML attributes and their respective values. + """ + # use the css to handle tags known to it :) + cur = self.tags[-1].get_refined_html_element( + self.apply_attributes( + attrs, + html_element=self.css.get(tag, DEFAULT_HTML_ELEMENT) + .__copy__() + .set_tag(tag), + ) + ) + self.tags.append(cur) diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py index 7e78cbb..67b9050 100644 --- a/tests/test_annotation_engine.py +++ b/tests/test_annotation_engine.py @@ -14,7 +14,6 @@ def test_get_annotation(): rules = {'b': ['bold']} inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules)) - annotations = inscriptis.get_annotations() - assert text == "Chur is a City in Switzerland" - assert annotations == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')] + assert inscriptis.get_text() == "Chur is a City in Switzerland" + assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]