From 0f3280d8728bef78a9f2e7a96d2ecf6dd0e550a0 Mon Sep 17 00:00:00 2001 From: Albert Weichselbraun Date: Fri, 16 Feb 2024 20:21:26 +0100 Subject: [PATCH] chg: improved documentation and code cleanup. --- examples/custom-html-handling.py | 4 ++-- src/inscriptis/html_engine.py | 8 ++++---- src/inscriptis/model/tag/__init__.py | 11 ++++++++--- 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/examples/custom-html-handling.py b/examples/custom-html-handling.py index f215fb5..03253a4 100755 --- a/examples/custom-html-handling.py +++ b/examples/custom-html-handling.py @@ -27,8 +27,8 @@ def my_handle_end_b(state: HtmlDocumentState): MY_MAPPING = CustomHtmlTagHandlerMapping( - start_tag_handler_mapping={"b": my_handle_start_b}, - end_tag_handler_mapping={"b": my_handle_end_b}, + start_tag_mapping={"b": my_handle_start_b}, + end_tag_mapping={"b": my_handle_end_b}, ) diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py index 3d0638a..42d849e 100644 --- a/src/inscriptis/html_engine.py +++ b/src/inscriptis/html_engine.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # coding:utf-8 """The HTML Engine is responsible for converting HTML to text.""" -from typing import List, Dict, Callable, Any +from typing import List, Dict, Callable import lxml.html from lxml.etree import Comment @@ -57,7 +57,7 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None # setup start and end tag call tables self.start_tag_handler_dict: Dict[ - str, Callable[[HtmlDocumentState, Any], None] + str, Callable[[HtmlDocumentState, Dict], None] ] = { "table": table_start_handler, "tr": tr_start_handler, @@ -81,10 +81,10 @@ def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None if config.custom_html_tag_handler_mapping: self.start_tag_handler_dict.update( - config.custom_html_tag_handler_mapping.start_tag_handler_mapping + config.custom_html_tag_handler_mapping.start_tag_mapping ) self.end_tag_handler_dict.update( - config.custom_html_tag_handler_mapping.end_tag_handler_mapping + config.custom_html_tag_handler_mapping.end_tag_mapping ) # parse the HTML tree diff --git a/src/inscriptis/model/tag/__init__.py b/src/inscriptis/model/tag/__init__.py index d329a2e..c0d29c2 100644 --- a/src/inscriptis/model/tag/__init__.py +++ b/src/inscriptis/model/tag/__init__.py @@ -9,7 +9,12 @@ class CustomHtmlTagHandlerMapping(NamedTuple): - """Provide a custom HTML Tag handler mapping.""" + """Refine the standard HTML Tag handling with the provided mapping. - start_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] - end_tag_handler_mapping: Dict[str, Callable[[HtmlDocumentState], None]] + Attributes: + start_tag_mapping: a dictionary of custom start tag handlers. + end_tag_mapping: a dictionary of custom end tag handlers. + """ + + start_tag_mapping: Dict[str, Callable[[HtmlDocumentState, Dict], None]] + end_tag_mapping: Dict[str, Callable[[HtmlDocumentState], None]]