#! /usr/bin/env python
#
# Miscellaneous HTML utility functions, in particular with support for resolving
# HTML rendered via JavaScript. This was motivated by the desire to extract
# images from pubchem.ncbi.nlm.nih.gov web pages for drugs (e.g., Ibuprofen,
# as illustrated below).
#
#-------------------------------------------------------------------------------
# Example usage:
#
# TODO: see what html_file should be set to
# $ PATH="$PATH:/usr/local/programs/selenium" DEBUG_LEVEL=6 MOZ_HEADLESS=1 $PYTHON html_utils.py "$html_file" > _html-utils-pubchem-ibuprofen.log7 2>&1
# $ cd $TMPDIR
# $ wc *ibuprofen*
# 13 65337 954268 post-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen
# 3973 24689 178909 post-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen.txt
# 60 1152 48221 pre-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen
#
# $ count_it.perl "<img" pre-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen
# $ count_it.perl "<img" post-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen
# <img 7
#
# $ diff pre-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen post-https%3A%2F%2Fpubchem.ncbi.nlm.nih.gov%2Fcompound%2FIbuprofen
# ...
# < <body>
# < <div id="js-rendered-content"></div>
# ---
# >
# > <div id="js-rendered-content"><div class="relative flex-container-vertical"><header class="bckg-white b-bottom"><div class="bckg
# 54c7
# < <script type="text/javascript" async src="/pcfe/summary/summary-v3.min.js"></script><!--<![endif]-->
# ---
# > <script type="text/javascript" async="" src="/pcfe/summary/summary-v3.min.js"></script><!--<![endif]-->
# 58,60c11,13
# < <script type="text/javascript" async src="https://www.ncbi.nlm.nih.gov/core/pinger/pinger.js"></script>
# < </body>
#
#-------------------------------------------------------------------------------
# TODO:
# - Standardize naming convention for URL parameter accessors (e.g., get_url_param vs. get_url_parameter).
#
"""HTML utility functions"""
import sys
import time
# Note: selenium import now optional
## OLD: from selenium import webdriver
import debug
import system
# Constants
MAX_DOWNLOAD_TIME = system.getenv_integer("MAX_DOWNLOAD_TIME", 60)
MID_DOWNLOAD_SLEEP_SECONDS = system.getenv_integer("MID_DOWNLOAD_SLEEP_SECONDS", 60)
POST_DOWNLOAD_SLEEP_SECONDS = system.getenv_integer("POST_DOWNLOAD_SLEEP_SECONDS", 0)
SKIP_BROWSER_CACHE = system.getenv_boolean("SKIP_BROWSER_CACHE", False)
USE_BROWSER_CACHE = (not SKIP_BROWSER_CACHE)
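# Note: the constants above can be overridden via the environment; for example
# (hypothetical values, following the command-line usage shown in the header):
#   MAX_DOWNLOAD_TIME=120 MID_DOWNLOAD_SLEEP_SECONDS=5 SKIP_BROWSER_CACHE=1 MOZ_HEADLESS=1 python html_utils.py <url>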
# Globals
# note: for convenience in Mako template code
user_parameters = {}
#-------------------------------------------------------------------------------
# Helper functions (TODO, put in system.py)
TEMP_DIR = system.getenv_text("TMPDIR", "/tmp")
#
def write_temp_file(filename, text):
    """Create FILENAME in temp. directory using TEXT"""
    temp_path = system.form_path(TEMP_DIR, filename)
    system.write_file(temp_path, text)
    return
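# Example (illustrative filename; the file is written under $TMPDIR, /tmp by default):
#   write_temp_file("pre-example.html", "<html><body>hi</body></html>")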
#-------------------------------------------------------------------------------
# HTML utility functions
browser_cache = {}
##
def get_browser(url):
    """Get existing browser for URL or create new one"""
    browser = None
    global browser_cache
    # Check for cached version of browser. If none, create one and access page.
    browser = browser_cache.get(url) if USE_BROWSER_CACHE else None
    if not browser:
        # HACK: unclean import (i.e., buried in function)
        from selenium import webdriver      # pylint: disable=import-error, import-outside-toplevel
        browser = webdriver.Firefox()
        if USE_BROWSER_CACHE:
            browser_cache[url] = browser
        browser.get(url)
        if POST_DOWNLOAD_SLEEP_SECONDS:
            time.sleep(POST_DOWNLOAD_SLEEP_SECONDS)
    # Make sure the bare minimum is included (i.e., "<body></body>")
    debug.assertion(len(browser.execute_script("return document.body.outerHTML")) > 13)
    debug.trace_fmt(5, "get_browser({u}) => {b}", u=url, b=browser)
    return browser
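# Example (a sketch; assumes Firefox plus geckodriver on the path, with
# MOZ_HEADLESS=1 set when no display is available):
#   browser = get_browser("https://www.example.com")
#   title = browser.execute_script("return document.title")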
def get_inner_html(url):
    """Return the fully-rendered version of the URL HTML source (e.g., after JavaScript DOM manipulation)"""
    # Based on https://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
    # Navigate to the page (or get browser instance with existing page)
    browser = get_browser(url)
    # Wait for Javascript to finish processing
    wait_until_ready(url)
    # Extract fully-rendered HTML
    inner_html = browser.execute_script("return document.body.innerHTML")
    debug.trace_fmt(7, "get_inner_html({u}) => {h}", u=url, h=inner_html)
    return inner_html
def get_inner_text(url):
    """Get text of URL after JavaScript processing"""
    # See https://developer.mozilla.org/en-US/docs/Web/API/HTMLElement/innerText
    # Navigate to the page (or get browser instance with existing page)
    browser = get_browser(url)
    # Wait for Javascript to finish processing
    wait_until_ready(url)
    # Extract fully-rendered text
    inner_text = browser.execute_script("return document.body.innerText")
    debug.trace_fmt(7, "get_inner_text({u}) => {t}", u=url, t=inner_text)
    return inner_text
def document_ready(url):
    """Determine whether document for URL has completed loading"""
    # See https://developer.mozilla.org/en-US/docs/Web/API/Document/readyState
    browser = get_browser(url)
    ready_state = browser.execute_script("return document.readyState")
    is_ready = (ready_state == "complete")
    debug.trace_fmt(6, "document_ready({u}) => {r}; state={s}",
                    u=url, r=is_ready, s=ready_state)
    return is_ready
def wait_until_ready(url):
    """Wait for document ready and pause to allow loading to finish"""
    # TODO: make sure the sleep is proper way to pause
    debug.trace_fmt(5, "in wait_until_ready({u})", u=url)
    start_time = time.time()
    end_time = start_time + MAX_DOWNLOAD_TIME
    while ((time.time() < end_time) and (not document_ready(url))):
        time.sleep(MID_DOWNLOAD_SLEEP_SECONDS)
    if (not document_ready(url)):
        debug.trace_fmt(5, "Warning: time out ({s} secs) in accessing URL '{u}'", s=system.round_num(end_time - start_time, 1), u=url)
    debug.trace_fmt(5, "out wait_until_ready(); elapsed={t}s",
                    t=(time.time() - start_time))
    return
def escape_html_value(value):
    """Escape VALUE for HTML embedding"""
    return system.escape_html_text(value)
def unescape_html_value(value):
    """Undo HTML escaping of VALUE (i.e., inverse of escape_html_value)"""
    return system.unescape_html_text(value)
def escape_hash_value(hash_table, key):
    """Wrapper around escape_html_value for HASH_TABLE[KEY] (or "" if missing).
    Note: newlines are converted into <br>'s."""
    escaped_item_value = escape_html_value(hash_table.get(key, ""))
    escaped_value = escaped_item_value.replace("\n", "<br>")
    debug.trace_fmtd(7, "escape_hash_value({h}, {k}) => '{r}'", h=hash_table, k=key, r=escaped_value)
    return escaped_value
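# Example (illustrative; assumes system.escape_html_text does standard entity escaping):
#   escape_hash_value({"comment": "Tom & Jerry\nrerun"}, "comment") => "Tom &amp; Jerry<br>rerun"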
def get_param_dict(param_dict=None):
    """Returns parameter dict, using PARAM_DICT if non-null else the global USER_PARAMETERS"""
    return (param_dict if param_dict else user_parameters)
def set_param_dict(param_dict):
    """Sets global user_parameters to value of PARAM_DICT"""
    global user_parameters
    user_parameters = param_dict
def get_url_param(name, default_value="", param_dict=None):
    """Get value for NAME from PARAM_DICT (e.g., USER_PARAMETERS), using DEFAULT_VALUE (normally "").
    Note: It will be escaped for use in HTML."""
    param_dict = get_param_dict(param_dict)
    value = escape_html_value(param_dict.get(name, default_value))
    value = system.to_unicode(value)
    debug.trace_fmtd(4, "get_url_param({n}, [{d}]) => {v}",
                     n=name, d=default_value, v=value)
    return value
#
get_url_parameter = get_url_param
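# Example (sketch of usage, e.g., from Mako template code per the note above;
# assumes standard HTML entity escaping):
#   set_param_dict({"query": "a < b"})
#   get_url_param("query")           => "a &lt; b"
#   get_url_param("missing", "n/a")  => "n/a"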
def get_url_param_checkbox_spec(name, default_value="", param_dict=None):
    """Get value of boolean parameter formatted for checkbox (i.e., 'checked' iff True or 'on') from PARAM_DICT"""
    # TODO: implement in terms of get_url_param
    param_dict = get_param_dict(param_dict)
    param_value = param_dict.get(name, default_value)
    param_value = system.to_unicode(param_value)
    value = "checked" if (param_value in [True, "on"]) else ""
    debug.trace_fmtd(4, "get_url_param_checkbox_spec({n}, [{d}]) => {v}",
                     n=name, d=default_value, v=value)
    return value
#
get_url_parameter_checkbox_spec = get_url_param_checkbox_spec
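# Example:
#   get_url_param_checkbox_spec("verbose", param_dict={"verbose": "on"})   => "checked"
#   get_url_param_checkbox_spec("verbose", param_dict={"verbose": "off"})  => ""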
def get_url_parameter_bool(param, default_value=False, param_dict=None):
    """Get boolean value for PARAM from PARAM_DICT, with "on" treated as True.
    @note the hash defaults to user_parameters, and the default value is False"""
    # TODO: implement in terms of get_url_param
    param_dict = get_param_dict(param_dict)
    ## OLD:
    result = (param_dict.get(param, default_value) in ["on", True])
    ## HACK: result = ((system.to_unicode(param_dict.get(param, default_value))) in ["on", True])
    debug.trace_fmtd(4, "get_url_parameter_bool({p}, {dft}, _) => {r}",
                     p=param, dft=default_value, r=result)
    return result
#
get_url_param_bool = get_url_parameter_bool
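# Example:
#   get_url_parameter_bool("verbose", param_dict={"verbose": "on"})   => True
#   get_url_parameter_bool("verbose", param_dict={"verbose": "off"})  => False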
#-------------------------------------------------------------------------------
def main(args):
    """Supporting code for command-line processing"""
    debug.trace_fmtd(6, "main({a})", a=args)
    user = system.getenv_text("USER")
    system.print_stderr("Warning, {u}: Not really intended for direct invocation".format(u=user))
    # HACK: Do simple test of inner-HTML support
    if (len(args) > 1):
        debug.trace_fmt(4, "browser_cache: {bc}", bc=browser_cache)
        url = args[1]
        html_data = system.download_web_document(url)
        filename = system.quote_url_text(url)
        if debug.debugging():
            write_temp_file("pre-" + filename, html_data)
        ## OLD: wait_until_ready(url)
        ## BAD: rendered_html = render(html_data)
        rendered_html = get_inner_html(url)
        if debug.debugging():
            write_temp_file("post-" + filename, rendered_html)
        print("Rendered html:")
        print(system.to_utf8(rendered_html))
        if debug.debugging():
            rendered_text = get_inner_text(url)
            debug.trace_fmt(5, "type(rendered_text): {t}", t=type(rendered_text))
            write_temp_file("post-" + filename + ".txt", rendered_text)
        debug.trace_fmt(4, "browser_cache: {bc}", bc=browser_cache)
    else:
        print("Specify a URL as argument 1 for a simple test of inner-HTML access")
    return
if __name__ == '__main__':
    main(sys.argv)