42 changes: 33 additions & 9 deletions newspaper/article.py
@@ -153,6 +153,9 @@ def __init__(self, url, title='', source_url='', config=None, **kwargs):
# A property dict for users to store custom data.
self.additional_data = {}

# The final URL after redirects and meta refresh
self.final_url = None

def build(self):
"""Build a lone article from a URL independent of the source (newspaper).
Don't normally call this method b/c it's good to multithread articles
@@ -173,7 +176,9 @@ def _parse_scheme_file(self, path):

def _parse_scheme_http(self):
try:
return network.get_html_2XX_only(self.url, self.config)
html, final_url = network.get_html_2XX_only(self.url, self.config, return_final_url=True)
self.final_url = final_url
return html
except requests.exceptions.RequestException as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
self.download_exception_msg = str(e)
@@ -190,18 +195,27 @@ def download(self, input_html=None, title=None, recursion_counter=0):
parsed_url = urlparse(self.url)
if parsed_url.scheme == "file":
html = self._parse_scheme_file(parsed_url.path)
# For file scheme, the final URL is the same as the initial URL
if self.final_url is None:
self.final_url = self.url
else:
html = self._parse_scheme_http()
# final_url is already set in _parse_scheme_http
if html is None:
log.debug('Download failed on URL %s because of %s' %
(self.url, self.download_exception_msg))
return
else:
html = input_html
# If HTML is provided directly and final_url not set, use the current URL
if self.final_url is None:
self.final_url = self.url

if self.config.follow_meta_refresh:
meta_refresh_url = extract_meta_refresh(html)
if meta_refresh_url and recursion_counter < 1:
# Update final_url to the meta refresh URL
self.final_url = meta_refresh_url
return self.download(
input_html=network.get_html(meta_refresh_url),
recursion_counter=recursion_counter + 1)
@@ -213,19 +227,22 @@ def parse(self):
self.throw_if_not_downloaded_verbose()

self.doc = self.config.get_parser().fromstring(self.html)
self.clean_doc = copy.deepcopy(self.doc)

if self.doc is None:
# `parse` call failed, return nothing
return

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

self.clean_doc = copy.deepcopy(self.doc)
# Before any computations on the body, clean DOM object
self.clean_doc = document_cleaner.clean(self.clean_doc)

# TODO: Fix this, sync in our fix_url() method
parse_candidate = self.get_parse_candidate()
self.link_hash = parse_candidate.link_hash # MD5

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

title = self.extractor.get_title(self.clean_doc)
self.set_title(title)

@@ -267,16 +284,23 @@ def parse(self):
self.url,
self.clean_doc)

# Before any computations on the body, clean DOM object
self.doc = document_cleaner.clean(self.doc)

self.top_node = self.extractor.calculate_best_node(self.doc)
if self.top_node is None:
self.top_node = self.extractor.calculate_best_node(self.clean_doc)
if self.top_node is None:
self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
if self.top_node is None:
for tag in ['article', 'main']:
nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
if len(nodes) > 0:
self.top_node = nodes[0]
break
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())

self.top_node = self.extractor.post_cleanup(self.top_node)
self.clean_top_node = copy.deepcopy(self.top_node)
self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)

text, article_html = output_formatter.get_formatted(
self.top_node)
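The net effect of the article.py changes is a new `final_url` attribute that records where the download actually ended up, whether that differs from `article.url` because of HTTP redirects or a meta refresh. A minimal usage sketch, assuming this branch is installed (the URL below is only a placeholder):

```python
from newspaper import Article

article = Article('http://example.com/shortened-or-redirecting-link')
article.download()   # follows redirects / meta refresh and records the landing URL
article.parse()

print(article.url)        # the URL the Article was constructed with
print(article.final_url)  # the URL actually fetched (new in this change)
```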
5 changes: 5 additions & 0 deletions newspaper/configuration.py
@@ -57,6 +57,10 @@ def __init__(self):
# Fail for error responses (e.g. 404 page)
self.http_success_only = True

# Allow redirects (enabled by default)
self.allow_redirects = True

self.ignored_images_suffix_list = []
# English is the fallback
self._language = 'en'

@@ -68,6 +72,7 @@ def __init__(self):
self.request_timeout = 7
self.proxies = {}
self.number_threads = 10
self.verify_ssl_cert = True

self.verbose = False # for debugging

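Taken together, the configuration changes expose three new knobs: `allow_redirects`, `verify_ssl_cert`, and `ignored_images_suffix_list`. A hedged sketch of how they could be set (the values and URL are illustrative only):

```python
from newspaper import Article, Config

config = Config()
config.allow_redirects = False     # do not follow HTTP redirects (default: True)
config.verify_ssl_cert = False     # skip TLS certificate verification (default: True)
config.ignored_images_suffix_list = [r'\.svg$', r'1x1\.gif$']  # regex patterns, applied in extractors.py

article = Article('https://example.com/story.html', config=config)
article.download()
```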
27 changes: 22 additions & 5 deletions newspaper/extractors.py
@@ -13,6 +13,7 @@

import copy
import logging
import os.path
import re
from collections import defaultdict
@@ -449,26 +450,34 @@ def get_meta_img_url(self, article_url, doc):
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = None if self.image_is_ignored(try_one) else try_one
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
try_two = elems[0].get('href') if elems else None

try_two = None if self.image_is_ignored(try_two) else try_two
if not try_two:
try_three = self.get_meta_content(doc, 'meta[name="og:image"]')

try_three = None if self.image_is_ignored(try_three) else try_three
if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
elems = self.parser.getElementsByTag(doc, **link_icon_kwargs)
try_four = elems[0].get('href') if elems else None
try_four = None if self.image_is_ignored(try_four) else try_four

top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return ''

def image_is_ignored(self, image):
return bool(image) and any(self.match_image(x, os.path.basename(image)) for x in self.config.ignored_images_suffix_list)

def match_image(self, pattern, image):
return re.search(pattern, image) is not None

def get_meta_type(self, doc):
"""Returns meta type of article, open graph protocol
"""
@@ -575,6 +584,7 @@ def get_img_urls(self, article_url, doc):
for img_tag in img_tags if img_tag.get('src')]
img_links = set([urljoin(article_url, url)
for url in urls])
img_links = set([x for x in img_links if not self.image_is_ignored(x)])
return img_links

def get_first_img_url(self, article_url, top_node):
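The new `image_is_ignored` / `match_image` helpers treat each entry of `config.ignored_images_suffix_list` as a regular expression and test it against the basename of the candidate image URL; `get_img_urls` now filters its result set through the same check. A standalone sketch of that matching logic (patterns and URLs are made up for illustration):

```python
import os.path
import re

ignored_images_suffix_list = [r'\.svg$', r'placeholder', r'1x1\.gif$']

def image_is_ignored(image_url):
    """Mirror of the extractor helper: match each pattern against the URL's basename."""
    if not image_url:
        return False
    name = os.path.basename(image_url)
    return any(re.search(pattern, name) for pattern in ignored_images_suffix_list)

print(image_is_ignored('https://cdn.example.com/img/logo.svg'))   # True  (matches '\.svg$')
print(image_is_ignored('https://cdn.example.com/img/photo.jpg'))  # False
print(image_is_ignored(''))                                       # False (empty URLs are never ignored)
```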
@@ -1014,9 +1024,16 @@ def nodes_to_check(self, doc):
on like paragraphs and tables
"""
nodes_to_check = []
for tag in ['p', 'pre', 'td']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
articles = self.parser.getElementsByTag(doc, tag='article')
if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
# Specific heuristic for Medium articles
sections = self.parser.getElementsByTag(articles[0], tag='section')
if len(sections) > 1:
nodes_to_check = sections
if len(nodes_to_check) == 0:
for tag in ['p', 'pre', 'td', 'ol', 'ul']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
return nodes_to_check

def is_table_and_no_para_exist(self, e):
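The `nodes_to_check` change adds a Medium-specific fallback: when the page contains an `<article>` with more than one `<section>` and the site name resolves to Medium, the sections themselves are scored instead of the usual `p`/`pre`/`td`/`ol`/`ul` tags. A rough illustration of the selection logic using plain lxml (the HTML is a toy fixture, not a real Medium page):

```python
import lxml.html

html = """
<html><body>
  <article>
    <section><p>Intro paragraph.</p></section>
    <section><p>Main body paragraph.</p></section>
  </article>
</body></html>
"""
doc = lxml.html.fromstring(html)

sections = doc.xpath('//article//section')
if len(sections) > 1:      # Medium-style layout: score the sections themselves
    nodes_to_check = sections
else:                      # otherwise fall back to the usual candidate tags
    nodes_to_check = doc.xpath('//p | //pre | //td | //ol | //ul')

print(len(nodes_to_check))  # 2 -> the two <section> elements would be scored
```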
35 changes: 34 additions & 1 deletion newspaper/images.py
@@ -83,6 +83,37 @@ def clean_url(url):
return url


def get_full_image_dimensions(image_url):
"""Fallback in case PIL can't open the streamed image
"""
try:
response = requests.get(image_url) # No stream=True needed
response.raise_for_status() # Raise an exception for bad status codes

# Use io.BytesIO to treat the response content (bytes) as a file
image_bytes = io.BytesIO(response.content)

# Open the image directly from the bytes stream
img = Image.open(image_bytes)

sz = img.size

# It's good practice to close the image when done
img.close()

return sz

    except requests.exceptions.RequestException as e:
        log.warning(f"get_full_image_dimensions: error fetching the image via requests: {e}")
        return None
    except Exception as e:
        log.warning(f"get_full_image_dimensions: unexpected error while opening the image: {e}")
        return None


def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
cur_try = 0
nothing = None if dimension else (None, None)
@@ -143,7 +174,9 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
if dimension and p.image:
return p.image.size
elif dimension:
return nothing
# we did read the image, but it failed to parse for some reason
# try to download it in one go
return get_full_image_dimensions(url)
elif dimension:
# expected an image, but didn't get one
return nothing
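`get_full_image_dimensions` is a second-chance path: `fetch_url` normally streams only enough bytes for PIL's incremental parser to sniff the dimensions, and when that parse fails the new helper downloads the whole file and lets PIL open it from memory. A hedged usage sketch (it assumes `io`, `requests`, and `PIL.Image` are imported at the top of `images.py`, which this hunk does not show, and the URL is a placeholder):

```python
from newspaper.images import get_full_image_dimensions

size = get_full_image_dimensions('https://example.com/media/photo.jpg')
if size is not None:
    width, height = size
    print(f'{width}x{height}')
else:
    print('could not determine dimensions')  # network error or unreadable image
```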
23 changes: 16 additions & 7 deletions newspaper/network.py
@@ -21,16 +21,17 @@
FAIL_ENCODING = 'ISO-8859-1'


def get_request_kwargs(timeout, useragent, proxies, headers):
def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects, verify_ssl_cert):
"""This Wrapper method exists b/c some values in req_kwargs dict
are methods which need to be called every time we make a request
"""
return {
'headers': headers if headers else {'User-Agent': useragent},
'cookies': cj(),
'timeout': timeout,
'allow_redirects': True,
'proxies': proxies
'allow_redirects': allow_redirects,
'proxies': proxies,
'verify': verify_ssl_cert,
}


@@ -44,7 +45,7 @@ def get_html(url, config=None, response=None):
return ''


def get_html_2XX_only(url, config=None, response=None):
def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
"""Consolidated logic for http requests from newspaper. We handle error cases:
- Attempt to find encoding of the html by using HTTP header. Fallback to
'ISO-8859-1' if not provided.
@@ -55,19 +56,27 @@ def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
timeout = config.request_timeout
proxies = config.proxies
headers = config.headers
verify_ssl_cert = config.verify_ssl_cert
allow_redirects = config.allow_redirects

if response is not None:
return _get_html_from_response(response, config)
html = _get_html_from_response(response, config)
if return_final_url:
return html, getattr(response, 'url', url)
return html

response = requests.get(
url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects, verify_ssl_cert))

html = _get_html_from_response(response, config)
final_url = response.url

if config.http_success_only:
# fail if HTTP sends a non 2XX response
response.raise_for_status()

if return_final_url:
return html, final_url
return html


@@ -107,7 +116,7 @@ def __init__(self, url, config=None):
def send(self):
try:
self.resp = requests.get(self.url, **get_request_kwargs(
self.timeout, self.useragent, self.proxies, self.headers))
self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects, self.config.verify_ssl_cert))
if self.config.http_success_only:
self.resp.raise_for_status()
except requests.exceptions.RequestException as e:
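On the network side, `get_html_2XX_only` keeps its old single-value return by default; passing `return_final_url=True` makes it return an `(html, final_url)` tuple, where `final_url` is `response.url` after whatever redirects `allow_redirects` permitted. A small sketch of calling it directly (the URLs are illustrative):

```python
from newspaper import Config, network

config = Config()
config.allow_redirects = True    # follow redirects, so final_url may differ from the input URL

html, final_url = network.get_html_2XX_only(
    'http://example.com/old-link', config, return_final_url=True)

print(final_url)   # e.g. 'https://example.com/new-location' after a 301
print(len(html))
```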
3 changes: 2 additions & 1 deletion newspaper/outputformatters.py
@@ -9,6 +9,7 @@

from html import unescape
import logging
import copy

from .text import innerTrim

@@ -42,7 +43,7 @@ def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
self.top_node = copy.deepcopy(top_node)
html, text = '', ''

self.remove_negativescores_nodes()
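The one-line change to `get_formatted` means the formatter now works on a deep copy, so its destructive cleanup no longer mutates the node the caller passed in (e.g. `article.top_node`). A toy illustration of why that matters, using a plain lxml fragment rather than newspaper's own objects:

```python
import copy
import lxml.html

top_node = lxml.html.fragment_fromstring('<div><p>keep me</p><span>score: -1</span></div>')

working = copy.deepcopy(top_node)        # what get_formatted now does internally
for bad in working.findall('.//span'):
    bad.getparent().remove(bad)          # destructive edits happen on the copy only

print(len(top_node.findall('.//span')))  # 1 -> the caller's node is untouched
print(len(working.findall('.//span')))   # 0
```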
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ cssselect>=0.9.2
feedfinder2>=0.0.4
feedparser>=5.2.1
jieba3k>=0.35.1
lxml>=3.6.0
lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html
nltk>=3.2.1
Pillow>=3.3.0
pythainlp>=1.7.2