42 changes: 33 additions & 9 deletions newspaper/article.py
@@ -153,6 +153,9 @@ def __init__(self, url, title='', source_url='', config=None, **kwargs):
# A property dict for users to store custom data.
self.additional_data = {}

# The final URL after redirects and meta refresh
self.final_url = None

def build(self):
"""Build a lone article from a URL independent of the source (newspaper).
Don't normally call this method b/c it's good to multithread articles
@@ -173,7 +176,9 @@ def _parse_scheme_file(self, path):

def _parse_scheme_http(self):
try:
return network.get_html_2XX_only(self.url, self.config)
html, final_url = network.get_html_2XX_only(self.url, self.config, return_final_url=True)
self.final_url = final_url
return html
except requests.exceptions.RequestException as e:
self.download_state = ArticleDownloadState.FAILED_RESPONSE
self.download_exception_msg = str(e)
@@ -190,18 +195,27 @@ def download(self, input_html=None, title=None, recursion_counter=0):
parsed_url = urlparse(self.url)
if parsed_url.scheme == "file":
html = self._parse_scheme_file(parsed_url.path)
# For file scheme, the final URL is the same as the initial URL
if self.final_url is None:
self.final_url = self.url
else:
html = self._parse_scheme_http()
# final_url is already set in _parse_scheme_http
if html is None:
log.debug('Download failed on URL %s because of %s' %
(self.url, self.download_exception_msg))
return
else:
html = input_html
# If HTML is provided directly and final_url not set, use the current URL
if self.final_url is None:
self.final_url = self.url

if self.config.follow_meta_refresh:
meta_refresh_url = extract_meta_refresh(html)
if meta_refresh_url and recursion_counter < 1:
# Update final_url to the meta refresh URL
self.final_url = meta_refresh_url
return self.download(
input_html=network.get_html(meta_refresh_url),
recursion_counter=recursion_counter + 1)
@@ -213,19 +227,22 @@ def parse(self):
self.throw_if_not_downloaded_verbose()

self.doc = self.config.get_parser().fromstring(self.html)
self.clean_doc = copy.deepcopy(self.doc)

if self.doc is None:
# `parse` call failed, return nothing
return

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

self.clean_doc = copy.deepcopy(self.doc)
# Before any computations on the body, clean DOM object
self.clean_doc = document_cleaner.clean(self.clean_doc)

# TODO: Fix this, sync in our fix_url() method
parse_candidate = self.get_parse_candidate()
self.link_hash = parse_candidate.link_hash # MD5

document_cleaner = DocumentCleaner(self.config)
output_formatter = OutputFormatter(self.config)

title = self.extractor.get_title(self.clean_doc)
self.set_title(title)

@@ -267,16 +284,23 @@ def parse(self):
self.url,
self.clean_doc)

# Before any computations on the body, clean DOM object
self.doc = document_cleaner.clean(self.doc)

self.top_node = self.extractor.calculate_best_node(self.doc)
if self.top_node is None:
self.top_node = self.extractor.calculate_best_node(self.clean_doc)
if self.top_node is None:
self.top_node = self.extractor.parser.getElementById(self.doc, 'content')
if self.top_node is None:
for tag in ['article', 'main']:
nodes = self.extractor.parser.getElementsByTag(self.doc, tag=tag)
if len(nodes) > 0:
self.top_node = nodes[0]
break
if self.top_node is not None:
video_extractor = VideoExtractor(self.config, self.top_node)
self.set_movies(video_extractor.get_videos())

self.top_node = self.extractor.post_cleanup(self.top_node)
self.clean_top_node = copy.deepcopy(self.top_node)
self.clean_top_node = self.extractor.post_cleanup(self.clean_top_node)

text, article_html = output_formatter.get_formatted(
self.top_node)
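The net effect of the article.py changes is a new `final_url` attribute that records where the download actually ended up, whether that differs from `article.url` because of HTTP redirects or a meta refresh. A minimal usage sketch, assuming this branch is installed (the URL below is only a placeholder):

```python
from newspaper import Article

article = Article('http://example.com/shortened-or-redirecting-link')
article.download()   # follows redirects / meta refresh and records the landing URL
article.parse()

print(article.url)        # the URL the Article was constructed with
print(article.final_url)  # the URL actually fetched (new in this change)
```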
5 changes: 5 additions & 0 deletions newspaper/configuration.py
@@ -57,6 +57,10 @@ def __init__(self):
# Fail for error responses (e.g. 404 page)
self.http_success_only = True

# Allow redirects (enabled by default)
self.allow_redirects = True

self.ignored_images_suffix_list = []
# English is the fallback
self._language = 'en'

@@ -68,6 +72,7 @@ def __init__(self):
self.request_timeout = 7
self.proxies = {}
self.number_threads = 10
self.verify_ssl_cert = True

self.verbose = False # for debugging

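Taken together, the configuration changes expose three new knobs: `allow_redirects`, `verify_ssl_cert`, and `ignored_images_suffix_list`. A hedged sketch of how they could be set (the values and URL are illustrative only):

```python
from newspaper import Article, Config

config = Config()
config.allow_redirects = False     # do not follow HTTP redirects (default: True)
config.verify_ssl_cert = False     # skip TLS certificate verification (default: True)
config.ignored_images_suffix_list = [r'\.svg$', r'1x1\.gif$']  # regex patterns, applied in extractors.py

article = Article('https://example.com/story.html', config=config)
article.download()
```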
27 changes: 22 additions & 5 deletions newspaper/extractors.py
@@ -13,6 +13,7 @@

import copy
import logging
import os.path
import re
from collections import defaultdict
@@ -449,26 +450,34 @@ def get_meta_img_url(self, article_url, doc):
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = None if self.image_is_ignored(try_one) else try_one
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
elems = self.parser.getElementsByTag(doc, use_regex=True, **link_img_src_kwargs)
try_two = elems[0].get('href') if elems else None

try_two = None if self.image_is_ignored(try_two) else try_two
if not try_two:
try_three = self.get_meta_content(doc, 'meta[name="og:image"]')

try_three = None if self.image_is_ignored(try_three) else try_three
if not try_three:
link_icon_kwargs = {'tag': 'link', 'attr': 'rel', 'value': 'icon'}
elems = self.parser.getElementsByTag(doc, **link_icon_kwargs)
try_four = elems[0].get('href') if elems else None
try_four = None if self.image_is_ignored(try_four) else try_four

top_meta_image = try_one or try_two or try_three or try_four

if top_meta_image:
return urljoin(article_url, top_meta_image)
return ''

def image_is_ignored(self, image):
return bool(image) and any(self.match_image(x, os.path.basename(image)) for x in self.config.ignored_images_suffix_list)

def match_image(self, pattern, image):
return re.search(pattern, image) is not None

def get_meta_type(self, doc):
"""Returns meta type of article, open graph protocol
"""
@@ -575,6 +584,7 @@ def get_img_urls(self, article_url, doc):
for img_tag in img_tags if img_tag.get('src')]
img_links = set([urljoin(article_url, url)
for url in urls])
img_links = set([x for x in img_links if not self.image_is_ignored(x)])
return img_links

def get_first_img_url(self, article_url, top_node):
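The new `image_is_ignored` / `match_image` helpers treat each entry of `config.ignored_images_suffix_list` as a regular expression and test it against the basename of the candidate image URL; `get_img_urls` now filters its result set through the same check. A standalone sketch of that matching logic (patterns and URLs are made up for illustration):

```python
import os.path
import re

ignored_images_suffix_list = [r'\.svg$', r'placeholder', r'1x1\.gif$']

def image_is_ignored(image_url):
    """Mirror of the extractor helper: match each pattern against the URL's basename."""
    if not image_url:
        return False
    name = os.path.basename(image_url)
    return any(re.search(pattern, name) for pattern in ignored_images_suffix_list)

print(image_is_ignored('https://cdn.example.com/img/logo.svg'))   # True  (matches '\.svg$')
print(image_is_ignored('https://cdn.example.com/img/photo.jpg'))  # False
print(image_is_ignored(''))                                       # False (empty URLs are never ignored)
```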
@@ -1014,9 +1024,16 @@ def nodes_to_check(self, doc):
on like paragraphs and tables
"""
nodes_to_check = []
for tag in ['p', 'pre', 'td']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
articles = self.parser.getElementsByTag(doc, tag='article')
if len(articles) > 0 and self.get_meta_site_name(doc) == 'Medium':
# Specific heuristic for Medium articles
sections = self.parser.getElementsByTag(articles[0], tag='section')
if len(sections) > 1:
nodes_to_check = sections
if len(nodes_to_check) == 0:
for tag in ['p', 'pre', 'td', 'ol', 'ul']:
items = self.parser.getElementsByTag(doc, tag=tag)
nodes_to_check += items
return nodes_to_check

def is_table_and_no_para_exist(self, e):
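The `nodes_to_check` change adds a Medium-specific fallback: when the page contains an `<article>` with more than one `<section>` and the site name resolves to Medium, the sections themselves are scored instead of the usual `p`/`pre`/`td`/`ol`/`ul` tags. A rough illustration of the selection logic using plain lxml (the HTML is a toy fixture, not a real Medium page):

```python
import lxml.html

html = """
<html><body>
  <article>
    <section><p>Intro paragraph.</p></section>
    <section><p>Main body paragraph.</p></section>
  </article>
</body></html>
"""
doc = lxml.html.fromstring(html)

sections = doc.xpath('//article//section')
if len(sections) > 1:      # Medium-style layout: score the sections themselves
    nodes_to_check = sections
else:                      # otherwise fall back to the usual candidate tags
    nodes_to_check = doc.xpath('//p | //pre | //td | //ol | //ul')

print(len(nodes_to_check))  # 2 -> the two <section> elements would be scored
```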
35 changes: 34 additions & 1 deletion newspaper/images.py
@@ -83,6 +83,37 @@ def clean_url(url):
return url


def get_full_image_dimensions(image_url):
"""Fallback in case PIL can't open the streamed image
"""
try:
response = requests.get(image_url) # No stream=True needed
response.raise_for_status() # Raise an exception for bad status codes

# Use io.BytesIO to treat the response content (bytes) as a file
image_bytes = io.BytesIO(response.content)

# Open the image directly from the bytes stream
img = Image.open(image_bytes)

sz = img.size

# It's good practice to close the image when done
img.close()

return sz

    except requests.exceptions.RequestException as e:
        log.warning(f"get_full_image_dimensions: error fetching the image via requests: {e}")
        return None
    except Exception as e:
        log.warning(f"get_full_image_dimensions: unexpected error while opening the image: {e}")
        return None


def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
cur_try = 0
nothing = None if dimension else (None, None)
@@ -143,7 +174,9 @@ def fetch_url(url, useragent, referer=None, retries=1, dimension=False):
if dimension and p.image:
return p.image.size
elif dimension:
return nothing
# we did read the image, but it failed to parse for some reason
# try to download it in one go
return get_full_image_dimensions(url)
elif dimension:
# expected an image, but didn't get one
return nothing
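`get_full_image_dimensions` is a second-chance path: `fetch_url` normally streams only enough bytes for PIL's incremental parser to sniff the dimensions, and when that parse fails the new helper downloads the whole file and lets PIL open it from memory. A hedged usage sketch (it assumes `io`, `requests`, and `PIL.Image` are imported at the top of `images.py`, which this hunk does not show, and the URL is a placeholder):

```python
from newspaper.images import get_full_image_dimensions

size = get_full_image_dimensions('https://example.com/media/photo.jpg')
if size is not None:
    width, height = size
    print(f'{width}x{height}')
else:
    print('could not determine dimensions')  # network error or unreadable image
```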
23 changes: 16 additions & 7 deletions newspaper/network.py
@@ -21,16 +21,17 @@
FAIL_ENCODING = 'ISO-8859-1'


def get_request_kwargs(timeout, useragent, proxies, headers):
def get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects, verify_ssl_cert):
"""This Wrapper method exists b/c some values in req_kwargs dict
are methods which need to be called every time we make a request
"""
return {
'headers': headers if headers else {'User-Agent': useragent},
'cookies': cj(),
'timeout': timeout,
'allow_redirects': True,
'proxies': proxies
'allow_redirects': allow_redirects,
'proxies': proxies,
'verify': verify_ssl_cert,
}


@@ -44,7 +45,7 @@ def get_html(url, config=None, response=None):
return ''


def get_html_2XX_only(url, config=None, response=None):
def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
"""Consolidated logic for http requests from newspaper. We handle error cases:
- Attempt to find encoding of the html by using HTTP header. Fallback to
'ISO-8859-1' if not provided.
@@ -55,19 +56,27 @@ def get_html_2XX_only(url, config=None, response=None, return_final_url=False):
timeout = config.request_timeout
proxies = config.proxies
headers = config.headers
verify_ssl_cert = config.verify_ssl_cert
allow_redirects = config.allow_redirects

if response is not None:
return _get_html_from_response(response, config)
html = _get_html_from_response(response, config)
if return_final_url:
return html, getattr(response, 'url', url)
return html

response = requests.get(
url=url, **get_request_kwargs(timeout, useragent, proxies, headers))
url=url, **get_request_kwargs(timeout, useragent, proxies, headers, allow_redirects, verify_ssl_cert))

html = _get_html_from_response(response, config)
final_url = response.url

if config.http_success_only:
# fail if HTTP sends a non 2XX response
response.raise_for_status()

if return_final_url:
return html, final_url
return html


@@ -107,7 +116,7 @@ def __init__(self, url, config=None):
def send(self):
try:
self.resp = requests.get(self.url, **get_request_kwargs(
self.timeout, self.useragent, self.proxies, self.headers))
self.timeout, self.useragent, self.proxies, self.headers, self.config.allow_redirects, self.config.verify_ssl_cert))
if self.config.http_success_only:
self.resp.raise_for_status()
except requests.exceptions.RequestException as e:
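On the network side, `get_html_2XX_only` keeps its old single-value return by default; passing `return_final_url=True` makes it return an `(html, final_url)` tuple, where `final_url` is `response.url` after whatever redirects `allow_redirects` permitted. A small sketch of calling it directly (the URLs are illustrative):

```python
from newspaper import Config, network

config = Config()
config.allow_redirects = True    # follow redirects, so final_url may differ from the input URL

html, final_url = network.get_html_2XX_only(
    'http://example.com/old-link', config, return_final_url=True)

print(final_url)   # e.g. 'https://example.com/new-location' after a 301
print(len(html))
```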
3 changes: 2 additions & 1 deletion newspaper/outputformatters.py
@@ -9,6 +9,7 @@

from html import unescape
import logging
import copy

from .text import innerTrim

@@ -42,7 +43,7 @@ def get_formatted(self, top_node):
"""Returns the body text of an article, and also the body article
html if specified. Returns in (text, html) form
"""
self.top_node = top_node
self.top_node = copy.deepcopy(top_node)
html, text = '', ''

self.remove_negativescores_nodes()
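The one-line change to `get_formatted` means the formatter now works on a deep copy, so its destructive cleanup no longer mutates the node the caller passed in (e.g. `article.top_node`). A toy illustration of why that matters, using a plain lxml fragment rather than newspaper's own objects:

```python
import copy
import lxml.html

top_node = lxml.html.fragment_fromstring('<div><p>keep me</p><span>score: -1</span></div>')

working = copy.deepcopy(top_node)        # what get_formatted now does internally
for bad in working.findall('.//span'):
    bad.getparent().remove(bad)          # destructive edits happen on the copy only

print(len(top_node.findall('.//span')))  # 1 -> the caller's node is untouched
print(len(working.findall('.//span')))   # 0
```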
2 changes: 1 addition & 1 deletion requirements.txt
@@ -3,7 +3,7 @@ cssselect>=0.9.2
feedfinder2>=0.0.4
feedparser>=5.2.1
jieba3k>=0.35.1
lxml>=3.6.0
lxml==5.1.0 # https://lxml.de/5.2/changes-5.2.0.html
nltk>=3.2.1
Pillow>=3.3.0
pythainlp>=1.7.2