diff --git a/newspaper/extractors.py b/newspaper/extractors.py index 962554014..dc94e5e1a 100644 --- a/newspaper/extractors.py +++ b/newspaper/extractors.py @@ -14,7 +14,7 @@ import copy import logging import re -import re +import json from collections import defaultdict from dateutil.parser import parse as date_parser @@ -170,13 +170,14 @@ def parse_byline(search_str): # return authors def get_publishing_date(self, url, doc): - """3 strategies for publishing date extraction. The strategies + """4 strategies for publishing date extraction. The strategies are descending in accuracy and the next strategy is only attempted if a preferred one fails. 1. Pubdate from URL - 2. Pubdate from metadata - 3. Raw regex searches in the HTML + added heuristics + 2. Yoast SEO WebPage graph informations + 3. Pubdate from metadata + 4. Raw regex searches in the HTML + added heuristics """ def parse_date_str(date_str): @@ -195,6 +196,19 @@ def parse_date_str(date_str): if datetime_obj: return datetime_obj + yoast_seo = self.parser.getElementsByTag(doc, attr='class', value='yoast-schema-graph') + + if len(yoast_seo) > 0: + yoast_seo_parsed = json.loads(yoast_seo[0].text) + webpage_graph = list( + filter(lambda g: g.get("@type", None) == "WebPage", yoast_seo_parsed.get("@graph", []))) + if len(webpage_graph) > 0: + webpage_graph = webpage_graph[0] + + date_published = webpage_graph.get("datePublished", None) + if date_published is not None: + return parse_date_str(date_published) + PUBLISH_DATE_TAGS = [ {'attribute': 'property', 'value': 'rnews:datePublished', 'content': 'content'}, @@ -448,7 +462,7 @@ def get_meta_img_url(self, article_url, doc): """Returns the 'top img' as specified by the website """ top_meta_image, try_one, try_two, try_three, try_four = [None] * 5 - try_one = self.get_meta_content(doc, 'meta[property="og:image"]') + try_one = self.get_meta_content(doc, 'meta[property="og:image"]') or self.get_meta_content(doc, 'meta[name="twitter:image"]') if not try_one: link_img_src_kwargs = \ {'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}