Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 19 additions & 5 deletions newspaper/extractors.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
import copy
import logging
import re
import re
import json
from collections import defaultdict

from dateutil.parser import parse as date_parser
Expand Down Expand Up @@ -170,13 +170,14 @@ def parse_byline(search_str):
# return authors

def get_publishing_date(self, url, doc):
"""3 strategies for publishing date extraction. The strategies
"""4 strategies for publishing date extraction. The strategies
are descending in accuracy and the next strategy is only
attempted if a preferred one fails.

1. Pubdate from URL
2. Pubdate from metadata
3. Raw regex searches in the HTML + added heuristics
2. Yoast SEO WebPage graph information
3. Pubdate from metadata
4. Raw regex searches in the HTML + added heuristics
"""

def parse_date_str(date_str):
Expand All @@ -195,6 +196,19 @@ def parse_date_str(date_str):
if datetime_obj:
return datetime_obj

yoast_seo = self.parser.getElementsByTag(doc, attr='class', value='yoast-schema-graph')

if len(yoast_seo) > 0:
yoast_seo_parsed = json.loads(yoast_seo[0].text)
webpage_graph = list(
filter(lambda g: g.get("@type", None) == "WebPage", yoast_seo_parsed.get("@graph", [])))
if len(webpage_graph) > 0:
webpage_graph = webpage_graph[0]

date_published = webpage_graph.get("datePublished", None)
if date_published is not None:
return parse_date_str(date_published)

PUBLISH_DATE_TAGS = [
{'attribute': 'property', 'value': 'rnews:datePublished',
'content': 'content'},
Expand Down Expand Up @@ -448,7 +462,7 @@ def get_meta_img_url(self, article_url, doc):
"""Returns the 'top img' as specified by the website
"""
top_meta_image, try_one, try_two, try_three, try_four = [None] * 5
try_one = self.get_meta_content(doc, 'meta[property="og:image"]')
try_one = self.get_meta_content(doc, 'meta[property="og:image"]') or self.get_meta_content(doc, 'meta[name="twitter:image"]')
if not try_one:
link_img_src_kwargs = \
{'tag': 'link', 'attr': 'rel', 'value': 'img_src|image_src'}
Expand Down