Skip to content

Commit

Permalink
update deps and version
Browse files Browse the repository at this point in the history
  • Loading branch information
wwqgtxx committed Nov 28, 2018
1 parent 11422c6 commit b77b023
Show file tree
Hide file tree
Showing 209 changed files with 13,142 additions and 12,303 deletions.
4 changes: 4 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 71 additions & 16 deletions wwqLyParse/lib/bs4_lib/bs4/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
# found in the LICENSE file.

__author__ = "Leonard Richardson ([email protected])"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__version__ = "4.6.3"
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import sys
import traceback
import warnings

Expand Down Expand Up @@ -82,14 +83,46 @@ class BeautifulSoup(Tag):

ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
"""Constructor.
:param markup: A string or a file-like object representing
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A specific TreeBuilder to use instead of looking one
up based on `features`. You shouldn't need to use this.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4 and there's no need to actually pass keyword
arguments into the constructor.
"""

if 'convertEntities' in kwargs:
warnings.warn(
Expand Down Expand Up @@ -171,14 +204,35 @@ def deprecated_argument(old_name, new_name):
else:
markup_type = "HTML"

caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
# This code adapted from warnings.py so that we get the same line
# of code as our warnings.warn() call gets, even if the answer is wrong
# (as it may be in a multithreading situation).
caller = None
try:
caller = sys._getframe(1)
except ValueError:
pass
if caller:
globals = caller.f_globals
line_number = caller.f_lineno
else:
globals = sys.__dict__
line_number= 1
filename = globals.get('__file__')
if filename:
fnl = filename.lower()
if fnl.endswith((".pyc", ".pyo")):
filename = filename[:-1]
if filename:
# If there is no filename at all, the user is most likely in a REPL,
# and the warning is not necessary.
values = dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)

self.builder = builder
self.is_xml = builder.is_xml
Expand Down Expand Up @@ -302,9 +356,10 @@ def reset(self):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)

def new_tag(self, name, namespace=None, nsprefix=None, attrs=None, **kwattrs):
    """Create a new Tag associated with this soup.

    :param name: The name of the new tag.
    :param namespace: The URI of the new tag's XML namespace, if any.
    :param nsprefix: The prefix for the new tag's XML namespace, if any.
    :param attrs: A dictionary of this tag's attribute values; useful
        for attribute names that are reserved words in Python (e.g.
        'class') and therefore cannot be passed as keyword arguments.
    :param kwattrs: Attribute values passed as keyword arguments.
    """
    # Use None instead of a mutable {} default (shared across calls);
    # dict-style attrs are merged over keyword attrs so explicit
    # `attrs` entries win, matching kwattrs.update(attrs) behavior.
    if attrs:
        kwattrs.update(attrs)
    return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)

def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
Expand Down
16 changes: 11 additions & 5 deletions wwqLyParse/lib/bs4_lib/bs4/builder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.

# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
Expand Down Expand Up @@ -125,7 +125,7 @@ def can_be_empty_element(self, tag_name):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags

def feed(self, markup):
raise NotImplementedError()

Expand Down Expand Up @@ -235,11 +235,17 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

# These are from HTML4, removed in HTML5.
'spacer', 'frame'
# These are from earlier versions of HTML and are removed in HTML5.
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])

# The HTML standard defines these as block-level elements. Beautiful
# Soup does not treat these elements differently from other elements,
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])

# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
Expand Down
47 changes: 40 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/builder/_htmlparser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by a BSD-style license that can be
Expand Down Expand Up @@ -64,7 +65,18 @@ def __init__(self, *args, **kwargs):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []


def error(self, msg):
    """Issue a warning for a parse error rather than raising.

    Python 3's HTMLParser requires subclasses to implement error(),
    although that requirement doesn't appear to be documented; the
    Python 2 implementation raised an exception.  This method is only
    invoked on very strange markup, so the best strategy is to pretend
    nothing happened and keep going.
    """
    warnings.warn(msg)

def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
Expand Down Expand Up @@ -129,19 +141,39 @@ def handle_charref(self, name):
else:
real_name = int(name)

try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
data = "\N{REPLACEMENT CHARACTER}"

data = None
if real_name < 256:
# HTML numeric entities are supposed to reference Unicode
# code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError as e:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)

def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
# If this were XML, it would be ambiguous whether "&foo"
# was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data)

def handle_comment(self, data):
Expand Down Expand Up @@ -213,6 +245,7 @@ def feed(self, markup):
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
Expand Down
18 changes: 11 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/builder/_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
'LXMLTreeBuilder',
]

try:
from collections.abc import Callable # Python 3.6
except ImportError as e:
from collections import Callable

from io import BytesIO
from io import StringIO
import collections
from lxml import etree
from bs4.element import (
Comment,
Expand Down Expand Up @@ -58,7 +62,7 @@ def parser_for(self, encoding):
# Use the default parser.
parser = self.default_parser(encoding)

if isinstance(parser, collections.Callable):
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
Expand Down Expand Up @@ -147,11 +151,11 @@ def start(self, name, attrs, nsmap={}):
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
Expand Down
6 changes: 3 additions & 3 deletions wwqLyParse/lib/bs4_lib/bs4/dammit.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def chardet_dammit(s):
pass

xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

class EntitySubstitution(object):

Expand Down Expand Up @@ -82,7 +82,7 @@ def _populate_class_variables():
}

BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")

AMPERSAND_OR_BRACKET = re.compile("([<>&])")
Expand Down
20 changes: 13 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/diagnose.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def diagnose(data):
name))

if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
Expand All @@ -56,21 +56,27 @@ def diagnose(data):

if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
print()
else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()

for parser in basic_parsers:
print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
print("%s could not parse the markup." % parser)
Expand Down
Loading

0 comments on commit b77b023

Please sign in to comment.