Skip to content

Commit

Permalink
update deps and version
Browse files Browse the repository at this point in the history
  • Loading branch information
wwqgtxx committed Nov 28, 2018
1 parent 11422c6 commit b77b023
Show file tree
Hide file tree
Showing 209 changed files with 13,142 additions and 12,303 deletions.
4 changes: 4 additions & 0 deletions .idea/encodings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

87 changes: 71 additions & 16 deletions wwqLyParse/lib/bs4_lib/bs4/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,14 +21,15 @@
# found in the LICENSE file.

__author__ = "Leonard Richardson ([email protected])"
__version__ = "4.6.0"
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
__version__ = "4.6.3"
__copyright__ = "Copyright (c) 2004-2018 Leonard Richardson"
__license__ = "MIT"

__all__ = ['BeautifulSoup']

import os
import re
import sys
import traceback
import warnings

Expand Down Expand Up @@ -82,14 +83,46 @@ class BeautifulSoup(Tag):

ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'

NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"

def __init__(self, markup="", features=None, builder=None,
parse_only=None, from_encoding=None, exclude_encodings=None,
**kwargs):
"""The Soup object is initialized as the 'root tag', and the
provided markup (which can be a string or a file-like object)
is fed into the underlying parser."""
"""Constructor.
:param markup: A string or a file-like object representing
markup to be parsed.
:param features: Desirable features of the parser to be used. This
may be the name of a specific parser ("lxml", "lxml-xml",
"html.parser", or "html5lib") or it may be the type of markup
to be used ("html", "html5", "xml"). It's recommended that you
name a specific parser, so that Beautiful Soup gives you the
same results across platforms and virtual environments.
:param builder: A specific TreeBuilder to use instead of looking one
up based on `features`. You shouldn't need to use this.
:param parse_only: A SoupStrainer. Only parts of the document
matching the SoupStrainer will be considered. This is useful
when parsing part of a document that would otherwise be too
large to fit into memory.
:param from_encoding: A string indicating the encoding of the
document to be parsed. Pass this in if Beautiful Soup is
guessing wrongly about the document's encoding.
:param exclude_encodings: A list of strings indicating
encodings known to be wrong. Pass this in if you don't know
the document's encoding but you know Beautiful Soup's guess is
wrong.
:param kwargs: For backwards compatibility purposes, the
constructor accepts certain keyword arguments used in
Beautiful Soup 3. None of these arguments do anything in
Beautiful Soup 4 and there's no need to actually pass keyword
arguments into the constructor.
"""

if 'convertEntities' in kwargs:
warnings.warn(
Expand Down Expand Up @@ -171,14 +204,35 @@ def deprecated_argument(old_name, new_name):
else:
markup_type = "HTML"

caller = traceback.extract_stack()[0]
filename = caller[0]
line_number = caller[1]
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type))
# This code adapted from warnings.py so that we get the same line
# of code as our warnings.warn() call gets, even if the answer is wrong
# (as it may be in a multithreading situation).
caller = None
try:
caller = sys._getframe(1)
except ValueError:
pass
if caller:
globals = caller.f_globals
line_number = caller.f_lineno
else:
globals = sys.__dict__
line_number= 1
filename = globals.get('__file__')
if filename:
fnl = filename.lower()
if fnl.endswith((".pyc", ".pyo")):
filename = filename[:-1]
if filename:
# If there is no filename at all, the user is most likely in a REPL,
# and the warning is not necessary.
values = dict(
filename=filename,
line_number=line_number,
parser=builder.NAME,
markup_type=markup_type
)
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)

self.builder = builder
self.is_xml = builder.is_xml
Expand Down Expand Up @@ -302,9 +356,10 @@ def reset(self):
self.preserve_whitespace_tag_stack = []
self.pushTag(self)

def new_tag(self, name, namespace=None, nsprefix=None, attrs=None, **kwattrs):
    """Create a new Tag associated with this soup.

    :param name: The name of the new tag.
    :param namespace: The URI of the new tag's XML namespace, if any.
    :param nsprefix: The prefix for the new tag's XML namespace, if any.
    :param attrs: A dictionary of this tag's attribute values; useful
        for attribute names that are reserved words in Python (e.g.
        'class') and therefore cannot be passed as keyword arguments.
    :param kwattrs: Attribute values passed as keyword arguments.
    """
    # Use None instead of a mutable {} default (shared across calls);
    # dict-style attrs are merged over keyword attrs so explicit
    # `attrs` entries win, matching kwattrs.update(attrs) behavior.
    if attrs:
        kwattrs.update(attrs)
    return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)

def new_string(self, s, subclass=NavigableString):
"""Create a new NavigableString associated with this soup."""
Expand Down
16 changes: 11 additions & 5 deletions wwqLyParse/lib/bs4_lib/bs4/builder/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class TreeBuilder(object):
preserve_whitespace_tags = set()
empty_element_tags = None # A tag will be considered an empty-element
# tag when and only when it has no contents.

# A value for these tag/attribute combinations is a space- or
# comma-separated list of CDATA, rather than a single CDATA.
cdata_list_attributes = {}
Expand Down Expand Up @@ -125,7 +125,7 @@ def can_be_empty_element(self, tag_name):
if self.empty_element_tags is None:
return True
return tag_name in self.empty_element_tags

def feed(self, markup):
raise NotImplementedError()

Expand Down Expand Up @@ -235,11 +235,17 @@ class HTMLTreeBuilder(TreeBuilder):
empty_element_tags = set([
# These are from HTML5.
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',

# These are from HTML4, removed in HTML5.
'spacer', 'frame'
# These are from earlier versions of HTML and are removed in HTML5.
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
])

# The HTML standard defines these as block-level elements. Beautiful
# Soup does not treat these elements differently from other elements,
# but it may do so eventually, and this information is available if
# you need to use it.
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])

# The HTML standard defines these attributes as containing a
# space-separated list of values, not a single value. That is,
# class="foo bar" means that the 'class' attribute has two values,
Expand Down
47 changes: 40 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/builder/_htmlparser.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# encoding: utf-8
"""Use the HTMLParser library to parse HTML files that aren't too bad."""

# Use of this source code is governed by a BSD-style license that can be
Expand Down Expand Up @@ -64,7 +65,18 @@ def __init__(self, *args, **kwargs):
# order. It's a list of closing tags we've already handled and
# will ignore, assuming they ever show up.
self.already_closed_empty_element = []


def error(self, msg):
    """Issue a warning for a parse error rather than raising.

    Python 3's HTMLParser requires subclasses to implement error(),
    although that requirement doesn't appear to be documented; the
    Python 2 implementation raised an exception.  This method is only
    invoked on very strange markup, so the best strategy is to pretend
    nothing happened and keep going.
    """
    warnings.warn(msg)

def handle_startendtag(self, name, attrs):
# This is only called when the markup looks like
# <tag/>.
Expand Down Expand Up @@ -129,19 +141,39 @@ def handle_charref(self, name):
else:
real_name = int(name)

try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
data = "\N{REPLACEMENT CHARACTER}"

data = None
if real_name < 256:
# HTML numeric entities are supposed to reference Unicode
# code points, but sometimes they reference code points in
# some other encoding (ahem, Windows-1252). E.g. &#147;
# instead of &#201; for LEFT DOUBLE QUOTATION MARK. This
# code tries to detect this situation and compensate.
for encoding in (self.soup.original_encoding, 'windows-1252'):
if not encoding:
continue
try:
data = bytearray([real_name]).decode(encoding)
except UnicodeDecodeError as e:
pass
if not data:
try:
data = chr(real_name)
except (ValueError, OverflowError) as e:
pass
data = data or "\N{REPLACEMENT CHARACTER}"
self.handle_data(data)

def handle_entityref(self, name):
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
if character is not None:
data = character
else:
data = "&%s;" % name
# If this were XML, it would be ambiguous whether "&foo"
# was a character entity reference with a missing
# semicolon or the literal string "&foo". Since this is
# HTML, we have a complete list of all character entity references,
# and this one wasn't found, so assume it's the literal string "&foo".
data = "&%s" % name
self.handle_data(data)

def handle_comment(self, data):
Expand Down Expand Up @@ -213,6 +245,7 @@ def feed(self, markup):
parser.soup = self.soup
try:
parser.feed(markup)
parser.close()
except HTMLParseError as e:
warnings.warn(RuntimeWarning(
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
Expand Down
18 changes: 11 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/builder/_lxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,13 @@
'LXMLTreeBuilder',
]

try:
from collections.abc import Callable # Python 3.6
except ImportError as e:
from collections import Callable

from io import BytesIO
from io import StringIO
import collections
from lxml import etree
from bs4.element import (
Comment,
Expand Down Expand Up @@ -58,7 +62,7 @@ def parser_for(self, encoding):
# Use the default parser.
parser = self.default_parser(encoding)

if isinstance(parser, collections.Callable):
if isinstance(parser, Callable):
# Instantiate the parser with default arguments
parser = parser(target=self, strip_cdata=False, encoding=encoding)
return parser
Expand Down Expand Up @@ -147,11 +151,11 @@ def start(self, name, attrs, nsmap={}):
attrs = dict(attrs)
nsprefix = None
# Invert each namespace map as it comes in.
if len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
if len(nsmap) == 0 and len(self.nsmaps) > 1:
# There are no new namespaces for this tag, but
# non-default namespaces are in play, so we need a
# separate tag stack to know when they end.
self.nsmaps.append(None)
elif len(nsmap) > 0:
# A new namespace mapping has come into play.
inverted_nsmap = dict((value, key) for key, value in list(nsmap.items()))
Expand Down
6 changes: 3 additions & 3 deletions wwqLyParse/lib/bs4_lib/bs4/dammit.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,9 +46,9 @@ def chardet_dammit(s):
pass

xml_encoding_re = re.compile(
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
html_meta_re = re.compile(
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)

class EntitySubstitution(object):

Expand Down Expand Up @@ -82,7 +82,7 @@ def _populate_class_variables():
}

BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
")")

AMPERSAND_OR_BRACKET = re.compile("([<>&])")
Expand Down
20 changes: 13 additions & 7 deletions wwqLyParse/lib/bs4_lib/bs4/diagnose.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def diagnose(data):
name))

if 'lxml' in basic_parsers:
basic_parsers.append(["lxml", "xml"])
basic_parsers.append("lxml-xml")
try:
from lxml import etree
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
Expand All @@ -56,21 +56,27 @@ def diagnose(data):

if hasattr(data, 'read'):
data = data.read()
elif os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
elif data.startswith("http:") or data.startswith("https:"):
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
return
print()
else:
try:
if os.path.exists(data):
print('"%s" looks like a filename. Reading data from the file.' % data)
with open(data) as fp:
data = fp.read()
except ValueError:
# This can happen on some platforms when the 'filename' is
# too long. Assume it's data and not a filename.
pass
print()

for parser in basic_parsers:
print("Trying to parse your markup with %s" % parser)
success = False
try:
soup = BeautifulSoup(data, parser)
soup = BeautifulSoup(data, features=parser)
success = True
except Exception as e:
print("%s could not parse the markup." % parser)
Expand Down
Loading

0 comments on commit b77b023

Please sign in to comment.