mirror of
https://github.com/morpheus65535/bazarr.git
synced 2026-02-19 07:05:42 -05:00
WIP
This commit is contained in:
@@ -5,7 +5,6 @@ import os
|
||||
from simpleconfigparser import simpleconfigparser
|
||||
|
||||
from get_args import args
|
||||
from six import PY3
|
||||
|
||||
defaults = {
|
||||
'general': {
|
||||
@@ -140,10 +139,7 @@ defaults = {
|
||||
}
|
||||
}
|
||||
|
||||
if PY3:
|
||||
settings = simpleconfigparser(defaults=defaults, interpolation=None)
|
||||
else:
|
||||
settings = simpleconfigparser(defaults=defaults)
|
||||
settings = simpleconfigparser(defaults=defaults, interpolation=None)
|
||||
settings.read(os.path.join(args.config_dir, 'config', 'config.ini'))
|
||||
|
||||
base_url = settings.general.base_url
|
||||
|
||||
@@ -1126,28 +1126,17 @@ def postprocessing(command, path):
|
||||
try:
|
||||
encoding = getpreferredencoding()
|
||||
if os.name == 'nt':
|
||||
if six.PY3:
|
||||
codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE, encoding=getpreferredencoding())
|
||||
else:
|
||||
codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE, encoding=getpreferredencoding())
|
||||
# wait for the process to terminate
|
||||
out_codepage, err_codepage = codepage.communicate()
|
||||
encoding = out_codepage.split(':')[-1].strip()
|
||||
|
||||
if six.PY3:
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE, encoding=encoding)
|
||||
else:
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE)
|
||||
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE, encoding=encoding)
|
||||
# wait for the process to terminate
|
||||
out, err = process.communicate()
|
||||
|
||||
if six.PY2:
|
||||
out = out.decode(encoding)
|
||||
|
||||
out = out.replace('\n', ' ').replace('\r', ' ')
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -18,11 +18,6 @@ def clean_libs():
|
||||
|
||||
def set_libs():
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs/'))
|
||||
from six import PY3
|
||||
if PY3:
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs3/'))
|
||||
else:
|
||||
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs2/'))
|
||||
|
||||
|
||||
clean_libs()
|
||||
|
||||
@@ -379,17 +379,11 @@ def guess_external_subtitles(dest_folder, subtitles):
|
||||
continue
|
||||
detected_language = None
|
||||
|
||||
if six.PY3:
|
||||
with open(subtitle_path, 'r', errors='ignore') as f:
|
||||
text = f.read()
|
||||
else:
|
||||
with open(subtitle_path, 'r') as f:
|
||||
text = f.read()
|
||||
with open(subtitle_path, 'r', errors='ignore') as f:
|
||||
text = f.read()
|
||||
|
||||
try:
|
||||
encoding = UnicodeDammit(text)
|
||||
if six.PY2:
|
||||
text = text.decode(encoding.original_encoding)
|
||||
detected_language = langdetect.detect(text)
|
||||
except Exception as e:
|
||||
logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
|
||||
|
||||
@@ -42,8 +42,7 @@ class NoExceptionFormatter(logging.Formatter):
|
||||
|
||||
|
||||
def configure_logging(debug=False):
|
||||
if six.PY3:
|
||||
warnings.simplefilter('ignore', category=ResourceWarning)
|
||||
warnings.simplefilter('ignore', category=ResourceWarning)
|
||||
|
||||
if not debug:
|
||||
log_level = "INFO"
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
# coding=utf-8
|
||||
|
||||
bazarr_version = '0.8.4.1'
|
||||
bazarr_version = '0.9'
|
||||
|
||||
import os
|
||||
os.environ["SZ_USER_AGENT"] = "Bazarr/1"
|
||||
@@ -41,7 +41,7 @@ from notifier import update_notifier
|
||||
from cherrypy.wsgiserver import CherryPyWSGIServer
|
||||
|
||||
from io import BytesIO
|
||||
from six import text_type, PY2
|
||||
from six import text_type
|
||||
from datetime import timedelta
|
||||
from get_languages import load_language_in_db, language_from_alpha3, language_from_alpha2, alpha2_from_alpha3
|
||||
from flask import make_response, request, redirect, abort, render_template, Response, session, flash, url_for, \
|
||||
@@ -1647,8 +1647,7 @@ def movie_history(no):
|
||||
|
||||
# Mute DeprecationWarning
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
if six.PY3:
|
||||
warnings.simplefilter("ignore", BrokenPipeError)
|
||||
warnings.simplefilter("ignore", BrokenPipeError)
|
||||
if args.dev:
|
||||
server = app.run(
|
||||
host=str(settings.general.ip), port=(int(args.port) if args.port else int(settings.general.port)))
|
||||
|
||||
@@ -1,43 +0,0 @@
|
||||
Behold, mortal, the origins of Beautiful Soup...
|
||||
================================================
|
||||
|
||||
Leonard Richardson is the primary programmer.
|
||||
|
||||
Aaron DeVore is awesome.
|
||||
|
||||
Mark Pilgrim provided the encoding detection code that forms the base
|
||||
of UnicodeDammit.
|
||||
|
||||
Thomas Kluyver and Ezio Melotti finished the work of getting Beautiful
|
||||
Soup 4 working under Python 3.
|
||||
|
||||
Simon Willison wrote soupselect, which was used to make Beautiful Soup
|
||||
support CSS selectors.
|
||||
|
||||
Sam Ruby helped with a lot of edge cases.
|
||||
|
||||
Jonathan Ellis was awarded the prestigous Beau Potage D'Or for his
|
||||
work in solving the nestable tags conundrum.
|
||||
|
||||
An incomplete list of people have contributed patches to Beautiful
|
||||
Soup:
|
||||
|
||||
Istvan Albert, Andrew Lin, Anthony Baxter, Andrew Boyko, Tony Chang,
|
||||
Zephyr Fang, Fuzzy, Roman Gaufman, Yoni Gilad, Richie Hindle, Peteris
|
||||
Krumins, Kent Johnson, Ben Last, Robert Leftwich, Staffan Malmgren,
|
||||
Ksenia Marasanova, JP Moins, Adam Monsen, John Nagle, "Jon", Ed
|
||||
Oskiewicz, Greg Phillips, Giles Radford, Arthur Rudolph, Marko
|
||||
Samastur, Jouni Sepp<70>nen, Alexander Schmolck, Andy Theyers, Glyn
|
||||
Webster, Paul Wright, Danny Yoo
|
||||
|
||||
An incomplete list of people who made suggestions or found bugs or
|
||||
found ways to break Beautiful Soup:
|
||||
|
||||
Hanno B<>ck, Matteo Bertini, Chris Curvey, Simon Cusack, Bruce Eckel,
|
||||
Matt Ernst, Michael Foord, Tom Harris, Bill de hOra, Donald Howes,
|
||||
Matt Patterson, Scott Roberts, Steve Strassmann, Mike Williams,
|
||||
warchild at redho dot com, Sami Kuisma, Carlos Rocha, Bob Hutchison,
|
||||
Joren Mc, Michal Migurski, John Kleven, Tim Heaney, Tripp Lilley, Ed
|
||||
Summers, Dennis Sutch, Chris Smith, Aaron Sweep^W Swartz, Stuart
|
||||
Turner, Greg Edwards, Kevin J Kalupson, Nikos Kouremenos, Artur de
|
||||
Sousa Rocha, Yichun Wei, Per Vognsen
|
||||
@@ -1,27 +0,0 @@
|
||||
Beautiful Soup is made available under the MIT license:
|
||||
|
||||
Copyright (c) 2004-2015 Leonard Richardson
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining
|
||||
a copy of this software and associated documentation files (the
|
||||
"Software"), to deal in the Software without restriction, including
|
||||
without limitation the rights to use, copy, modify, merge, publish,
|
||||
distribute, sublicense, and/or sell copies of the Software, and to
|
||||
permit persons to whom the Software is furnished to do so, subject to
|
||||
the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be
|
||||
included in all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
|
||||
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
|
||||
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
||||
SOFTWARE.
|
||||
|
||||
Beautiful Soup incorporates code from the html5lib library, which is
|
||||
also made available under the MIT license. Copyright (c) 2006-2013
|
||||
James Graham and other contributors
|
||||
1190
libs2/bs4/NEWS.txt
1190
libs2/bs4/NEWS.txt
File diff suppressed because it is too large
Load Diff
@@ -1,63 +0,0 @@
|
||||
= Introduction =
|
||||
|
||||
>>> from bs4 import BeautifulSoup
|
||||
>>> soup = BeautifulSoup("<p>Some<b>bad<i>HTML")
|
||||
>>> print soup.prettify()
|
||||
<html>
|
||||
<body>
|
||||
<p>
|
||||
Some
|
||||
<b>
|
||||
bad
|
||||
<i>
|
||||
HTML
|
||||
</i>
|
||||
</b>
|
||||
</p>
|
||||
</body>
|
||||
</html>
|
||||
>>> soup.find(text="bad")
|
||||
u'bad'
|
||||
|
||||
>>> soup.i
|
||||
<i>HTML</i>
|
||||
|
||||
>>> soup = BeautifulSoup("<tag1>Some<tag2/>bad<tag3>XML", "xml")
|
||||
>>> print soup.prettify()
|
||||
<?xml version="1.0" encoding="utf-8">
|
||||
<tag1>
|
||||
Some
|
||||
<tag2 />
|
||||
bad
|
||||
<tag3>
|
||||
XML
|
||||
</tag3>
|
||||
</tag1>
|
||||
|
||||
= Full documentation =
|
||||
|
||||
The bs4/doc/ directory contains full documentation in Sphinx
|
||||
format. Run "make html" in that directory to create HTML
|
||||
documentation.
|
||||
|
||||
= Running the unit tests =
|
||||
|
||||
Beautiful Soup supports unit test discovery from the project root directory:
|
||||
|
||||
$ nosetests
|
||||
|
||||
$ python -m unittest discover -s bs4 # Python 2.7 and up
|
||||
|
||||
If you checked out the source tree, you should see a script in the
|
||||
home directory called test-all-versions. This script will run the unit
|
||||
tests under Python 2.7, then create a temporary Python 3 conversion of
|
||||
the source and run the unit tests again under Python 3.
|
||||
|
||||
= Links =
|
||||
|
||||
Homepage: http://www.crummy.com/software/BeautifulSoup/bs4/
|
||||
Documentation: http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
http://readthedocs.org/docs/beautiful-soup-4/
|
||||
Discussion group: http://groups.google.com/group/beautifulsoup/
|
||||
Development: https://code.launchpad.net/beautifulsoup/
|
||||
Bug tracker: https://bugs.launchpad.net/beautifulsoup/
|
||||
@@ -1,31 +0,0 @@
|
||||
Additions
|
||||
---------
|
||||
|
||||
More of the jQuery API: nextUntil?
|
||||
|
||||
Optimizations
|
||||
-------------
|
||||
|
||||
The html5lib tree builder doesn't use the standard tree-building API,
|
||||
which worries me and has resulted in a number of bugs.
|
||||
|
||||
markup_attr_map can be optimized since it's always a map now.
|
||||
|
||||
Upon encountering UTF-16LE data or some other uncommon serialization
|
||||
of Unicode, UnicodeDammit will convert the data to Unicode, then
|
||||
encode it at UTF-8. This is wasteful because it will just get decoded
|
||||
back to Unicode.
|
||||
|
||||
CDATA
|
||||
-----
|
||||
|
||||
The elementtree XMLParser has a strip_cdata argument that, when set to
|
||||
False, should allow Beautiful Soup to preserve CDATA sections instead
|
||||
of treating them as text. Except it doesn't. (This argument is also
|
||||
present for HTMLParser, and also does nothing there.)
|
||||
|
||||
Currently, htm5lib converts CDATA sections into comments. An
|
||||
as-yet-unreleased version of html5lib changes the parser's handling of
|
||||
CDATA sections to allow CDATA sections in tags like <svg> and
|
||||
<math>. The HTML5TreeBuilder will need to be updated to create CData
|
||||
objects instead of Comment objects in this situation.
|
||||
@@ -1,529 +0,0 @@
|
||||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||
provides methods and Pythonic idioms that make it easy to navigate,
|
||||
search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
|
||||
"""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.6.0"
|
||||
__copyright__ = "Copyright (c) 2004-2017 Leonard Richardson"
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
import os
|
||||
import re
|
||||
import traceback
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry, ParserRejectedMarkup
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'<>'You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
This class defines the basic interface called by the tree builders.
|
||||
|
||||
These methods will be called by the parser:
|
||||
reset()
|
||||
feed(markup)
|
||||
|
||||
The tree builder may call these methods from its feed() implementation:
|
||||
handle_starttag(name, attrs) # See note about return value
|
||||
handle_endtag(name)
|
||||
handle_data(data) # Appends to the current data node
|
||||
endData(containerClass=NavigableString) # Ends the current data node
|
||||
|
||||
No matter how complicated the underlying parser is, you should be
|
||||
able to build a tree using 'start tag' events, 'end tag' events,
|
||||
'data' events, and "done with data" events.
|
||||
|
||||
If you encounter an empty-element tag (aka a self-closing tag,
|
||||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
ROOT_TAG_NAME = u'[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, change code that looks like this:\n\n BeautifulSoup(YOUR_MARKUP})\n\nto this:\n\n BeautifulSoup(YOUR_MARKUP, \"%(parser)s\")\n"
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||
**kwargs):
|
||||
"""The Soup object is initialized as the 'root tag', and the
|
||||
provided markup (which can be a string or a file-like object)
|
||||
is fed into the underlying parser."""
|
||||
|
||||
if 'convertEntities' in kwargs:
|
||||
warnings.warn(
|
||||
"BS4 does not respect the convertEntities argument to the "
|
||||
"BeautifulSoup constructor. Entities are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'markupMassage' in kwargs:
|
||||
del kwargs['markupMassage']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the markupMassage argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for any necessary markup massage.")
|
||||
|
||||
if 'smartQuotesTo' in kwargs:
|
||||
del kwargs['smartQuotesTo']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the smartQuotesTo argument to the "
|
||||
"BeautifulSoup constructor. Smart quotes are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'selfClosingTags' in kwargs:
|
||||
del kwargs['selfClosingTags']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the selfClosingTags argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for understanding self-closing tags.")
|
||||
|
||||
if 'isHTML' in kwargs:
|
||||
del kwargs['isHTML']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the isHTML argument to the "
|
||||
"BeautifulSoup constructor. Suggest you use "
|
||||
"features='lxml' for HTML and features='lxml-xml' for "
|
||||
"XML.")
|
||||
|
||||
def deprecated_argument(old_name, new_name):
|
||||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
"parseOnlyThese", "parse_only")
|
||||
|
||||
from_encoding = from_encoding or deprecated_argument(
|
||||
"fromEncoding", "from_encoding")
|
||||
|
||||
if from_encoding and isinstance(markup, unicode):
|
||||
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||
from_encoding = None
|
||||
|
||||
if len(kwargs) > 0:
|
||||
arg = kwargs.keys().pop()
|
||||
raise TypeError(
|
||||
"__init__() got an unexpected keyword argument '%s'" % arg)
|
||||
|
||||
if builder is None:
|
||||
original_features = features
|
||||
if isinstance(features, basestring):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
features = self.DEFAULT_BUILDER_FEATURES
|
||||
builder_class = builder_registry.lookup(*features)
|
||||
if builder_class is None:
|
||||
raise FeatureNotFound(
|
||||
"Couldn't find a tree builder with the features you "
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
builder = builder_class()
|
||||
if not (original_features == builder.NAME or
|
||||
original_features in builder.ALTERNATE_NAMES):
|
||||
if builder.is_xml:
|
||||
markup_type = "XML"
|
||||
else:
|
||||
markup_type = "HTML"
|
||||
|
||||
caller = traceback.extract_stack()[0]
|
||||
filename = caller[0]
|
||||
line_number = caller[1]
|
||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % dict(
|
||||
filename=filename,
|
||||
line_number=line_number,
|
||||
parser=builder.NAME,
|
||||
markup_type=markup_type))
|
||||
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.known_xml = self.is_xml
|
||||
self.builder.soup = self
|
||||
|
||||
self.parse_only = parse_only
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
elif len(markup) <= 256 and (
|
||||
(isinstance(markup, bytes) and not b'<' in markup)
|
||||
or (isinstance(markup, unicode) and not u'<' in markup)
|
||||
):
|
||||
# Print out warnings for a couple beginner problems
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, unicode)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
possible_filename = markup
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception, e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
if isinstance(markup, unicode):
|
||||
markup = markup.encode("utf8")
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should'
|
||||
' probably open this file and pass the filehandle into'
|
||||
' Beautiful Soup.' % markup)
|
||||
self._check_markup_is_url(markup)
|
||||
|
||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) in (
|
||||
self.builder.prepare_markup(
|
||||
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||
self.reset()
|
||||
try:
|
||||
self._feed()
|
||||
break
|
||||
except ParserRejectedMarkup:
|
||||
pass
|
||||
|
||||
# Clear out the markup and remove the builder's circular
|
||||
# reference to this object.
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def __copy__(self):
|
||||
copy = type(self)(
|
||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||
)
|
||||
|
||||
# Although we encoded the tree to UTF-8, that may not have
|
||||
# been the encoding of the original markup. Set the copy's
|
||||
# .original_encoding to reflect the original object's
|
||||
# .original_encoding.
|
||||
copy.original_encoding = self.original_encoding
|
||||
return copy
|
||||
|
||||
def __getstate__(self):
|
||||
# Frequently a tree builder can't be pickled.
|
||||
d = dict(self.__dict__)
|
||||
if 'builder' in d and not self.builder.picklable:
|
||||
d['builder'] = None
|
||||
return d
|
||||
|
||||
@staticmethod
|
||||
def _check_markup_is_url(markup):
|
||||
"""
|
||||
Check if markup looks like it's actually a url and raise a warning
|
||||
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
space = b' '
|
||||
cant_start_with = (b"http:", b"https:")
|
||||
elif isinstance(markup, unicode):
|
||||
space = u' '
|
||||
cant_start_with = (u"http:", u"https:")
|
||||
else:
|
||||
return
|
||||
|
||||
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||
if not space in markup:
|
||||
if isinstance(markup, bytes):
|
||||
decoded_markup = markup.decode('utf-8', 'replace')
|
||||
else:
|
||||
decoded_markup = markup
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||
' HTTP client. You should probably use an HTTP client like'
|
||||
' requests to get the document behind the URL, and feed'
|
||||
' that document to Beautiful Soup.' % decoded_markup
|
||||
)
|
||||
|
||||
def _feed(self):
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
|
||||
self.builder.feed(self.markup)
|
||||
# Close out any unfinished strings and close all the open tags.
|
||||
self.endData()
|
||||
while self.currentTag.name != self.ROOT_TAG_NAME:
|
||||
self.popTag()
|
||||
|
||||
def reset(self):
|
||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||
self.hidden = 1
|
||||
self.builder.reset()
|
||||
self.current_data = []
|
||||
self.currentTag = None
|
||||
self.tagStack = []
|
||||
self.preserve_whitespace_tag_stack = []
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, **attrs):
|
||||
"""Create a new tag associated with this soup."""
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, attrs)
|
||||
|
||||
def new_string(self, s, subclass=NavigableString):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
return subclass(s)
|
||||
|
||||
def insert_before(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||
|
||||
def insert_after(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||
|
||||
def popTag(self):
|
||||
tag = self.tagStack.pop()
|
||||
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
||||
self.preserve_whitespace_tag_stack.pop()
|
||||
#print "Pop", tag.name
|
||||
if self.tagStack:
|
||||
self.currentTag = self.tagStack[-1]
|
||||
return self.currentTag
|
||||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
if tag.name in self.builder.preserve_whitespace_tags:
|
||||
self.preserve_whitespace_tag_stack.append(tag)
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
if self.current_data:
|
||||
current_data = u''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
# nothing but ASCII spaces, replace it with a single space
|
||||
# or newline.
|
||||
if not self.preserve_whitespace_tag_stack:
|
||||
strippable = True
|
||||
for i in current_data:
|
||||
if i not in self.ASCII_SPACES:
|
||||
strippable = False
|
||||
break
|
||||
if strippable:
|
||||
if '\n' in current_data:
|
||||
current_data = '\n'
|
||||
else:
|
||||
current_data = ' '
|
||||
|
||||
# Reset the data collector.
|
||||
self.current_data = []
|
||||
|
||||
# Should we add this string to the tree at all?
|
||||
if self.parse_only and len(self.tagStack) <= 1 and \
|
||||
(not self.parse_only.text or \
|
||||
not self.parse_only.search(current_data)):
|
||||
return
|
||||
|
||||
o = containerClass(current_data)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
parent = parent or self.currentTag
|
||||
previous_element = most_recent_element or self._most_recent_element
|
||||
|
||||
next_element = previous_sibling = next_sibling = None
|
||||
if isinstance(o, Tag):
|
||||
next_element = o.next_element
|
||||
next_sibling = o.next_sibling
|
||||
previous_sibling = o.previous_sibling
|
||||
if not previous_element:
|
||||
previous_element = o.previous_element
|
||||
|
||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
if parent.next_sibling:
|
||||
# This node is being inserted into an element that has
|
||||
# already been parsed. Deal with any dangling references.
|
||||
index = len(parent.contents)-1
|
||||
while index >= 0:
|
||||
if parent.contents[index] is o:
|
||||
break
|
||||
index -= 1
|
||||
else:
|
||||
raise ValueError(
|
||||
"Error building tree: supposedly %r was inserted "
|
||||
"into %r after the fact, but I don't see it!" % (
|
||||
o, parent
|
||||
)
|
||||
)
|
||||
if index == 0:
|
||||
previous_element = parent
|
||||
previous_sibling = None
|
||||
else:
|
||||
previous_element = previous_sibling = parent.contents[index-1]
|
||||
if index == len(parent.contents)-1:
|
||||
next_element = parent.next_sibling
|
||||
next_sibling = None
|
||||
else:
|
||||
next_element = next_sibling = parent.contents[index+1]
|
||||
|
||||
o.previous_element = previous_element
|
||||
if previous_element:
|
||||
previous_element.next_element = o
|
||||
o.next_element = next_element
|
||||
if next_element:
|
||||
next_element.previous_element = o
|
||||
o.next_sibling = next_sibling
|
||||
if next_sibling:
|
||||
next_sibling.previous_sibling = o
|
||||
o.previous_sibling = previous_sibling
|
||||
if previous_sibling:
|
||||
previous_sibling.next_sibling = o
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
stack up to but *not* including the most recent instqance of
|
||||
the given tag."""
|
||||
#print "Popping to %s" % name
|
||||
if name == self.ROOT_TAG_NAME:
|
||||
# The BeautifulSoup object itself can never be popped.
|
||||
return
|
||||
|
||||
most_recently_popped = None
|
||||
|
||||
stack_size = len(self.tagStack)
|
||||
for i in range(stack_size - 1, 0, -1):
|
||||
t = self.tagStack[i]
|
||||
if (name == t.name and nsprefix == t.prefix):
|
||||
if inclusivePop:
|
||||
most_recently_popped = self.popTag()
|
||||
break
|
||||
most_recently_popped = self.popTag()
|
||||
|
||||
return most_recently_popped
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||
"""Push a start tag on to the stack.
|
||||
|
||||
If this method returns None, the tag was rejected by the
|
||||
SoupStrainer. You should proceed as if the tag had not occurred
|
||||
in the document. For instance, if this was a self-closing tag,
|
||||
don't call handle_endtag.
|
||||
"""
|
||||
|
||||
# print "Start tag %s: %s" % (name, attrs)
|
||||
self.endData()
|
||||
|
||||
if (self.parse_only and len(self.tagStack) <= 1
|
||||
and (self.parse_only.text
|
||||
or not self.parse_only.search_tag(name, attrs))):
|
||||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self._most_recent_element:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
def handle_endtag(self, name, nsprefix=None):
|
||||
#print "End tag: " + name
|
||||
self.endData()
|
||||
self._popToTag(name, nsprefix)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.current_data.append(data)
|
||||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Returns a string or Unicode representation of this document.
|
||||
To get Unicode, pass None for encoding."""
|
||||
|
||||
if self.is_xml:
|
||||
# Print the XML declaration
|
||||
encoding_part = ''
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = u'<?xml version="1.0"%s?>\n' % encoding_part
|
||||
else:
|
||||
prefix = u''
|
||||
if not pretty_print:
|
||||
indent_level = None
|
||||
else:
|
||||
indent_level = 0
|
||||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
_soup = BeautifulSoup
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
|
||||
"""Deprecated interface to an XML parser."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
pass
|
||||
|
||||
class FeatureNotFound(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print soup.prettify()
|
||||
@@ -1,333 +0,0 @@
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
HTMLAwareEntitySubstitution,
|
||||
whitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
|
||||
def __init__(self):
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class):
|
||||
"""Register a treebuilder based on its advertised features."""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features):
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
features = list(features)
|
||||
features.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(features) > 0:
|
||||
feature = features.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
else:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(
|
||||
set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
|
||||
# to look up builders in this registry.
|
||||
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
NAME = "[Unknown tree builder]"
|
||||
ALTERNATE_NAMES = []
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
picklable = False
|
||||
preserve_whitespace_tags = set()
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
cdata_list_attributes = {}
|
||||
|
||||
|
||||
def __init__(self):
|
||||
self.soup = None
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name):
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p />".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no contents.
|
||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
return markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of tests.
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
Modifies its input in place.
|
||||
"""
|
||||
if not attrs:
|
||||
return attrs
|
||||
if self.cdata_list_attributes:
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), None)
|
||||
for attr in attrs.keys():
|
||||
if attr in universal or (tag_specific and attr in tag_specific):
|
||||
# We have a "class"-type attribute whose string
|
||||
# value is a whitespace-separated list of
|
||||
# values. Split it into a list.
|
||||
value = attrs[attr]
|
||||
if isinstance(value, basestring):
|
||||
values = whitespace_re.split(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
# for the same tag when rearranging the parse
|
||||
# tree. On the second call the attribute value
|
||||
# here is already a list. If this happens,
|
||||
# leave the value alone rather than trying to
|
||||
# split it again.
|
||||
values = value
|
||||
attrs[attr] = values
|
||||
return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||
#print "Start %s, %r" % (name, attrs)
|
||||
self.soup.handle_starttag(name, attrs)
|
||||
|
||||
def endElement(self, name):
|
||||
#print "End %s" % name
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple, nodeName):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
#handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix, nodeValue):
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix):
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self):
|
||||
pass
|
||||
|
||||
def endDocument(self):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
|
||||
"""This TreeBuilder knows facts about HTML.
|
||||
|
||||
Such as which tags are empty-element tags.
|
||||
"""
|
||||
|
||||
preserve_whitespace_tags = HTMLAwareEntitySubstitution.preserve_whitespace_tags
|
||||
empty_element_tags = set([
|
||||
# These are from HTML5.
|
||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||
|
||||
# These are from HTML4, removed in HTML5.
|
||||
'spacer', 'frame'
|
||||
])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
# class="foo bar" means that the 'class' attribute has two values,
|
||||
# 'foo' and 'bar', not the single value 'foo bar'. When we
|
||||
# encounter one of these attributes, we will parse its value into
|
||||
# a list of values if possible. Upon output, the list will be
|
||||
# converted back into a string.
|
||||
cdata_list_attributes = {
|
||||
"*" : ['class', 'accesskey', 'dropzone'],
|
||||
"a" : ['rel', 'rev'],
|
||||
"link" : ['rel', 'rev'],
|
||||
"td" : ["headers"],
|
||||
"th" : ["headers"],
|
||||
"td" : ["headers"],
|
||||
"form" : ["accept-charset"],
|
||||
"object" : ["archive"],
|
||||
|
||||
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
|
||||
"area" : ["rel"],
|
||||
"icon" : ["sizes"],
|
||||
"iframe" : ["sandbox"],
|
||||
"output" : ["for"],
|
||||
}
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
# We are only interested in <meta> tags
|
||||
if tag.name != 'meta':
|
||||
return False
|
||||
|
||||
http_equiv = tag.get('http-equiv')
|
||||
content = tag.get('content')
|
||||
charset = tag.get('charset')
|
||||
|
||||
# We are interested in <meta> tags that say what encoding the
|
||||
# document was originally in. This means HTML 5-style <meta>
|
||||
# tags that provide the "charset" attribute. It also means
|
||||
# HTML 4-style <meta> tags that provide the "content"
|
||||
# attribute and have "http-equiv" set to "content-type".
|
||||
#
|
||||
# In both cases we will replace the value of the appropriate
|
||||
# attribute with a standin object that can take on any
|
||||
# encoding.
|
||||
meta_encoding = None
|
||||
if charset is not None:
|
||||
# HTML 5 style:
|
||||
# <meta charset="utf8">
|
||||
meta_encoding = charset
|
||||
tag['charset'] = CharsetMetaAttributeValue(charset)
|
||||
|
||||
elif (content is not None and http_equiv is not None
|
||||
and http_equiv.lower() == 'content-type'):
|
||||
# HTML 4 style:
|
||||
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||
tag['content'] = ContentMetaAttributeValue(content)
|
||||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
# I'm fairly sure this is not the best way to do this.
|
||||
this_module = sys.modules['bs4.builder']
|
||||
for name in module.__all__:
|
||||
obj = getattr(module, name)
|
||||
|
||||
if issubclass(obj, TreeBuilder):
|
||||
setattr(this_module, name, obj)
|
||||
this_module.__all__.append(name)
|
||||
# Register the builder while we're at it.
|
||||
this_module.builder_registry.register(obj)
|
||||
|
||||
class ParserRejectedMarkup(Exception):
|
||||
pass
|
||||
|
||||
# Builders are registered in reverse order of priority, so that custom
|
||||
# builder registrations will take precedence. In general, we want lxml
|
||||
# to take precedence over html5lib, because it's faster. And we only
|
||||
# want to use HTMLParser as a last result.
|
||||
from . import _htmlparser
|
||||
register_treebuilders_from(_htmlparser)
|
||||
try:
|
||||
from . import _html5lib
|
||||
register_treebuilders_from(_html5lib)
|
||||
except ImportError:
|
||||
# They don't have html5lib installed.
|
||||
pass
|
||||
try:
|
||||
from . import _lxml
|
||||
register_treebuilders_from(_lxml)
|
||||
except ImportError:
|
||||
# They don't have lxml installed.
|
||||
pass
|
||||
@@ -1,426 +0,0 @@
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
import warnings
|
||||
import re
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
whitespace_re,
|
||||
)
|
||||
import html5lib
|
||||
from html5lib.constants import (
|
||||
namespaces,
|
||||
prefixes,
|
||||
)
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
try:
|
||||
# Pre-0.99999999
|
||||
from html5lib.treebuilders import _base as treebuilder_base
|
||||
new_html5lib = False
|
||||
except ImportError, e:
|
||||
# 0.99999999 and up
|
||||
from html5lib.treebuilders import base as treebuilder_base
|
||||
new_html5lib = True
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
"""Use html5lib to build a tree."""
|
||||
|
||||
NAME = "html5lib"
|
||||
|
||||
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
# Store the user-specified encoding for use later on.
|
||||
self.user_specified_encoding = user_specified_encoding
|
||||
|
||||
# document_declared_encoding and exclude_encodings aren't used
|
||||
# ATM because the html5lib TreeBuilder doesn't use
|
||||
# UnicodeDammit.
|
||||
if exclude_encodings:
|
||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
def feed(self, markup):
|
||||
if self.soup.parse_only is not None:
|
||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||
|
||||
extra_kwargs = dict()
|
||||
if not isinstance(markup, unicode):
|
||||
if new_html5lib:
|
||||
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||
else:
|
||||
extra_kwargs['encoding'] = self.user_specified_encoding
|
||||
doc = parser.parse(markup, **extra_kwargs)
|
||||
|
||||
# Set the character encoding detected by the tokenizer.
|
||||
if isinstance(markup, unicode):
|
||||
# We need to special-case this because html5lib sets
|
||||
# charEncoding to UTF-8 if it gets Unicode input.
|
||||
doc.original_encoding = None
|
||||
else:
|
||||
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||
if not isinstance(original_encoding, basestring):
|
||||
# In 0.99999999 and up, the encoding is an html5lib
|
||||
# Encoding object. We want to use a string for compatibility
|
||||
# with other tree builders.
|
||||
original_encoding = original_encoding.name
|
||||
doc.original_encoding = original_encoding
|
||||
|
||||
def create_treebuilder(self, namespaceHTMLElements):
|
||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||
namespaceHTMLElements, self.soup)
|
||||
return self.underlying_builder
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><head></head><body>%s</body></html>' % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||
|
||||
def __init__(self, namespaceHTMLElements, soup=None):
|
||||
if soup:
|
||||
self.soup = soup
|
||||
else:
|
||||
from bs4 import BeautifulSoup
|
||||
self.soup = BeautifulSoup("", "html.parser")
|
||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||
|
||||
def documentClass(self):
|
||||
self.soup.reset()
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def elementClass(self, name, namespace):
|
||||
tag = self.soup.new_tag(name, namespace)
|
||||
return Element(tag, self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self):
|
||||
from bs4 import BeautifulSoup
|
||||
self.soup = BeautifulSoup("", "html.parser")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def appendChild(self, node):
|
||||
# XXX This code is not covered by the BS4 tests.
|
||||
self.soup.append(node.element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.soup
|
||||
|
||||
def getFragment(self):
|
||||
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def testSerializer(self, element):
|
||||
from bs4 import BeautifulSoup
|
||||
rv = []
|
||||
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, BeautifulSoup):
|
||||
pass
|
||||
if isinstance(element, Doctype):
|
||||
m = doctype_re.match(element)
|
||||
if m:
|
||||
name = m.group(1)
|
||||
if m.lastindex > 1:
|
||||
publicId = m.group(2) or ""
|
||||
systemId = m.group(3) or m.group(4) or ""
|
||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||
(' ' * indent, name, publicId, systemId))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||
elif isinstance(element, Comment):
|
||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||
elif isinstance(element, NavigableString):
|
||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||
else:
|
||||
if element.namespace:
|
||||
name = "%s %s" % (prefixes[element.namespace],
|
||||
element.name)
|
||||
else:
|
||||
name = element.name
|
||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||
if element.attrs:
|
||||
attributes = []
|
||||
for name, value in element.attrs.items():
|
||||
if isinstance(name, NamespacedAttribute):
|
||||
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||
if isinstance(value, list):
|
||||
value = " ".join(value)
|
||||
attributes.append((name, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||
indent += 2
|
||||
for child in element.children:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
def __iter__(self):
|
||||
return list(self.attrs.items()).__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = HTML5TreeBuilder.cdata_list_attributes
|
||||
if (name in list_attr['*']
|
||||
or (self.element.name in list_attr
|
||||
and name in list_attr[self.element.name])):
|
||||
# A node that is being cloned may have already undergone
|
||||
# this procedure.
|
||||
if not isinstance(value, list):
|
||||
value = whitespace_re.split(value)
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return list(self.attrs.items())
|
||||
def keys(self):
|
||||
return list(self.attrs.keys())
|
||||
def __len__(self):
|
||||
return len(self.attrs)
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
def __contains__(self, name):
|
||||
return name in list(self.attrs.keys())
|
||||
|
||||
|
||||
class Element(treebuilder_base.Node):
|
||||
def __init__(self, element, soup, namespace):
|
||||
treebuilder_base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def appendChild(self, node):
|
||||
string_child = child = None
|
||||
if isinstance(node, basestring):
|
||||
# Some other piece of code decided to pass in a string
|
||||
# instead of creating a TextElement object to contain the
|
||||
# string.
|
||||
string_child = child = node
|
||||
elif isinstance(node, Tag):
|
||||
# Some other piece of code decided to pass in a Tag
|
||||
# instead of creating an Element object to contain the
|
||||
# Tag.
|
||||
child = node
|
||||
elif node.element.__class__ == NavigableString:
|
||||
string_child = child = node.element
|
||||
node.parent = self
|
||||
else:
|
||||
child = node.element
|
||||
node.parent = self
|
||||
|
||||
if not isinstance(child, basestring) and child.parent is not None:
|
||||
node.element.extract()
|
||||
|
||||
if (string_child and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# We are appending a string onto another string.
|
||||
# TODO This has O(n^2) performance, for input like
|
||||
# "a</a>a</a>a</a>..."
|
||||
old_element = self.element.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + string_child)
|
||||
old_element.replace_with(new_element)
|
||||
self.soup._most_recent_element = new_element
|
||||
else:
|
||||
if isinstance(node, basestring):
|
||||
# Create a brand new NavigableString from this string.
|
||||
child = self.soup.new_string(node)
|
||||
|
||||
# Tell Beautiful Soup to act as if it parsed this element
|
||||
# immediately after the parent's last descendant. (Or
|
||||
# immediately after the parent, if it has no children.)
|
||||
if self.element.contents:
|
||||
most_recent_element = self.element._last_descendant(False)
|
||||
elif self.element.next_element is not None:
|
||||
# Something from further ahead in the parse tree is
|
||||
# being inserted into this earlier element. This is
|
||||
# very annoying because it means an expensive search
|
||||
# for the last element in the tree.
|
||||
most_recent_element = self.soup._last_descendant()
|
||||
else:
|
||||
most_recent_element = self.element
|
||||
|
||||
self.soup.object_was_parsed(
|
||||
child, parent=self.element,
|
||||
most_recent_element=most_recent_element)
|
||||
|
||||
def getAttributes(self):
|
||||
if isinstance(self.element, Comment):
|
||||
return {}
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
|
||||
converted_attributes = []
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
new_name = NamespacedAttribute(*name)
|
||||
del attributes[name]
|
||||
attributes[new_name] = value
|
||||
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, attributes)
|
||||
for name, value in attributes.items():
|
||||
self.element[name] = value
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
# Call set_up_substitutions manually.
|
||||
#
|
||||
# The Tag constructor called this method when the Tag was created,
|
||||
# but we just set/changed the attributes, so call it again.
|
||||
self.soup.builder.set_up_substitutions(self.element)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(self.soup.new_string(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.index(refNode.element)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
# (See comments in appendChild)
|
||||
old_node = self.element.contents[index-1]
|
||||
new_str = self.soup.new_string(old_node + node.element)
|
||||
old_node.replace_with(new_str)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
node.element.extract()
|
||||
|
||||
def reparentChildren(self, new_parent):
|
||||
"""Move all of this tag's children into another tag."""
|
||||
# print "MOVE", self.element.contents
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent.element
|
||||
|
||||
element = self.element
|
||||
new_parent_element = new_parent.element
|
||||
# Determine what this tag's next_element will be once all the children
|
||||
# are removed.
|
||||
final_next_element = element.next_sibling
|
||||
|
||||
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
|
||||
if len(new_parent_element.contents) > 0:
|
||||
# The new parent already contains children. We will be
|
||||
# appending this tag's children to the end.
|
||||
new_parents_last_child = new_parent_element.contents[-1]
|
||||
new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
|
||||
else:
|
||||
# The new parent contains no children.
|
||||
new_parents_last_child = None
|
||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||
|
||||
to_append = element.contents
|
||||
if len(to_append) > 0:
|
||||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
if new_parents_last_descendant:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Find the very last element being moved. It is now the
|
||||
# parent's last descendant. It has no .next_sibling and
|
||||
# its .next_element is whatever the previous last
|
||||
# descendant had.
|
||||
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||
|
||||
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||
if new_parents_last_descendant_next_element:
|
||||
# TODO: This code has no test coverage and I'm not sure
|
||||
# how to get html5lib to go through this path, but it's
|
||||
# just the other side of the previous line.
|
||||
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||
last_childs_last_descendant.next_sibling = None
|
||||
|
||||
for child in to_append:
|
||||
child.parent = new_parent_element
|
||||
new_parent_element.contents.append(child)
|
||||
|
||||
# Now that this element has no children, change its .next_element.
|
||||
element.contents = []
|
||||
element.next_element = final_next_element
|
||||
|
||||
# print "DONE WITH MOVE"
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent_element
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
treebuilder_base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
@@ -1,314 +0,0 @@
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from HTMLParser import HTMLParser
|
||||
|
||||
try:
|
||||
from HTMLParser import HTMLParseError
|
||||
except ImportError, e:
|
||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||
class HTMLParseError(Exception):
|
||||
pass
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
# Keep a list of empty-element tags that were encountered
|
||||
# without an explicit closing tag. If we encounter a closing tag
|
||||
# of this type, we'll associate it with one of those entries.
|
||||
#
|
||||
# This isn't a stack because we don't care about the
|
||||
# order. It's a list of closing tags we've already handled and
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
# This is only called when the markup looks like
|
||||
# <tag/>.
|
||||
|
||||
# is_startend() tells handle_starttag not to close the tag
|
||||
# just because its name matches a known empty-element tag. We
|
||||
# know that this is an empty-element tag and we want to call
|
||||
# handle_endtag ourselves.
|
||||
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||
self.handle_endtag(name)
|
||||
|
||||
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||
# XXX namespace
|
||||
attr_dict = {}
|
||||
for key, value in attrs:
|
||||
# Change None attribute values to the empty string
|
||||
# for consistency with the other tree builders.
|
||||
if value is None:
|
||||
value = ''
|
||||
attr_dict[key] = value
|
||||
attrvalue = '""'
|
||||
#print "START", name
|
||||
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
if tag and tag.is_empty_element and handle_empty_element:
|
||||
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||
# events for empty-element tags. (It's handled in
|
||||
# handle_startendtag, but only if the original markup looked like
|
||||
# <tag/>.)
|
||||
#
|
||||
# So we need to call handle_endtag() ourselves. Since we
|
||||
# know the start event is identical to the end event, we
|
||||
# don't want handle_endtag() to cross off any previous end
|
||||
# events for tags of this name.
|
||||
self.handle_endtag(name, check_already_closed=False)
|
||||
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
#print "END", name
|
||||
if check_already_closed and name in self.already_closed_empty_element:
|
||||
# This is a redundant end tag for an empty-element tag.
|
||||
# We've already called handle_endtag() for it, so just
|
||||
# check it off the list.
|
||||
# print "ALREADY CLOSED", name
|
||||
self.already_closed_empty_element.remove(name)
|
||||
else:
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed in all supported versions.
|
||||
# http://bugs.python.org/issue13633
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
real_name = int(name.lstrip('X'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
try:
|
||||
data = unichr(real_name)
|
||||
except (ValueError, OverflowError), e:
|
||||
data = u"\N{REPLACEMENT CHARACTER}"
|
||||
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
data = "&%s;" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
elif data == 'DOCTYPE':
|
||||
# i.e. "<!DOCTYPE>"
|
||||
data = ''
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
picklable = True
|
||||
NAME = HTMLPARSER
|
||||
features = [NAME, HTML, STRICT]
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (args, kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
"""
|
||||
:return: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, unicode):
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||
exclude_encodings=exclude_encodings)
|
||||
yield (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
except HTMLParseError, e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
parser.already_closed_empty_element = []
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
# string.
|
||||
#
|
||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
||||
import re
|
||||
attrfind_tolerant = re.compile(
|
||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
||||
|
||||
locatestarttagend = re.compile(r"""
|
||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||
(?:\s+ # whitespace before attribute name
|
||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
||||
(?:\s*=\s* # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|\"[^\"]*\" # LIT-enclosed value
|
||||
|[^'\">\s]+ # bare value
|
||||
)
|
||||
)?
|
||||
)
|
||||
)*
|
||||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
||||
|
||||
from html.parser import tagfind, attrfind
|
||||
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
return endpos
|
||||
rawdata = self.rawdata
|
||||
self.__starttag_text = rawdata[i:endpos]
|
||||
|
||||
# Now parse the data between i+1 and j into a tag and attrs
|
||||
attrs = []
|
||||
match = tagfind.match(rawdata, i+1)
|
||||
assert match, 'unexpected call to parse_starttag()'
|
||||
k = match.end()
|
||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||
while k < endpos:
|
||||
if self.strict:
|
||||
m = attrfind.match(rawdata, k)
|
||||
else:
|
||||
m = attrfind_tolerant.match(rawdata, k)
|
||||
if not m:
|
||||
break
|
||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = None
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = self.unescape(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
end = rawdata[k:endpos].strip()
|
||||
if end not in (">", "/>"):
|
||||
lineno, offset = self.getpos()
|
||||
if "\n" in self.__starttag_text:
|
||||
lineno = lineno + self.__starttag_text.count("\n")
|
||||
offset = len(self.__starttag_text) \
|
||||
- self.__starttag_text.rfind("\n")
|
||||
else:
|
||||
offset = offset + len(self.__starttag_text)
|
||||
if self.strict:
|
||||
self.error("junk characters in start tag: %r"
|
||||
% (rawdata[k:endpos][:20],))
|
||||
self.handle_data(rawdata[i:endpos])
|
||||
return endpos
|
||||
if end.endswith('/>'):
|
||||
# XHTML-style empty tag: <span attr="value" />
|
||||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
return endpos
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
||||
|
||||
CONSTRUCTOR_TAKES_STRICT = True
|
||||
@@ -1,258 +0,0 @@
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
from io import BytesIO
|
||||
from StringIO import StringIO
|
||||
import collections
|
||||
from lxml import etree
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NamespacedAttribute,
|
||||
ProcessingInstruction,
|
||||
XMLProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
ParserRejectedMarkup,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import EncodingDetector
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
is_xml = True
|
||||
processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
NAME = "lxml-xml"
|
||||
ALTERNATE_NAMES = ["xml"]
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = {'http://www.w3.org/XML/1998/namespace' : "xml"}
|
||||
|
||||
def default_parser(self, encoding):
|
||||
# This can either return a parser object or a class, which
|
||||
# will be instantiated with default arguments.
|
||||
if self._default_parser is not None:
|
||||
return self._default_parser
|
||||
return etree.XMLParser(
|
||||
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
||||
|
||||
def parser_for(self, encoding):
|
||||
# Use the default parser.
|
||||
parser = self.default_parser(encoding)
|
||||
|
||||
if isinstance(parser, collections.Callable):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||
return parser
|
||||
|
||||
def __init__(self, parser=None, empty_element_tags=None):
|
||||
# TODO: Issue a warning if parser is present but not a
|
||||
# callable, since that means there's no way to create new
|
||||
# parsers for different encodings.
|
||||
self._default_parser = parser
|
||||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == '{':
|
||||
return tuple(tag[1:].split('}', 1))
|
||||
else:
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
exclude_encodings=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:yield: A series of 4-tuples.
|
||||
(markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
"""
|
||||
# Instead of using UnicodeDammit to convert the bytestring to
|
||||
# Unicode using different encodings, use EncodingDetector to
|
||||
# iterate over the encodings, and tell lxml to try to parse
|
||||
# the document as each one in turn.
|
||||
is_html = not self.is_xml
|
||||
if is_html:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
else:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, unicode):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8",
|
||||
document_declared_encoding, False)
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
detector = EncodingDetector(
|
||||
markup, try_encodings, is_html, exclude_encodings)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, unicode):
|
||||
markup = StringIO(markup)
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
try:
|
||||
self.parser = self.parser_for(self.soup.original_encoding)
|
||||
self.parser.feed(data)
|
||||
while len(data) != 0:
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
inverted_nsmap = dict((value, key) for key, value in nsmap.items())
|
||||
self.nsmaps.append(inverted_nsmap)
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in nsmap.items():
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in attrs.items():
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
return None
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
completed_tag = self.soup.tagStack[-1]
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def doctype(self, name, pubid, system):
|
||||
self.soup.endData()
|
||||
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def comment(self, content):
|
||||
"Handle comments as Comment objects."
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(content)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
|
||||
NAME = LXML
|
||||
ALTERNATE_NAMES = ["lxml-html"]
|
||||
|
||||
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||
is_xml = False
|
||||
processing_instruction_class = ProcessingInstruction
|
||||
|
||||
def default_parser(self, encoding):
|
||||
return etree.HTMLParser
|
||||
|
||||
def feed(self, markup):
|
||||
encoding = self.soup.original_encoding
|
||||
try:
|
||||
self.parser = self.parser_for(encoding)
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError), e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return u'<html><body>%s</body></html>' % fragment
|
||||
@@ -1,842 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This library converts a bytestream to Unicode through any means
|
||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import codecs
|
||||
from htmlentitydefs import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
|
||||
# Import a library to autodetect character encodings.
|
||||
chardet_type = None
|
||||
try:
|
||||
# First try the fast C implementation.
|
||||
# PyPI package: cchardet
|
||||
import cchardet
|
||||
def chardet_dammit(s):
|
||||
return cchardet.detect(s)['encoding']
|
||||
except ImportError:
|
||||
try:
|
||||
# Fall back to the pure Python implementation
|
||||
# Debian package: python-chardet
|
||||
# PyPI package: chardet
|
||||
import chardet
|
||||
def chardet_dammit(s):
|
||||
return chardet.detect(s)['encoding']
|
||||
#import chardet.constants
|
||||
#chardet.constants._debug = 1
|
||||
except ImportError:
|
||||
# No chardet available.
|
||||
def chardet_dammit(s):
|
||||
return None
|
||||
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
xml_encoding_re = re.compile(
|
||||
'^<\?.*encoding=[\'"](.*?)[\'"].*\?>'.encode(), re.I)
|
||||
html_meta_re = re.compile(
|
||||
'<\s*meta[^>]+charset\s*=\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
|
||||
class EntitySubstitution(object):
|
||||
|
||||
"""Substitute XML or HTML entities for the corresponding characters."""
|
||||
|
||||
def _populate_class_variables():
|
||||
lookup = {}
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
for codepoint, name in list(codepoint2name.items()):
|
||||
character = unichr(codepoint)
|
||||
if codepoint != 34:
|
||||
# There's no point in turning the quotation mark into
|
||||
# ", unless it happens within an attribute value, which
|
||||
# is handled elsewhere.
|
||||
characters_for_re.append(character)
|
||||
lookup[character] = name
|
||||
# But we do want to turn " into the quotation mark.
|
||||
reverse_lookup[name] = character
|
||||
re_definition = "[%s]" % "".join(characters_for_re)
|
||||
return lookup, reverse_lookup, re.compile(re_definition)
|
||||
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
|
||||
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
|
||||
|
||||
CHARACTER_TO_XML_ENTITY = {
|
||||
"'": "apos",
|
||||
'"': "quot",
|
||||
"&": "amp",
|
||||
"<": "lt",
|
||||
">": "gt",
|
||||
}
|
||||
|
||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||
"&(?!#\d+;|#x[0-9a-fA-F]+;|\w+;)"
|
||||
")")
|
||||
|
||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||
|
||||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def _substitute_xml_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate XML entity for an XML special character."""
|
||||
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def quoted_attribute_value(self, value):
|
||||
"""Make a value into a quoted XML attribute, possibly escaping it.
|
||||
|
||||
Most strings will be quoted using double quotes.
|
||||
|
||||
Bob's Bar -> "Bob's Bar"
|
||||
|
||||
If a string contains double quotes, it will be quoted using
|
||||
single quotes.
|
||||
|
||||
Welcome to "my bar" -> 'Welcome to "my bar"'
|
||||
|
||||
If a string contains both single and double quotes, the
|
||||
double quotes will be escaped, and the string will be quoted
|
||||
using double quotes.
|
||||
|
||||
Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
|
||||
"""
|
||||
quote_with = '"'
|
||||
if '"' in value:
|
||||
if "'" in value:
|
||||
# The string contains both single and double
|
||||
# quotes. Turn the double quotes into
|
||||
# entities. We quote the double quotes rather than
|
||||
# the single quotes because the entity name is
|
||||
# """ whether this is HTML or XML. If we
|
||||
# quoted the single quotes, we'd have to decide
|
||||
# between ' and &squot;.
|
||||
replace_with = """
|
||||
value = value.replace('"', replace_with)
|
||||
else:
|
||||
# There are double quotes but no single quotes.
|
||||
# We can use single quotes to quote the attribute.
|
||||
quote_with = "'"
|
||||
return quote_with + value + quote_with
|
||||
|
||||
@classmethod
|
||||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign
|
||||
will become <, the greater-than sign will become >,
|
||||
and any ampersands will become &. If you want ampersands
|
||||
that appear to be part of an entity definition to be left
|
||||
alone, use substitute_xml_containing_entities() instead.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets and ampersands.
|
||||
value = cls.AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_xml_containing_entities(
|
||||
cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign will
|
||||
become <, the greater-than sign will become >, and any
|
||||
ampersands that are not part of an entity defition will
|
||||
become &.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets, and ampersands that aren't part of
|
||||
# entities.
|
||||
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_html(cls, s):
|
||||
"""Replace certain Unicode characters with named HTML entities.
|
||||
|
||||
This differs from data.encode(encoding, 'xmlcharrefreplace')
|
||||
in that the goal is to make the result more readable (to those
|
||||
with ASCII displays) rather than to recover from
|
||||
errors. There's absolutely nothing wrong with a UTF-8 string
|
||||
containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
|
||||
character with "é" will make it more readable to some
|
||||
people.
|
||||
"""
|
||||
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
|
||||
cls._substitute_html_entity, s)
|
||||
|
||||
|
||||
class EncodingDetector:
|
||||
"""Suggests a number of possible encodings for a bytestring.
|
||||
|
||||
Order of precedence:
|
||||
|
||||
1. Encodings you specifically tell EncodingDetector to try first
|
||||
(the override_encodings argument to the constructor).
|
||||
|
||||
2. An encoding declared within the bytestring itself, either in an
|
||||
XML declaration (if the bytestring is to be interpreted as an XML
|
||||
document), or in a <meta> tag (if the bytestring is to be
|
||||
interpreted as an HTML document.)
|
||||
|
||||
3. An encoding detected through textual analysis by chardet,
|
||||
cchardet, or a similar external library.
|
||||
|
||||
4. UTF-8.
|
||||
|
||||
5. Windows-1252.
|
||||
"""
|
||||
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||
exclude_encodings=None):
|
||||
self.override_encodings = override_encodings or []
|
||||
exclude_encodings = exclude_encodings or []
|
||||
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||
self.chardet_encoding = None
|
||||
self.is_html = is_html
|
||||
self.declared_encoding = None
|
||||
|
||||
# First order of business: strip a byte-order mark.
|
||||
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
|
||||
|
||||
def _usable(self, encoding, tried):
|
||||
if encoding is not None:
|
||||
encoding = encoding.lower()
|
||||
if encoding in self.exclude_encodings:
|
||||
return False
|
||||
if encoding not in tried:
|
||||
tried.add(encoding)
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def encodings(self):
|
||||
"""Yield a number of encodings that might work for this markup."""
|
||||
tried = set()
|
||||
for e in self.override_encodings:
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
# Did the document originally start with a byte-order mark
|
||||
# that indicated its encoding?
|
||||
if self._usable(self.sniffed_encoding, tried):
|
||||
yield self.sniffed_encoding
|
||||
|
||||
# Look within the document for an XML or HTML encoding
|
||||
# declaration.
|
||||
if self.declared_encoding is None:
|
||||
self.declared_encoding = self.find_declared_encoding(
|
||||
self.markup, self.is_html)
|
||||
if self._usable(self.declared_encoding, tried):
|
||||
yield self.declared_encoding
|
||||
|
||||
# Use third-party character set detection to guess at the
|
||||
# encoding.
|
||||
if self.chardet_encoding is None:
|
||||
self.chardet_encoding = chardet_dammit(self.markup)
|
||||
if self._usable(self.chardet_encoding, tried):
|
||||
yield self.chardet_encoding
|
||||
|
||||
# As a last-ditch effort, try utf-8 and windows-1252.
|
||||
for e in ('utf-8', 'windows-1252'):
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
@classmethod
|
||||
def strip_byte_order_mark(cls, data):
|
||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||
encoding = None
|
||||
if isinstance(data, unicode):
|
||||
# Unicode data cannot have a byte-order mark.
|
||||
return data, encoding
|
||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16be'
|
||||
data = data[2:]
|
||||
elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16le'
|
||||
data = data[2:]
|
||||
elif data[:3] == b'\xef\xbb\xbf':
|
||||
encoding = 'utf-8'
|
||||
data = data[3:]
|
||||
elif data[:4] == b'\x00\x00\xfe\xff':
|
||||
encoding = 'utf-32be'
|
||||
data = data[4:]
|
||||
elif data[:4] == b'\xff\xfe\x00\x00':
|
||||
encoding = 'utf-32le'
|
||||
data = data[4:]
|
||||
return data, encoding
|
||||
|
||||
@classmethod
|
||||
def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
|
||||
"""Given a document, tries to find its declared encoding.
|
||||
|
||||
An XML encoding is declared at the beginning of the document.
|
||||
|
||||
An HTML encoding is declared in a <meta> tag, hopefully near the
|
||||
beginning of the document.
|
||||
"""
|
||||
if search_entire_document:
|
||||
xml_endpos = html_endpos = len(markup)
|
||||
else:
|
||||
xml_endpos = 1024
|
||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||
|
||||
declared_encoding = None
|
||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
||||
if not declared_encoding_match and is_html:
|
||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
||||
if declared_encoding_match is not None:
|
||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
||||
'ascii', 'replace')
|
||||
if declared_encoding:
|
||||
return declared_encoding.lower()
|
||||
return None
|
||||
|
||||
class UnicodeDammit:
|
||||
"""A class for detecting the encoding of a *ML document and
|
||||
converting it to a Unicode string. If the source encoding is
|
||||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||||
equivalents."""
|
||||
|
||||
# This dictionary maps commonly seen values for "charset" in HTML
|
||||
# meta tags to the corresponding Python codec names. It only covers
|
||||
# values that aren't in Python's aliases and can't be determined
|
||||
# by the heuristics in find_codec.
|
||||
CHARSET_ALIASES = {"macintosh": "mac-roman",
|
||||
"x-sjis": "shift-jis"}
|
||||
|
||||
ENCODINGS_WITH_SMART_QUOTES = [
|
||||
"windows-1252",
|
||||
"iso-8859-1",
|
||||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
self.contains_replacement_characters = False
|
||||
self.is_html = is_html
|
||||
self.log = logging.getLogger(__name__)
|
||||
self.detector = EncodingDetector(
|
||||
markup, override_encodings, is_html, exclude_encodings)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, unicode) or markup == '':
|
||||
self.markup = markup
|
||||
self.unicode_markup = unicode(markup)
|
||||
self.original_encoding = None
|
||||
return
|
||||
|
||||
# The encoding detector may have stripped a byte-order mark.
|
||||
# Use the stripped markup from this point on.
|
||||
self.markup = self.detector.markup
|
||||
|
||||
u = None
|
||||
for encoding in self.detector.encodings:
|
||||
markup = self.detector.markup
|
||||
u = self._convert_from(encoding)
|
||||
if u is not None:
|
||||
break
|
||||
|
||||
if not u:
|
||||
# None of the encodings worked. As an absolute last resort,
|
||||
# try them again with character replacement.
|
||||
|
||||
for encoding in self.detector.encodings:
|
||||
if encoding != "ascii":
|
||||
u = self._convert_from(encoding, "replace")
|
||||
if u is not None:
|
||||
self.log.warning(
|
||||
"Some characters could not be decoded, and were "
|
||||
"replaced with REPLACEMENT CHARACTER."
|
||||
)
|
||||
self.contains_replacement_characters = True
|
||||
break
|
||||
|
||||
# If none of that worked, we could at this point force it to
|
||||
# ASCII, but that would destroy so much data that I think
|
||||
# giving up is better.
|
||||
self.unicode_markup = u
|
||||
if not u:
|
||||
self.original_encoding = None
|
||||
|
||||
def _sub_ms_char(self, match):
|
||||
"""Changes a MS smart quote character to an XML or HTML
|
||||
entity, or an ASCII character."""
|
||||
orig = match.group(1)
|
||||
if self.smart_quotes_to == 'ascii':
|
||||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||||
else:
|
||||
sub = self.MS_CHARS.get(orig)
|
||||
if type(sub) == tuple:
|
||||
if self.smart_quotes_to == 'xml':
|
||||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||||
else:
|
||||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||||
else:
|
||||
sub = sub.encode()
|
||||
return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
|
||||
proposed = self.find_codec(proposed)
|
||||
if not proposed or (proposed, errors) in self.tried_encodings:
|
||||
return None
|
||||
self.tried_encodings.append((proposed, errors))
|
||||
markup = self.markup
|
||||
# Convert smart quotes to HTML if coming from an encoding
|
||||
# that might have them.
|
||||
if (self.smart_quotes_to is not None
|
||||
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
|
||||
smart_quotes_re = b"([\x80-\x9f])"
|
||||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||||
|
||||
try:
|
||||
#print "Trying to convert document to %s (errors=%s)" % (
|
||||
# proposed, errors)
|
||||
u = self._to_unicode(markup, proposed, errors)
|
||||
self.markup = u
|
||||
self.original_encoding = proposed
|
||||
except Exception as e:
|
||||
#print "That didn't work!"
|
||||
#print e
|
||||
return None
|
||||
#print "Correct encoding: %s" % proposed
|
||||
return self.markup
|
||||
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
|
||||
'''Given a string and its encoding, decodes the string into Unicode.
|
||||
%encoding is a string recognized by encodings.aliases'''
|
||||
return unicode(data, encoding, errors)
|
||||
|
||||
@property
|
||||
def declared_html_encoding(self):
|
||||
if not self.is_html:
|
||||
return None
|
||||
return self.detector.declared_encoding
|
||||
|
||||
def find_codec(self, charset):
|
||||
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
|
||||
or (charset and self._codec(charset.replace("-", "")))
|
||||
or (charset and self._codec(charset.replace("-", "_")))
|
||||
or (charset and charset.lower())
|
||||
or charset
|
||||
)
|
||||
if value:
|
||||
return value.lower()
|
||||
return None
|
||||
|
||||
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
|
||||
MS_CHARS = {b'\x80': ('euro', '20AC'),
|
||||
b'\x81': ' ',
|
||||
b'\x82': ('sbquo', '201A'),
|
||||
b'\x83': ('fnof', '192'),
|
||||
b'\x84': ('bdquo', '201E'),
|
||||
b'\x85': ('hellip', '2026'),
|
||||
b'\x86': ('dagger', '2020'),
|
||||
b'\x87': ('Dagger', '2021'),
|
||||
b'\x88': ('circ', '2C6'),
|
||||
b'\x89': ('permil', '2030'),
|
||||
b'\x8A': ('Scaron', '160'),
|
||||
b'\x8B': ('lsaquo', '2039'),
|
||||
b'\x8C': ('OElig', '152'),
|
||||
b'\x8D': '?',
|
||||
b'\x8E': ('#x17D', '17D'),
|
||||
b'\x8F': '?',
|
||||
b'\x90': '?',
|
||||
b'\x91': ('lsquo', '2018'),
|
||||
b'\x92': ('rsquo', '2019'),
|
||||
b'\x93': ('ldquo', '201C'),
|
||||
b'\x94': ('rdquo', '201D'),
|
||||
b'\x95': ('bull', '2022'),
|
||||
b'\x96': ('ndash', '2013'),
|
||||
b'\x97': ('mdash', '2014'),
|
||||
b'\x98': ('tilde', '2DC'),
|
||||
b'\x99': ('trade', '2122'),
|
||||
b'\x9a': ('scaron', '161'),
|
||||
b'\x9b': ('rsaquo', '203A'),
|
||||
b'\x9c': ('oelig', '153'),
|
||||
b'\x9d': '?',
|
||||
b'\x9e': ('#x17E', '17E'),
|
||||
b'\x9f': ('Yuml', ''),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
|
||||
# horrors like stripping diacritical marks to turn á into a, but also
|
||||
# contains non-horrors like turning “ into ".
|
||||
MS_CHARS_TO_ASCII = {
|
||||
b'\x80' : 'EUR',
|
||||
b'\x81' : ' ',
|
||||
b'\x82' : ',',
|
||||
b'\x83' : 'f',
|
||||
b'\x84' : ',,',
|
||||
b'\x85' : '...',
|
||||
b'\x86' : '+',
|
||||
b'\x87' : '++',
|
||||
b'\x88' : '^',
|
||||
b'\x89' : '%',
|
||||
b'\x8a' : 'S',
|
||||
b'\x8b' : '<',
|
||||
b'\x8c' : 'OE',
|
||||
b'\x8d' : '?',
|
||||
b'\x8e' : 'Z',
|
||||
b'\x8f' : '?',
|
||||
b'\x90' : '?',
|
||||
b'\x91' : "'",
|
||||
b'\x92' : "'",
|
||||
b'\x93' : '"',
|
||||
b'\x94' : '"',
|
||||
b'\x95' : '*',
|
||||
b'\x96' : '-',
|
||||
b'\x97' : '--',
|
||||
b'\x98' : '~',
|
||||
b'\x99' : '(TM)',
|
||||
b'\x9a' : 's',
|
||||
b'\x9b' : '>',
|
||||
b'\x9c' : 'oe',
|
||||
b'\x9d' : '?',
|
||||
b'\x9e' : 'z',
|
||||
b'\x9f' : 'Y',
|
||||
b'\xa0' : ' ',
|
||||
b'\xa1' : '!',
|
||||
b'\xa2' : 'c',
|
||||
b'\xa3' : 'GBP',
|
||||
b'\xa4' : '$', #This approximation is especially parochial--this is the
|
||||
#generic currency symbol.
|
||||
b'\xa5' : 'YEN',
|
||||
b'\xa6' : '|',
|
||||
b'\xa7' : 'S',
|
||||
b'\xa8' : '..',
|
||||
b'\xa9' : '',
|
||||
b'\xaa' : '(th)',
|
||||
b'\xab' : '<<',
|
||||
b'\xac' : '!',
|
||||
b'\xad' : ' ',
|
||||
b'\xae' : '(R)',
|
||||
b'\xaf' : '-',
|
||||
b'\xb0' : 'o',
|
||||
b'\xb1' : '+-',
|
||||
b'\xb2' : '2',
|
||||
b'\xb3' : '3',
|
||||
b'\xb4' : ("'", 'acute'),
|
||||
b'\xb5' : 'u',
|
||||
b'\xb6' : 'P',
|
||||
b'\xb7' : '*',
|
||||
b'\xb8' : ',',
|
||||
b'\xb9' : '1',
|
||||
b'\xba' : '(th)',
|
||||
b'\xbb' : '>>',
|
||||
b'\xbc' : '1/4',
|
||||
b'\xbd' : '1/2',
|
||||
b'\xbe' : '3/4',
|
||||
b'\xbf' : '?',
|
||||
b'\xc0' : 'A',
|
||||
b'\xc1' : 'A',
|
||||
b'\xc2' : 'A',
|
||||
b'\xc3' : 'A',
|
||||
b'\xc4' : 'A',
|
||||
b'\xc5' : 'A',
|
||||
b'\xc6' : 'AE',
|
||||
b'\xc7' : 'C',
|
||||
b'\xc8' : 'E',
|
||||
b'\xc9' : 'E',
|
||||
b'\xca' : 'E',
|
||||
b'\xcb' : 'E',
|
||||
b'\xcc' : 'I',
|
||||
b'\xcd' : 'I',
|
||||
b'\xce' : 'I',
|
||||
b'\xcf' : 'I',
|
||||
b'\xd0' : 'D',
|
||||
b'\xd1' : 'N',
|
||||
b'\xd2' : 'O',
|
||||
b'\xd3' : 'O',
|
||||
b'\xd4' : 'O',
|
||||
b'\xd5' : 'O',
|
||||
b'\xd6' : 'O',
|
||||
b'\xd7' : '*',
|
||||
b'\xd8' : 'O',
|
||||
b'\xd9' : 'U',
|
||||
b'\xda' : 'U',
|
||||
b'\xdb' : 'U',
|
||||
b'\xdc' : 'U',
|
||||
b'\xdd' : 'Y',
|
||||
b'\xde' : 'b',
|
||||
b'\xdf' : 'B',
|
||||
b'\xe0' : 'a',
|
||||
b'\xe1' : 'a',
|
||||
b'\xe2' : 'a',
|
||||
b'\xe3' : 'a',
|
||||
b'\xe4' : 'a',
|
||||
b'\xe5' : 'a',
|
||||
b'\xe6' : 'ae',
|
||||
b'\xe7' : 'c',
|
||||
b'\xe8' : 'e',
|
||||
b'\xe9' : 'e',
|
||||
b'\xea' : 'e',
|
||||
b'\xeb' : 'e',
|
||||
b'\xec' : 'i',
|
||||
b'\xed' : 'i',
|
||||
b'\xee' : 'i',
|
||||
b'\xef' : 'i',
|
||||
b'\xf0' : 'o',
|
||||
b'\xf1' : 'n',
|
||||
b'\xf2' : 'o',
|
||||
b'\xf3' : 'o',
|
||||
b'\xf4' : 'o',
|
||||
b'\xf5' : 'o',
|
||||
b'\xf6' : 'o',
|
||||
b'\xf7' : '/',
|
||||
b'\xf8' : 'o',
|
||||
b'\xf9' : 'u',
|
||||
b'\xfa' : 'u',
|
||||
b'\xfb' : 'u',
|
||||
b'\xfc' : 'u',
|
||||
b'\xfd' : 'y',
|
||||
b'\xfe' : 'b',
|
||||
b'\xff' : 'y',
|
||||
}
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
|
||||
# characters in otherwise UTF-8 documents.
|
||||
#
|
||||
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
|
||||
# Windows-1252.
|
||||
WINDOWS_1252_TO_UTF8 = {
|
||||
0x80 : b'\xe2\x82\xac', # €
|
||||
0x82 : b'\xe2\x80\x9a', # ‚
|
||||
0x83 : b'\xc6\x92', # ƒ
|
||||
0x84 : b'\xe2\x80\x9e', # „
|
||||
0x85 : b'\xe2\x80\xa6', # …
|
||||
0x86 : b'\xe2\x80\xa0', # †
|
||||
0x87 : b'\xe2\x80\xa1', # ‡
|
||||
0x88 : b'\xcb\x86', # ˆ
|
||||
0x89 : b'\xe2\x80\xb0', # ‰
|
||||
0x8a : b'\xc5\xa0', # Š
|
||||
0x8b : b'\xe2\x80\xb9', # ‹
|
||||
0x8c : b'\xc5\x92', # Œ
|
||||
0x8e : b'\xc5\xbd', # Ž
|
||||
0x91 : b'\xe2\x80\x98', # ‘
|
||||
0x92 : b'\xe2\x80\x99', # ’
|
||||
0x93 : b'\xe2\x80\x9c', # “
|
||||
0x94 : b'\xe2\x80\x9d', # ”
|
||||
0x95 : b'\xe2\x80\xa2', # •
|
||||
0x96 : b'\xe2\x80\x93', # –
|
||||
0x97 : b'\xe2\x80\x94', # —
|
||||
0x98 : b'\xcb\x9c', # ˜
|
||||
0x99 : b'\xe2\x84\xa2', # ™
|
||||
0x9a : b'\xc5\xa1', # š
|
||||
0x9b : b'\xe2\x80\xba', # ›
|
||||
0x9c : b'\xc5\x93', # œ
|
||||
0x9e : b'\xc5\xbe', # ž
|
||||
0x9f : b'\xc5\xb8', # Ÿ
|
||||
0xa0 : b'\xc2\xa0', #
|
||||
0xa1 : b'\xc2\xa1', # ¡
|
||||
0xa2 : b'\xc2\xa2', # ¢
|
||||
0xa3 : b'\xc2\xa3', # £
|
||||
0xa4 : b'\xc2\xa4', # ¤
|
||||
0xa5 : b'\xc2\xa5', # ¥
|
||||
0xa6 : b'\xc2\xa6', # ¦
|
||||
0xa7 : b'\xc2\xa7', # §
|
||||
0xa8 : b'\xc2\xa8', # ¨
|
||||
0xa9 : b'\xc2\xa9', # ©
|
||||
0xaa : b'\xc2\xaa', # ª
|
||||
0xab : b'\xc2\xab', # «
|
||||
0xac : b'\xc2\xac', # ¬
|
||||
0xad : b'\xc2\xad', #
|
||||
0xae : b'\xc2\xae', # ®
|
||||
0xaf : b'\xc2\xaf', # ¯
|
||||
0xb0 : b'\xc2\xb0', # °
|
||||
0xb1 : b'\xc2\xb1', # ±
|
||||
0xb2 : b'\xc2\xb2', # ²
|
||||
0xb3 : b'\xc2\xb3', # ³
|
||||
0xb4 : b'\xc2\xb4', # ´
|
||||
0xb5 : b'\xc2\xb5', # µ
|
||||
0xb6 : b'\xc2\xb6', # ¶
|
||||
0xb7 : b'\xc2\xb7', # ·
|
||||
0xb8 : b'\xc2\xb8', # ¸
|
||||
0xb9 : b'\xc2\xb9', # ¹
|
||||
0xba : b'\xc2\xba', # º
|
||||
0xbb : b'\xc2\xbb', # »
|
||||
0xbc : b'\xc2\xbc', # ¼
|
||||
0xbd : b'\xc2\xbd', # ½
|
||||
0xbe : b'\xc2\xbe', # ¾
|
||||
0xbf : b'\xc2\xbf', # ¿
|
||||
0xc0 : b'\xc3\x80', # À
|
||||
0xc1 : b'\xc3\x81', # Á
|
||||
0xc2 : b'\xc3\x82', # Â
|
||||
0xc3 : b'\xc3\x83', # Ã
|
||||
0xc4 : b'\xc3\x84', # Ä
|
||||
0xc5 : b'\xc3\x85', # Å
|
||||
0xc6 : b'\xc3\x86', # Æ
|
||||
0xc7 : b'\xc3\x87', # Ç
|
||||
0xc8 : b'\xc3\x88', # È
|
||||
0xc9 : b'\xc3\x89', # É
|
||||
0xca : b'\xc3\x8a', # Ê
|
||||
0xcb : b'\xc3\x8b', # Ë
|
||||
0xcc : b'\xc3\x8c', # Ì
|
||||
0xcd : b'\xc3\x8d', # Í
|
||||
0xce : b'\xc3\x8e', # Î
|
||||
0xcf : b'\xc3\x8f', # Ï
|
||||
0xd0 : b'\xc3\x90', # Ð
|
||||
0xd1 : b'\xc3\x91', # Ñ
|
||||
0xd2 : b'\xc3\x92', # Ò
|
||||
0xd3 : b'\xc3\x93', # Ó
|
||||
0xd4 : b'\xc3\x94', # Ô
|
||||
0xd5 : b'\xc3\x95', # Õ
|
||||
0xd6 : b'\xc3\x96', # Ö
|
||||
0xd7 : b'\xc3\x97', # ×
|
||||
0xd8 : b'\xc3\x98', # Ø
|
||||
0xd9 : b'\xc3\x99', # Ù
|
||||
0xda : b'\xc3\x9a', # Ú
|
||||
0xdb : b'\xc3\x9b', # Û
|
||||
0xdc : b'\xc3\x9c', # Ü
|
||||
0xdd : b'\xc3\x9d', # Ý
|
||||
0xde : b'\xc3\x9e', # Þ
|
||||
0xdf : b'\xc3\x9f', # ß
|
||||
0xe0 : b'\xc3\xa0', # à
|
||||
0xe1 : b'\xa1', # á
|
||||
0xe2 : b'\xc3\xa2', # â
|
||||
0xe3 : b'\xc3\xa3', # ã
|
||||
0xe4 : b'\xc3\xa4', # ä
|
||||
0xe5 : b'\xc3\xa5', # å
|
||||
0xe6 : b'\xc3\xa6', # æ
|
||||
0xe7 : b'\xc3\xa7', # ç
|
||||
0xe8 : b'\xc3\xa8', # è
|
||||
0xe9 : b'\xc3\xa9', # é
|
||||
0xea : b'\xc3\xaa', # ê
|
||||
0xeb : b'\xc3\xab', # ë
|
||||
0xec : b'\xc3\xac', # ì
|
||||
0xed : b'\xc3\xad', # í
|
||||
0xee : b'\xc3\xae', # î
|
||||
0xef : b'\xc3\xaf', # ï
|
||||
0xf0 : b'\xc3\xb0', # ð
|
||||
0xf1 : b'\xc3\xb1', # ñ
|
||||
0xf2 : b'\xc3\xb2', # ò
|
||||
0xf3 : b'\xc3\xb3', # ó
|
||||
0xf4 : b'\xc3\xb4', # ô
|
||||
0xf5 : b'\xc3\xb5', # õ
|
||||
0xf6 : b'\xc3\xb6', # ö
|
||||
0xf7 : b'\xc3\xb7', # ÷
|
||||
0xf8 : b'\xc3\xb8', # ø
|
||||
0xf9 : b'\xc3\xb9', # ù
|
||||
0xfa : b'\xc3\xba', # ú
|
||||
0xfb : b'\xc3\xbb', # û
|
||||
0xfc : b'\xc3\xbc', # ü
|
||||
0xfd : b'\xc3\xbd', # ý
|
||||
0xfe : b'\xc3\xbe', # þ
|
||||
}
|
||||
|
||||
MULTIBYTE_MARKERS_AND_SIZES = [
|
||||
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
|
||||
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
|
||||
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
|
||||
]
|
||||
|
||||
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
|
||||
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
|
||||
def detwingle(cls, in_bytes, main_encoding="utf8",
|
||||
embedded_encoding="windows-1252"):
|
||||
"""Fix characters from one encoding embedded in some other encoding.
|
||||
|
||||
Currently the only situation supported is Windows-1252 (or its
|
||||
subset ISO-8859-1), embedded in UTF-8.
|
||||
|
||||
The input must be a bytestring. If you've already converted
|
||||
the document to Unicode, you're too late.
|
||||
|
||||
The output is a bytestring in which `embedded_encoding`
|
||||
characters have been converted to their `main_encoding`
|
||||
equivalents.
|
||||
"""
|
||||
if embedded_encoding.replace('_', '-').lower() not in (
|
||||
'windows-1252', 'windows_1252'):
|
||||
raise NotImplementedError(
|
||||
"Windows-1252 and ISO-8859-1 are the only currently supported "
|
||||
"embedded encodings.")
|
||||
|
||||
if main_encoding.lower() not in ('utf8', 'utf-8'):
|
||||
raise NotImplementedError(
|
||||
"UTF-8 is the only currently supported main encoding.")
|
||||
|
||||
byte_chunks = []
|
||||
|
||||
chunk_start = 0
|
||||
pos = 0
|
||||
while pos < len(in_bytes):
|
||||
byte = in_bytes[pos]
|
||||
if not isinstance(byte, int):
|
||||
# Python 2.x
|
||||
byte = ord(byte)
|
||||
if (byte >= cls.FIRST_MULTIBYTE_MARKER
|
||||
and byte <= cls.LAST_MULTIBYTE_MARKER):
|
||||
# This is the start of a UTF-8 multibyte character. Skip
|
||||
# to the end.
|
||||
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
|
||||
if byte >= start and byte <= end:
|
||||
pos += size
|
||||
break
|
||||
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
|
||||
# We found a Windows-1252 character!
|
||||
# Save the string up to this point as a chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:pos])
|
||||
|
||||
# Now translate the Windows-1252 character into UTF-8
|
||||
# and add it as another, one-byte chunk.
|
||||
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
|
||||
pos += 1
|
||||
chunk_start = pos
|
||||
else:
|
||||
# Go on to the next character.
|
||||
pos += 1
|
||||
if chunk_start == 0:
|
||||
# The string is unchanged.
|
||||
return in_bytes
|
||||
else:
|
||||
# Store the final chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:])
|
||||
return b''.join(byte_chunks)
|
||||
|
||||
@@ -1,219 +0,0 @@
|
||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
from StringIO import StringIO
|
||||
from HTMLParser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
|
||||
import os
|
||||
import pstats
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print "Diagnostic running on Beautiful Soup %s" % __version__
|
||||
print "Python version %s" % sys.version
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print (
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name)
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append(["lxml", "xml"])
|
||||
try:
|
||||
from lxml import etree
|
||||
print "Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION))
|
||||
except ImportError, e:
|
||||
print (
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
try:
|
||||
import html5lib
|
||||
print "Found html5lib version %s" % html5lib.__version__
|
||||
except ImportError, e:
|
||||
print (
|
||||
"html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif os.path.exists(data):
|
||||
print '"%s" looks like a filename. Reading data from the file.' % data
|
||||
with open(data) as fp:
|
||||
data = fp.read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print '"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data
|
||||
print "You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup."
|
||||
return
|
||||
print
|
||||
|
||||
for parser in basic_parsers:
|
||||
print "Trying to parse your markup with %s" % parser
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, parser)
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "Here's what %s did with the markup:" % parser
|
||||
print soup.prettify()
|
||||
|
||||
print "-" * 80
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
|
||||
"""Print out the lxml events that occur during parsing.
|
||||
|
||||
This lets you see how lxml parses a document when no Beautiful
|
||||
Soup code is running.
|
||||
"""
|
||||
from lxml import etree
|
||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||
print("%s, %4s, %s" % (event, element.tag, element.text))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Announces HTMLParser parse events, without doing anything else."""
|
||||
|
||||
def _p(self, s):
|
||||
print(s)
|
||||
|
||||
def handle_starttag(self, name, attrs):
|
||||
self._p("%s START" % name)
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self._p("%s END" % name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self._p("%s DATA" % data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
self._p("%s CHARREF" % name)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
self._p("%s ENTITYREF" % name)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self._p("%s COMMENT" % data)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self._p("%s DECL" % data)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
self._p("%s UNKNOWN-DECL" % data)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self._p("%s PI" % data)
|
||||
|
||||
def htmlparser_trace(data):
|
||||
"""Print out the HTMLParser events that occur during parsing.
|
||||
|
||||
This lets you see how HTMLParser parses a document when no
|
||||
Beautiful Soup code is running.
|
||||
"""
|
||||
parser = AnnouncingParser()
|
||||
parser.feed(data)
|
||||
|
||||
_vowels = "aeiou"
|
||||
_consonants = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
def rword(length=5):
|
||||
"Generate a random word-like string."
|
||||
s = ''
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
def rsentence(length=4):
|
||||
"Generate a random sentence-like string."
|
||||
return " ".join(rword(random.randint(4,9)) for i in range(length))
|
||||
|
||||
def rdoc(num_elements=1000):
|
||||
"""Randomly generate an invalid HTML document."""
|
||||
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
|
||||
elements = []
|
||||
for i in range(num_elements):
|
||||
choice = random.randint(0,3)
|
||||
if choice == 0:
|
||||
# New tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("<%s>" % tag_name)
|
||||
elif choice == 1:
|
||||
elements.append(rsentence(random.randint(1,4)))
|
||||
elif choice == 2:
|
||||
# Close a tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("</%s>" % tag_name)
|
||||
return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print "Comparative parser benchmark on Beautiful Soup %s" % __version__
|
||||
data = rdoc(num_elements)
|
||||
print "Generated a large invalid HTML document (%d bytes)." % len(data)
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
a = time.time()
|
||||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception, e:
|
||||
print "%s could not parse the markup." % parser
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print "BS4+%s parsed the markup in %.2fs." % (parser, b-a)
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print "Raw lxml parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print "Raw html5lib parsed the markup in %.2fs." % (b-a)
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
|
||||
data = rdoc(num_elements)
|
||||
vars = dict(bs4=bs4, data=data, parser=parser)
|
||||
cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
|
||||
|
||||
stats = pstats.Stats(filename)
|
||||
# stats.strip_dirs()
|
||||
stats.sort_stats("cumulative")
|
||||
stats.print_stats('_html5lib|bs4', 50)
|
||||
|
||||
if __name__ == '__main__':
|
||||
diagnose(sys.stdin.read())
|
||||
1808
libs2/bs4/element.py
1808
libs2/bs4/element.py
File diff suppressed because it is too large
Load Diff
@@ -1,99 +0,0 @@
|
||||
from bs4.dammit import EntitySubstitution
|
||||
|
||||
class Formatter(EntitySubstitution):
|
||||
"""Describes a strategy to use when outputting a parse tree to a string.
|
||||
|
||||
Some parts of this strategy come from the distinction between
|
||||
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||
"""
|
||||
# Registries of XML and HTML formatters.
|
||||
XML_FORMATTERS = {}
|
||||
HTML_FORMATTERS = {}
|
||||
|
||||
HTML = 'html'
|
||||
XML = 'xml'
|
||||
|
||||
HTML_DEFAULTS = dict(
|
||||
cdata_containing_tags=set(["script", "style"]),
|
||||
)
|
||||
|
||||
def _default(self, language, value, kwarg):
|
||||
if value is not None:
|
||||
return value
|
||||
if language == self.XML:
|
||||
return set()
|
||||
return self.HTML_DEFAULTS[kwarg]
|
||||
|
||||
def __init__(
|
||||
self, language=None, entity_substitution=None,
|
||||
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||
):
|
||||
"""
|
||||
|
||||
:param void_element_close_prefix: By default, represent void
|
||||
elements as <tag/> rather than <tag>
|
||||
"""
|
||||
self.language = language
|
||||
self.entity_substitution = entity_substitution
|
||||
self.void_element_close_prefix = void_element_close_prefix
|
||||
self.cdata_containing_tags = self._default(
|
||||
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||
)
|
||||
|
||||
def substitute(self, ns):
|
||||
"""Process a string that needs to undergo entity substitution."""
|
||||
if not self.entity_substitution:
|
||||
return ns
|
||||
from .element import NavigableString
|
||||
if (isinstance(ns, NavigableString)
|
||||
and ns.parent is not None
|
||||
and ns.parent.name in self.cdata_containing_tags):
|
||||
# Do nothing.
|
||||
return ns
|
||||
# Substitute.
|
||||
return self.entity_substitution(ns)
|
||||
|
||||
def attribute_value(self, value):
|
||||
"""Process the value of an attribute."""
|
||||
return self.substitute(value)
|
||||
|
||||
def attributes(self, tag):
|
||||
"""Reorder a tag's attributes however you want."""
|
||||
return sorted(tag.attrs.items())
|
||||
|
||||
|
||||
class HTMLFormatter(Formatter):
|
||||
REGISTRY = {}
|
||||
def __init__(self, *args, **kwargs):
|
||||
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||
|
||||
|
||||
class XMLFormatter(Formatter):
|
||||
REGISTRY = {}
|
||||
def __init__(self, *args, **kwargs):
|
||||
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||
|
||||
|
||||
# Set up aliases for the default formatters.
|
||||
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html,
|
||||
void_element_close_prefix = None
|
||||
)
|
||||
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
|
||||
entity_substitution=None
|
||||
)
|
||||
XMLFormatter.REGISTRY["html"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
XMLFormatter.REGISTRY[None] = Formatter(
|
||||
Formatter(Formatter.XML, entity_substitution=None)
|
||||
)
|
||||
@@ -1,770 +0,0 @@
|
||||
"""Helper classes for tests."""
|
||||
|
||||
# Use of this source code is governed by a BSD-style license that can be
|
||||
# found in the LICENSE file.
|
||||
__license__ = "MIT"
|
||||
|
||||
import pickle
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder()
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder.test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
def assertConnectedness(self, element):
|
||||
"""Ensure that next_element and previous_element are properly
|
||||
set for all descendants of the given element.
|
||||
"""
|
||||
earlier = None
|
||||
for e in element.descendants:
|
||||
if earlier:
|
||||
self.assertEqual(e, earlier.next_element)
|
||||
self.assertEqual(earlier, e.previous_element)
|
||||
earlier = e
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
"""A basic test of a treebuilder's competence.
|
||||
|
||||
Any HTML treebuilder, present or future, should be able to pass
|
||||
these tests. With invalid markup, there's room for interpretation,
|
||||
and different parsers can handle it differently. But with the
|
||||
markup in these tests, there's not much room for interpretation.
|
||||
"""
|
||||
|
||||
def test_empty_element_tags(self):
|
||||
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
|
||||
are handled correctly.
|
||||
"""
|
||||
for name in [
|
||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||
'spacer', 'frame'
|
||||
]:
|
||||
soup = self.soup("")
|
||||
new_tag = soup.new_tag(name)
|
||||
self.assertEqual(True, new_tag.is_empty_element)
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def assertDoctypeHandled(self, doctype_fragment):
|
||||
"""Assert that a given doctype string is handled correctly."""
|
||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||
|
||||
# Make sure a Doctype object was created.
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual(doctype.__class__, Doctype)
|
||||
self.assertEqual(doctype, doctype_fragment)
|
||||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
||||
|
||||
# Make sure that the doctype was correctly associated with the
|
||||
# parse tree and that the rest of the document parsed.
|
||||
self.assertEqual(soup.p.contents[0], 'foo')
|
||||
|
||||
def _document_with_doctype(self, doctype_fragment):
|
||||
"""Generate and parse a document with the given doctype."""
|
||||
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
||||
markup = doctype + '\n<p>foo</p>'
|
||||
soup = self.soup(markup)
|
||||
return doctype, soup
|
||||
|
||||
def test_normal_doctypes(self):
|
||||
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||
self.assertDoctypeHandled("html")
|
||||
self.assertDoctypeHandled(
|
||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_public_doctype_with_url(self):
|
||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||
self.assertDoctypeHandled(doctype)
|
||||
|
||||
def test_system_doctype(self):
|
||||
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# We can handle a namespaced doctype with a system ID.
|
||||
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# Test a namespaced doctype with a public id.
|
||||
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out more or less the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8").replace(b"\n", b""),
|
||||
markup.replace(b"\n", b""))
|
||||
|
||||
def test_processing_instruction(self):
|
||||
# We test both Unicode and bytestring to verify that
|
||||
# process_markup correctly sets processing_instruction_class
|
||||
# even when the markup is already Unicode and there is no
|
||||
# need to process anything.
|
||||
markup = u"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.decode())
|
||||
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_deepcopy(self):
|
||||
"""Make sure you can copy the tree builder.
|
||||
|
||||
This is important because the builder is part of a
|
||||
BeautifulSoup object, and we want to be able to copy that.
|
||||
"""
|
||||
copy.deepcopy(self.default_builder)
|
||||
|
||||
def test_p_tag_is_never_empty_element(self):
|
||||
"""A <p> tag is never designated as an empty-element tag.
|
||||
|
||||
Even if the markup shows it as an empty-element tag, it
|
||||
shouldn't be presented that way.
|
||||
"""
|
||||
soup = self.soup("<p/>")
|
||||
self.assertFalse(soup.p.is_empty_element)
|
||||
self.assertEqual(str(soup.p), "<p></p>")
|
||||
|
||||
def test_unclosed_tags_get_closed(self):
|
||||
"""A tag that's not closed by the end of the document should be closed.
|
||||
|
||||
This applies to all tags except empty-element tags.
|
||||
"""
|
||||
self.assertSoupEquals("<p>", "<p></p>")
|
||||
self.assertSoupEquals("<b>", "<b></b>")
|
||||
|
||||
self.assertSoupEquals("<br>", "<br/>")
|
||||
|
||||
def test_br_is_always_empty_element_tag(self):
|
||||
"""A <br> tag is designated as an empty-element tag.
|
||||
|
||||
Some parsers treat <br></br> as one <br/> tag, some parsers as
|
||||
two tags, but it should always be an empty-element tag.
|
||||
"""
|
||||
soup = self.soup("<br></br>")
|
||||
self.assertTrue(soup.br.is_empty_element)
|
||||
self.assertEqual(str(soup.br), "<br/>")
|
||||
|
||||
def test_nested_formatting_elements(self):
|
||||
self.assertSoupEquals("<em><em></em></em>")
|
||||
|
||||
def test_double_head(self):
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Ordinary HEAD element test</title>
|
||||
</head>
|
||||
<script type="text/javascript">
|
||||
alert("Help!");
|
||||
</script>
|
||||
<body>
|
||||
Hello, world!
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
soup = self.soup(html)
|
||||
self.assertEqual("text/javascript", soup.find('script')['type'])
|
||||
|
||||
def test_comment(self):
|
||||
# Comments are represented as Comment objects.
|
||||
markup = "<p>foo<!--foobar-->baz</p>"
|
||||
self.assertSoupEquals(markup)
|
||||
|
||||
soup = self.soup(markup)
|
||||
comment = soup.find(text="foobar")
|
||||
self.assertEqual(comment.__class__, Comment)
|
||||
|
||||
# The comment is properly integrated into the tree.
|
||||
foo = soup.find(text="foo")
|
||||
self.assertEqual(comment, foo.next_element)
|
||||
baz = soup.find(text="baz")
|
||||
self.assertEqual(comment, baz.previous_element)
|
||||
|
||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||
even if that would mean not prettifying the markup.
|
||||
"""
|
||||
pre_markup = "<pre> </pre>"
|
||||
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
||||
self.assertSoupEquals(pre_markup)
|
||||
self.assertSoupEquals(textarea_markup)
|
||||
|
||||
soup = self.soup(pre_markup)
|
||||
self.assertEqual(soup.pre.prettify(), pre_markup)
|
||||
|
||||
soup = self.soup(textarea_markup)
|
||||
self.assertEqual(soup.textarea.prettify(), textarea_markup)
|
||||
|
||||
soup = self.soup("<textarea></textarea>")
|
||||
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
b_tag = "<b>Inside a B tag</b>"
|
||||
self.assertSoupEquals(b_tag)
|
||||
|
||||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
def test_nested_block_level_elements(self):
|
||||
"""Block elements can be nested."""
|
||||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||
blockquote = soup.blockquote
|
||||
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||
self.assertEqual(blockquote.b.string, 'Foo')
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""One table can go inside another one."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||
'</td></tr></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_deeply_nested_multivalued_attribute(self):
|
||||
# html5lib can set the attributes of the same tag many times
|
||||
# as it rearranges the tree. This has caused problems with
|
||||
# multivalued attributes.
|
||||
markup = '<table><div><div class="css"></div></div></table>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["css"], soup.div.div['class'])
|
||||
|
||||
def test_multivalued_attribute_on_html(self):
|
||||
# html5lib uses a different API to set the attributes ot the
|
||||
# <html> tag. This has caused problems with multivalued
|
||||
# attributes.
|
||||
markup = '<html class="a b"></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["a", "b"], soup.html['class'])
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = u'<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = u'<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = u"\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_multipart_strings(self):
|
||||
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||
self.assertEqual("p", soup.p.name)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_empty_element_tags(self):
|
||||
"""Verify consistent handling of empty-element tags,
|
||||
no matter how they come in through the markup.
|
||||
"""
|
||||
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
|
||||
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
|
||||
|
||||
def test_head_tag_between_head_and_body(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<html><head></head>
|
||||
<link></link>
|
||||
<body>foo</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertNotEqual(None, soup.html.body)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_multiple_copies_of_a_tag(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<article id="a" >
|
||||
<div><a href="1"></div>
|
||||
<footer>
|
||||
<a href="2"></a>
|
||||
</footer>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertConnectedness(soup.article)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
data."""
|
||||
|
||||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode())
|
||||
html = soup.html
|
||||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||
|
||||
def test_multivalued_attribute_value_becomes_list(self):
|
||||
markup = b'<a class="foo bar">'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||
|
||||
#
|
||||
# Generally speaking, tests below this point are more tests of
|
||||
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||
# weird, so we run these tests separately for every tree builder
|
||||
# to detect any differences between them.
|
||||
#
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
# A seemingly innocuous document... but it's in Unicode! And
|
||||
# it contains characters that can't be represented in the
|
||||
# encoding found in the declaration! The horror!
|
||||
markup = u'<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.body.string)
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||
parse_only=strainer)
|
||||
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||
|
||||
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||
'<foo attr="bar"></foo>')
|
||||
|
||||
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
self.assertSoupEquals(text)
|
||||
|
||||
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
soup = self.soup(text)
|
||||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||
self.assertSoupEquals(
|
||||
soup.foo.decode(),
|
||||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||
|
||||
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||
'<this is="really messed up & stuff"></this>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||
|
||||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
|
||||
|
||||
def test_entities_in_strings_converted_during_parsing(self):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
# Microsoft smart quotes are converted to Unicode characters during
|
||||
# parsing.
|
||||
quote = b"<p>\x91Foo\x92</p>"
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
u"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, u"\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = u"<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
def test_real_iso_latin_document(self):
|
||||
# Smoke test of interrelated functionality, using an
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = u'<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||
|
||||
# Parse the ISO-Latin-1 HTML.
|
||||
soup = self.soup(iso_latin_html)
|
||||
# Encode it to UTF-8.
|
||||
result = soup.encode("utf-8")
|
||||
|
||||
# What do we expect the result to look like? Well, it would
|
||||
# look like unicode_html, except that the META tag would say
|
||||
# UTF-8 instead of ISO-Latin-1.
|
||||
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||
|
||||
# And, of course, it would be in UTF-8, not Unicode.
|
||||
expected = expected.encode("utf-8")
|
||||
|
||||
# Ta-da!
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_real_shift_jis_document(self):
|
||||
# Smoke test to make sure the parser can handle a document in
|
||||
# Shift-JIS encoding, without choking.
|
||||
shift_jis_html = (
|
||||
b'<html><head></head><body><pre>'
|
||||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||
b'</pre></body></html>')
|
||||
unicode_html = shift_jis_html.decode("shift-jis")
|
||||
soup = self.soup(unicode_html)
|
||||
|
||||
# Make sure the parse tree is correctly encoded to various
|
||||
# encodings.
|
||||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||
|
||||
def test_real_hebrew_document(self):
|
||||
# A real-world test to make sure we can convert ISO-8859-9 (a
|
||||
# Hebrew encoding) to UTF-8.
|
||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||
soup = self.soup(
|
||||
hebrew_document, from_encoding="iso8859-8")
|
||||
# Some tree builders call it iso8859-8, others call it iso-8859-9.
|
||||
# That's not a difference we really care about.
|
||||
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
|
||||
self.assertEqual(
|
||||
soup.encode('utf-8'),
|
||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||
|
||||
def test_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||
'http-equiv="Content-type"/>')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||
content = parsed_meta['content']
|
||||
self.assertEqual('text/html; charset=x-sjis', content)
|
||||
|
||||
# But that value is actually a ContentMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||
|
||||
# For the rest of the story, see TestSubstitutions in
|
||||
# test_tree.py.
|
||||
|
||||
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', id="encoding")
|
||||
charset = parsed_meta['charset']
|
||||
self.assertEqual('x-sjis', charset)
|
||||
|
||||
# But that value is actually a CharsetMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('utf8', charset.encode("utf8"))
|
||||
|
||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||
data = self.soup("<a>text</a>")
|
||||
data.a['foo'] = 'bar'
|
||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_xml_declaration(self):
|
||||
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_processing_instruction(self):
|
||||
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||
doc = """
|
||||
<script type="text/javascript">
|
||||
</script>
|
||||
"""
|
||||
soup = BeautifulSoup(doc, "lxml-xml")
|
||||
# lxml would have stripped this while parsing, but we can add
|
||||
# it later.
|
||||
soup.script.string = 'console.log("< < hey > > ");'
|
||||
encoded = soup.encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
markup = u'<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u'Sacr\xe9 bleu!', soup.root.string)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
unicode(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
def test_closing_namespaced_tag(self):
|
||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.p), markup)
|
||||
|
||||
def test_namespaced_attributes(self):
|
||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(unicode(soup.foo), markup)
|
||||
|
||||
def test_find_by_prefixed_name(self):
|
||||
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||
<Document xmlns="http://example.com/ns0"
|
||||
xmlns:ns1="http://example.com/ns1"
|
||||
xmlns:ns2="http://example.com/ns2"
|
||||
<ns1:tag>foo</ns1:tag>
|
||||
<ns1:tag>bar</ns1:tag>
|
||||
<ns2:tag key="value">baz</ns2:tag>
|
||||
</Document>
|
||||
"""
|
||||
soup = self.soup(doc)
|
||||
|
||||
# There are three <tag> tags.
|
||||
self.assertEqual(3, len(soup.find_all('tag')))
|
||||
|
||||
# But two of them are ns1:tag and one of them is ns2:tag.
|
||||
self.assertEqual(2, len(soup.find_all('ns1:tag')))
|
||||
self.assertEqual(1, len(soup.find_all('ns2:tag')))
|
||||
|
||||
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
|
||||
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
|
||||
|
||||
def test_copy_tag_preserves_namespace(self):
|
||||
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:w="http://example.com/ns0"/>"""
|
||||
|
||||
soup = self.soup(xml)
|
||||
tag = soup.document
|
||||
duplicate = copy.copy(tag)
|
||||
|
||||
# The two tags have the same namespace prefix.
|
||||
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||
# XHTML documents in any particular way.
|
||||
pass
|
||||
|
||||
def test_html_tags_have_namespace(self):
|
||||
markup = "<a>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||
|
||||
def test_svg_tags_have_namespace(self):
|
||||
markup = '<svg><circle/></svg>'
|
||||
soup = self.soup(markup)
|
||||
namespace = "http://www.w3.org/2000/svg"
|
||||
self.assertEqual(namespace, soup.svg.namespace)
|
||||
self.assertEqual(namespace, soup.circle.namespace)
|
||||
|
||||
|
||||
def test_mathml_tags_have_namespace(self):
|
||||
markup = '<math><msqrt>5</msqrt></math>'
|
||||
soup = self.soup(markup)
|
||||
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
def test_xml_declaration_becomes_comment(self):
|
||||
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
return None
|
||||
|
||||
def decorator(test_item):
|
||||
if condition:
|
||||
return nothing
|
||||
else:
|
||||
return test_item
|
||||
|
||||
return decorator
|
||||
@@ -1 +0,0 @@
|
||||
"The beautifulsoup tests."
|
||||
@@ -1,147 +0,0 @@
|
||||
"""Tests of the builder registry."""
|
||||
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.builder import (
|
||||
builder_registry as registry,
|
||||
HTMLParserTreeBuilder,
|
||||
TreeBuilderRegistry,
|
||||
)
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError:
|
||||
HTML5LIB_PRESENT = False
|
||||
|
||||
try:
|
||||
from bs4.builder import (
|
||||
LXMLTreeBuilderForXML,
|
||||
LXMLTreeBuilder,
|
||||
)
|
||||
LXML_PRESENT = True
|
||||
except ImportError:
|
||||
LXML_PRESENT = False
|
||||
|
||||
|
||||
class BuiltInRegistryTest(unittest.TestCase):
|
||||
"""Test the built-in registry with the default builders registered."""
|
||||
|
||||
def test_combination(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('fast', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('strict', 'html'),
|
||||
HTMLParserTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
def test_lookup_by_markup_type(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('xml'), None)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||
|
||||
def test_named_library(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
self.assertEqual(registry.lookup('html.parser'),
|
||||
HTMLParserTreeBuilder)
|
||||
|
||||
def test_beautifulsoup_constructor_does_lookup(self):
|
||||
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
# This will create a warning about not explicitly
|
||||
# specifying a parser, but we'll ignore it.
|
||||
|
||||
# You can pass in a string.
|
||||
BeautifulSoup("", features="html")
|
||||
# Or a list of strings.
|
||||
BeautifulSoup("", features=["html", "fast"])
|
||||
|
||||
# You'll get an exception if BS can't find an appropriate
|
||||
# builder.
|
||||
self.assertRaises(ValueError, BeautifulSoup,
|
||||
"", features="no-such-feature")
|
||||
|
||||
class RegistryTest(unittest.TestCase):
|
||||
"""Test the TreeBuilderRegistry class in general."""
|
||||
|
||||
def setUp(self):
|
||||
self.registry = TreeBuilderRegistry()
|
||||
|
||||
def builder_for_features(self, *feature_list):
|
||||
cls = type('Builder_' + '_'.join(feature_list),
|
||||
(object,), {'features' : feature_list})
|
||||
|
||||
self.registry.register(cls)
|
||||
return cls
|
||||
|
||||
def test_register_with_no_features(self):
|
||||
builder = self.builder_for_features()
|
||||
|
||||
# Since the builder advertises no features, you can't find it
|
||||
# by looking up features.
|
||||
self.assertEqual(self.registry.lookup('foo'), None)
|
||||
|
||||
# But you can find it by doing a lookup with no features, if
|
||||
# this happens to be the only registered builder.
|
||||
self.assertEqual(self.registry.lookup(), builder)
|
||||
|
||||
def test_register_with_features_makes_lookup_succeed(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||
|
||||
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('baz'), None)
|
||||
|
||||
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||
builder1 = self.builder_for_features('foo')
|
||||
builder2 = self.builder_for_features('bar')
|
||||
self.assertEqual(self.registry.lookup(), builder2)
|
||||
|
||||
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||
self.assertEqual(self.registry.lookup(), None)
|
||||
|
||||
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||
has_one = self.builder_for_features('foo')
|
||||
has_the_other = self.builder_for_features('bar')
|
||||
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||
lacks_one = self.builder_for_features('bar')
|
||||
has_the_other = self.builder_for_features('foo')
|
||||
|
||||
# There are two builders featuring 'foo' and 'bar', but
|
||||
# the one that also features 'quux' was registered later.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||
has_both_late)
|
||||
|
||||
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||
has_both_early)
|
||||
|
||||
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||
builder1 = self.builder_for_features('foo', 'bar')
|
||||
builder2 = self.builder_for_features('foo', 'baz')
|
||||
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
||||
@@ -1,36 +0,0 @@
|
||||
"Test harness for doctests."
|
||||
|
||||
# pylint: disable-msg=E0611,W0142
|
||||
|
||||
__metaclass__ = type
|
||||
__all__ = [
|
||||
'additional_tests',
|
||||
]
|
||||
|
||||
import atexit
|
||||
import doctest
|
||||
import os
|
||||
#from pkg_resources import (
|
||||
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||
import unittest
|
||||
|
||||
DOCTEST_FLAGS = (
|
||||
doctest.ELLIPSIS |
|
||||
doctest.NORMALIZE_WHITESPACE |
|
||||
doctest.REPORT_NDIFF)
|
||||
|
||||
|
||||
# def additional_tests():
|
||||
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||
# doctest_files = [
|
||||
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||
# if resource_exists('bs4', 'docs'):
|
||||
# for name in resource_listdir('bs4', 'docs'):
|
||||
# if name.endswith('.txt'):
|
||||
# doctest_files.append(
|
||||
# os.path.abspath(
|
||||
# resource_filename('bs4', 'docs/%s' % name)))
|
||||
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||
# atexit.register(cleanup_resources)
|
||||
# return unittest.TestSuite((
|
||||
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
||||
@@ -1,130 +0,0 @@
|
||||
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError, e:
|
||||
HTML5LIB_PRESENT = False
|
||||
from bs4.element import SoupStrainer
|
||||
from bs4.testing import (
|
||||
HTML5TreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not HTML5LIB_PRESENT,
|
||||
"html5lib seems not to be present, not testing its tree builder.")
|
||||
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTML5TreeBuilder()
|
||||
|
||||
def test_soupstrainer(self):
|
||||
# The html5lib tree builder does not support SoupStrainers.
|
||||
strainer = SoupStrainer("b")
|
||||
markup = "<p>A <b>bold</b> statement.</p>"
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(
|
||||
soup.decode(), self.document_for(markup))
|
||||
|
||||
self.assertTrue(
|
||||
"the html5lib tree builder doesn't support parse_only" in
|
||||
str(w[0].message))
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||
'</td></tr></tbody></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_xml_declaration_followed_by_doctype(self):
|
||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
</body>
|
||||
</html>'''
|
||||
soup = self.soup(markup)
|
||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||
|
||||
def test_reparented_markup(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
|
||||
def test_reparented_markup_ends_with_whitespace(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||
"""Verify that we keep the two whitespace nodes in this
|
||||
document distinct when reparenting the adjacent <tbody> tags.
|
||||
"""
|
||||
markup = '<table> <tbody><tbody><ims></tbody> </table>'
|
||||
soup = self.soup(markup)
|
||||
space1, space2 = soup.find_all(string=' ')
|
||||
tbody1, tbody2 = soup.find_all('tbody')
|
||||
assert space1.next_element is tbody1
|
||||
assert tbody2.next_element is space2
|
||||
|
||||
def test_reparented_markup_containing_children(self):
|
||||
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||
soup = self.soup(markup)
|
||||
noscript = soup.noscript
|
||||
self.assertEqual("target", noscript.next_element)
|
||||
target = soup.find(string='target')
|
||||
|
||||
# The 'aftermath' string was duplicated; we want the second one.
|
||||
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||
|
||||
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||
# but the 'target' string within is still connected to the
|
||||
# (second) 'aftermath' string.
|
||||
self.assertEqual(final_aftermath, target.next_element)
|
||||
self.assertEqual(target, final_aftermath.previous_element)
|
||||
|
||||
def test_processing_instruction(self):
|
||||
"""Processing instructions become comments."""
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
assert str(soup).startswith("<!--?PITarget PIContent?-->")
|
||||
|
||||
def test_cloned_multivalue_node(self):
|
||||
markup = b"""<a class="my_class"><p></a>"""
|
||||
soup = self.soup(markup)
|
||||
a1, a2 = soup.find_all('a')
|
||||
self.assertEqual(a1, a2)
|
||||
assert a1 is not a2
|
||||
|
||||
def test_foster_parenting(self):
|
||||
markup = b"""<table><td></tbody>A"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(u"<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||
@@ -1,34 +0,0 @@
|
||||
"""Tests to ensure that the html.parser tree builder generates good
|
||||
trees."""
|
||||
|
||||
from pdb import set_trace
|
||||
import pickle
|
||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
|
||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTMLParserTreeBuilder()
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_builder_is_pickled(self):
|
||||
"""Unlike most tree builders, HTMLParserTreeBuilder and will
|
||||
be restored after pickling.
|
||||
"""
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||
|
||||
def test_redundant_empty_element_closing_tags(self):
|
||||
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||
self.assertSoupEquals('</br></br></br>', "")
|
||||
@@ -1,76 +0,0 @@
|
||||
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
try:
|
||||
import lxml.etree
|
||||
LXML_PRESENT = True
|
||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||
except ImportError, e:
|
||||
LXML_PRESENT = False
|
||||
LXML_VERSION = (0,)
|
||||
|
||||
if LXML_PRESENT:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import Comment, Doctype, SoupStrainer
|
||||
from bs4.testing import skipIf
|
||||
from bs4.tests import test_htmlparser
|
||||
from bs4.testing import (
|
||||
HTMLTreeBuilderSmokeTest,
|
||||
XMLTreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its tree builder.")
|
||||
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilder()
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
|
||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||
# test if an old version of lxml is installed.
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_beautifulstonesoup_is_xml_parser(self):
|
||||
# Make sure that the deprecated BSS class uses an xml builder
|
||||
# if one is installed.
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = BeautifulStoneSoup("<b />")
|
||||
self.assertEqual(u"<b/>", unicode(soup.b))
|
||||
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its XML tree builder.")
|
||||
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilderForXML()
|
||||
@@ -1,501 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests of Beautiful Soup as a whole."""
|
||||
|
||||
from pdb import set_trace
|
||||
import logging
|
||||
import unittest
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
SoupStrainer,
|
||||
NamespacedAttribute,
|
||||
)
|
||||
import bs4.dammit
|
||||
from bs4.dammit import (
|
||||
EntitySubstitution,
|
||||
UnicodeDammit,
|
||||
EncodingDetector,
|
||||
)
|
||||
from bs4.testing import (
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
LXML_PRESENT = True
|
||||
except ImportError, e:
|
||||
LXML_PRESENT = False
|
||||
|
||||
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||
|
||||
class TestConstructor(SoupTest):
|
||||
|
||||
def test_short_unicode_input(self):
|
||||
data = u"<h1>éé</h1>"
|
||||
soup = self.soup(data)
|
||||
self.assertEqual(u"éé", soup.h1.string)
|
||||
|
||||
def test_embedded_null(self):
|
||||
data = u"<h1>foo\0bar</h1>"
|
||||
soup = self.soup(data)
|
||||
self.assertEqual(u"foo\0bar", soup.h1.string)
|
||||
|
||||
def test_exclude_encodings(self):
|
||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
||||
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||
self.assertEqual("windows-1252", soup.original_encoding)
|
||||
|
||||
|
||||
class TestWarnings(SoupTest):
|
||||
|
||||
def _no_parser_specified(self, s, is_there=True):
|
||||
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
|
||||
self.assertTrue(v)
|
||||
|
||||
def test_warning_if_no_parser_specified(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>")
|
||||
msg = str(w[0].message)
|
||||
self._assert_no_parser_specified(msg)
|
||||
|
||||
def test_warning_if_parser_specified_too_vague(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", "html")
|
||||
msg = str(w[0].message)
|
||||
self._assert_no_parser_specified(msg)
|
||||
|
||||
def test_no_warning_if_explicit_parser_specified(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", "html.parser")
|
||||
self.assertEqual([], w)
|
||||
|
||||
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("parseOnlyThese" in msg)
|
||||
self.assertTrue("parse_only" in msg)
|
||||
self.assertEqual(b"<b></b>", soup.encode())
|
||||
|
||||
def test_fromEncoding_renamed_to_from_encoding(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
utf8 = b"\xc3\xa9"
|
||||
soup = self.soup(utf8, fromEncoding="utf8")
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("fromEncoding" in msg)
|
||||
self.assertTrue("from_encoding" in msg)
|
||||
self.assertEqual("utf8", soup.original_encoding)
|
||||
|
||||
def test_unrecognized_keyword_argument(self):
|
||||
self.assertRaises(
|
||||
TypeError, self.soup, "<a>", no_such_argument=True)
|
||||
|
||||
class TestWarnings(SoupTest):
|
||||
|
||||
def test_disk_file_warning(self):
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
try:
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(filename)
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("looks like a filename" in msg)
|
||||
finally:
|
||||
filehandle.close()
|
||||
|
||||
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(filename)
|
||||
self.assertEqual(0, len(w))
|
||||
|
||||
def test_url_warning_with_bytes_url(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup(b"http://www.crummybytes.com/")
|
||||
# Be aware this isn't the only warning that can be raised during
|
||||
# execution..
|
||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_unicode_url(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
# note - this url must differ from the bytes one otherwise
|
||||
# python's warnings system swallows the second warning
|
||||
soup = self.soup(u"http://www.crummyunicode.com/")
|
||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_bytes_and_space(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_unicode_and_space(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup(u"http://www.crummyuncode.com/ is great")
|
||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
|
||||
class TestSelectiveParsing(SoupTest):
|
||||
|
||||
def test_parse_with_soupstrainer(self):
|
||||
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
||||
|
||||
|
||||
class TestEntitySubstitution(unittest.TestCase):
|
||||
"""Standalone tests of the EntitySubstitution class."""
|
||||
def setUp(self):
|
||||
self.sub = EntitySubstitution
|
||||
|
||||
def test_simple_html_substitution(self):
|
||||
# Unicode characters corresponding to named HTML entites
|
||||
# are substituted, and no others.
|
||||
s = u"foo\u2200\N{SNOWMAN}\u00f5bar"
|
||||
self.assertEqual(self.sub.substitute_html(s),
|
||||
u"foo∀\N{SNOWMAN}õbar")
|
||||
|
||||
def test_smart_quote_substitution(self):
|
||||
# MS smart quotes are a common source of frustration, so we
|
||||
# give them a special test.
|
||||
quotes = b"\x91\x92foo\x93\x94"
|
||||
dammit = UnicodeDammit(quotes)
|
||||
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
||||
"‘’foo“”")
|
||||
|
||||
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
||||
s = 'Welcome to "my bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
||||
|
||||
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
||||
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
||||
'"Welcome"')
|
||||
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
||||
'"Bob\'s Bar"')
|
||||
|
||||
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
||||
s = 'Welcome to "my bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(s, True),
|
||||
"'Welcome to \"my bar\"'")
|
||||
|
||||
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
||||
s = 'Welcome to "Bob\'s Bar"'
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml(s, True),
|
||||
'"Welcome to "Bob\'s Bar""')
|
||||
|
||||
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
||||
quoted = 'Welcome to "Bob\'s Bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
||||
|
||||
def test_xml_quoting_handles_angle_brackets(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml("foo<bar>"),
|
||||
"foo<bar>")
|
||||
|
||||
def test_xml_quoting_handles_ampersands(self):
|
||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||
|
||||
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml("ÁT&T"),
|
||||
"&Aacute;T&T")
|
||||
|
||||
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||
"ÁT&T")
|
||||
|
||||
def test_quotes_not_html_substituted(self):
|
||||
"""There's no need to do this except inside attribute values."""
|
||||
text = 'Bob\'s "bar"'
|
||||
self.assertEqual(self.sub.substitute_html(text), text)
|
||||
|
||||
|
||||
class TestEncodingConversion(SoupTest):
|
||||
# Test Beautiful Soup's ability to decode and encode from various
|
||||
# encodings.
|
||||
|
||||
def setUp(self):
|
||||
super(TestEncodingConversion, self).setUp()
|
||||
self.unicode_data = u'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||
# Just so you know what it looks like.
|
||||
self.assertEqual(
|
||||
self.utf8_data,
|
||||
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
||||
|
||||
def test_ascii_in_unicode_out(self):
|
||||
# ASCII input is converted to Unicode. The original_encoding
|
||||
# attribute is set to 'utf-8', a superset of ASCII.
|
||||
chardet = bs4.dammit.chardet_dammit
|
||||
logging.disable(logging.WARNING)
|
||||
try:
|
||||
def noop(str):
|
||||
return None
|
||||
# Disable chardet, which will realize that the ASCII is ASCII.
|
||||
bs4.dammit.chardet_dammit = noop
|
||||
ascii = b"<foo>a</foo>"
|
||||
soup_from_ascii = self.soup(ascii)
|
||||
unicode_output = soup_from_ascii.decode()
|
||||
self.assertTrue(isinstance(unicode_output, unicode))
|
||||
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
||||
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
||||
finally:
|
||||
logging.disable(logging.NOTSET)
|
||||
bs4.dammit.chardet_dammit = chardet
|
||||
|
||||
def test_unicode_in_unicode_out(self):
|
||||
# Unicode input is left alone. The original_encoding attribute
|
||||
# is not set.
|
||||
soup_from_unicode = self.soup(self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!')
|
||||
self.assertEqual(soup_from_unicode.original_encoding, None)
|
||||
|
||||
def test_utf8_in_unicode_out(self):
|
||||
# UTF-8 input is converted to Unicode. The original_encoding
|
||||
# attribute is set.
|
||||
soup_from_utf8 = self.soup(self.utf8_data)
|
||||
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
||||
self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
|
||||
|
||||
def test_utf8_out(self):
|
||||
# The internal data structures can be encoded as UTF-8.
|
||||
soup_from_unicode = self.soup(self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
|
||||
|
||||
@skipIf(
|
||||
PYTHON_3_PRE_3_2,
|
||||
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
||||
def test_attribute_name_containing_unicode_characters(self):
|
||||
markup = u'<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||
|
||||
class TestUnicodeDammit(unittest.TestCase):
|
||||
"""Standalone tests of UnicodeDammit."""
|
||||
|
||||
def test_unicode_input(self):
|
||||
markup = u"I'm already Unicode! \N{SNOWMAN}"
|
||||
dammit = UnicodeDammit(markup)
|
||||
self.assertEqual(dammit.unicode_markup, markup)
|
||||
|
||||
def test_smart_quotes_to_unicode(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup)
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, u"<foo>\u2018\u2019\u201c\u201d</foo>")
|
||||
|
||||
def test_smart_quotes_to_xml_entities(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
||||
|
||||
def test_smart_quotes_to_html_entities(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="html")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
||||
|
||||
def test_smart_quotes_to_ascii(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, """<foo>''""</foo>""")
|
||||
|
||||
def test_detect_utf8(self):
|
||||
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
||||
dammit = UnicodeDammit(utf8)
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
self.assertEqual(dammit.unicode_markup, u'Sacr\xe9 bleu! \N{SNOWMAN}')
|
||||
|
||||
|
||||
def test_convert_hebrew(self):
|
||||
hebrew = b"\xed\xe5\xec\xf9"
|
||||
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
||||
self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
|
||||
|
||||
def test_dont_see_smart_quotes_where_there_are_none(self):
|
||||
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
||||
dammit = UnicodeDammit(utf_8)
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
||||
|
||||
def test_ignore_inappropriate_codecs(self):
|
||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
||||
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
|
||||
def test_ignore_invalid_codecs(self):
|
||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
||||
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
||||
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
|
||||
def test_exclude_encodings(self):
|
||||
# This is UTF-8.
|
||||
utf8_data = u"Räksmörgås".encode("utf-8")
|
||||
|
||||
# But if we exclude UTF-8 from consideration, the guess is
|
||||
# Windows-1252.
|
||||
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
|
||||
|
||||
# And if we exclude that, there is no valid guess at all.
|
||||
dammit = UnicodeDammit(
|
||||
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
|
||||
self.assertEqual(dammit.original_encoding, None)
|
||||
|
||||
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
|
||||
detected = EncodingDetector(
|
||||
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
|
||||
encodings = list(detected.encodings)
|
||||
assert u'utf-\N{REPLACEMENT CHARACTER}' in encodings
|
||||
|
||||
def test_detect_html5_style_meta_tag(self):
|
||||
|
||||
for data in (
|
||||
b'<html><meta charset="euc-jp" /></html>',
|
||||
b"<html><meta charset='euc-jp' /></html>",
|
||||
b"<html><meta charset=euc-jp /></html>",
|
||||
b"<html><meta charset=euc-jp/></html>"):
|
||||
dammit = UnicodeDammit(data, is_html=True)
|
||||
self.assertEqual(
|
||||
"euc-jp", dammit.original_encoding)
|
||||
|
||||
def test_last_ditch_entity_replacement(self):
|
||||
# This is a UTF-8 document that contains bytestrings
|
||||
# completely incompatible with UTF-8 (ie. encoded with some other
|
||||
# encoding).
|
||||
#
|
||||
# Since there is no consistent encoding for the document,
|
||||
# Unicode, Dammit will eventually encode the document as UTF-8
|
||||
# and encode the incompatible characters as REPLACEMENT
|
||||
# CHARACTER.
|
||||
#
|
||||
# If chardet is installed, it will detect that the document
|
||||
# can be converted into ISO-8859-1 without errors. This happens
|
||||
# to be the wrong encoding, but it is a consistent encoding, so the
|
||||
# code we're testing here won't run.
|
||||
#
|
||||
# So we temporarily disable chardet if it's present.
|
||||
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
|
||||
<html><b>\330\250\330\252\330\261</b>
|
||||
<i>\310\322\321\220\312\321\355\344</i></html>"""
|
||||
chardet = bs4.dammit.chardet_dammit
|
||||
logging.disable(logging.WARNING)
|
||||
try:
|
||||
def noop(str):
|
||||
return None
|
||||
bs4.dammit.chardet_dammit = noop
|
||||
dammit = UnicodeDammit(doc)
|
||||
self.assertEqual(True, dammit.contains_replacement_characters)
|
||||
self.assertTrue(u"\ufffd" in dammit.unicode_markup)
|
||||
|
||||
soup = BeautifulSoup(doc, "html.parser")
|
||||
self.assertTrue(soup.contains_replacement_characters)
|
||||
finally:
|
||||
logging.disable(logging.NOTSET)
|
||||
bs4.dammit.chardet_dammit = chardet
|
||||
|
||||
def test_byte_order_mark_removed(self):
|
||||
# A document written in UTF-16LE will have its byte order marker stripped.
|
||||
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
||||
dammit = UnicodeDammit(data)
|
||||
self.assertEqual(u"<a>áé</a>", dammit.unicode_markup)
|
||||
self.assertEqual("utf-16le", dammit.original_encoding)
|
||||
|
||||
def test_detwingle(self):
|
||||
# Here's a UTF8 document.
|
||||
utf8 = (u"\N{SNOWMAN}" * 3).encode("utf8")
|
||||
|
||||
# Here's a Windows-1252 document.
|
||||
windows_1252 = (
|
||||
u"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
||||
u"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
||||
|
||||
# Through some unholy alchemy, they've been stuck together.
|
||||
doc = utf8 + windows_1252 + utf8
|
||||
|
||||
# The document can't be turned into UTF-8:
|
||||
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
|
||||
|
||||
# Unicode, Dammit thinks the whole document is Windows-1252,
|
||||
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
|
||||
|
||||
# But if we run it through fix_embedded_windows_1252, it's fixed:
|
||||
|
||||
fixed = UnicodeDammit.detwingle(doc)
|
||||
self.assertEqual(
|
||||
u"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
||||
|
||||
def test_detwingle_ignores_multibyte_characters(self):
|
||||
# Each of these characters has a UTF-8 representation ending
|
||||
# in \x93. \x93 is a smart quote if interpreted as
|
||||
# Windows-1252. But our code knows to skip over multibyte
|
||||
# UTF-8 characters, so they'll survive the process unscathed.
|
||||
for tricky_unicode_char in (
|
||||
u"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
||||
u"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
||||
u"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
||||
):
|
||||
input = tricky_unicode_char.encode("utf8")
|
||||
self.assertTrue(input.endswith(b'\x93'))
|
||||
output = UnicodeDammit.detwingle(input)
|
||||
self.assertEqual(output, input)
|
||||
|
||||
class TestNamedspacedAttribute(SoupTest):
|
||||
|
||||
def test_name_may_be_none(self):
|
||||
a = NamespacedAttribute("xmlns", None)
|
||||
self.assertEqual(a, "xmlns")
|
||||
|
||||
def test_attribute_is_equivalent_to_colon_separated_string(self):
|
||||
a = NamespacedAttribute("a", "b")
|
||||
self.assertEqual("a:b", a)
|
||||
|
||||
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
|
||||
a = NamespacedAttribute("a", "b", "c")
|
||||
b = NamespacedAttribute("a", "b", "c")
|
||||
self.assertEqual(a, b)
|
||||
|
||||
# The actual namespace is not considered.
|
||||
c = NamespacedAttribute("a", "b", None)
|
||||
self.assertEqual(a, c)
|
||||
|
||||
# But name and prefix are important.
|
||||
d = NamespacedAttribute("a", "z", "c")
|
||||
self.assertNotEqual(a, d)
|
||||
|
||||
e = NamespacedAttribute("z", "b", "c")
|
||||
self.assertNotEqual(a, e)
|
||||
|
||||
|
||||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
|
||||
|
||||
def test_content_meta_attribute_value(self):
|
||||
value = CharsetMetaAttributeValue("euc-jp")
|
||||
self.assertEqual("euc-jp", value)
|
||||
self.assertEqual("euc-jp", value.original_value)
|
||||
self.assertEqual("utf8", value.encode("utf8"))
|
||||
|
||||
|
||||
def test_content_meta_attribute_value(self):
|
||||
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
|
||||
self.assertEqual("text/html; charset=euc-jp", value)
|
||||
self.assertEqual("text/html; charset=euc-jp", value.original_value)
|
||||
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,3 +0,0 @@
|
||||
from pkgutil import extend_path
|
||||
|
||||
__path__ = extend_path(__path__, __name__)
|
||||
@@ -1,23 +0,0 @@
|
||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Execute computations asynchronously using threads or processes."""
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
from concurrent.futures._base import (FIRST_COMPLETED,
|
||||
FIRST_EXCEPTION,
|
||||
ALL_COMPLETED,
|
||||
CancelledError,
|
||||
TimeoutError,
|
||||
Future,
|
||||
Executor,
|
||||
wait,
|
||||
as_completed)
|
||||
from concurrent.futures.thread import ThreadPoolExecutor
|
||||
|
||||
try:
|
||||
from concurrent.futures.process import ProcessPoolExecutor
|
||||
except ImportError:
|
||||
# some platforms don't have multiprocessing
|
||||
pass
|
||||
@@ -1,607 +0,0 @@
|
||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
import collections
|
||||
import logging
|
||||
import threading
|
||||
import itertools
|
||||
import time
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
FIRST_COMPLETED = 'FIRST_COMPLETED'
|
||||
FIRST_EXCEPTION = 'FIRST_EXCEPTION'
|
||||
ALL_COMPLETED = 'ALL_COMPLETED'
|
||||
_AS_COMPLETED = '_AS_COMPLETED'
|
||||
|
||||
# Possible future states (for internal use by the futures package).
|
||||
PENDING = 'PENDING'
|
||||
RUNNING = 'RUNNING'
|
||||
# The future was cancelled by the user...
|
||||
CANCELLED = 'CANCELLED'
|
||||
# ...and _Waiter.add_cancelled() was called by a worker.
|
||||
CANCELLED_AND_NOTIFIED = 'CANCELLED_AND_NOTIFIED'
|
||||
FINISHED = 'FINISHED'
|
||||
|
||||
_FUTURE_STATES = [
|
||||
PENDING,
|
||||
RUNNING,
|
||||
CANCELLED,
|
||||
CANCELLED_AND_NOTIFIED,
|
||||
FINISHED
|
||||
]
|
||||
|
||||
_STATE_TO_DESCRIPTION_MAP = {
|
||||
PENDING: "pending",
|
||||
RUNNING: "running",
|
||||
CANCELLED: "cancelled",
|
||||
CANCELLED_AND_NOTIFIED: "cancelled",
|
||||
FINISHED: "finished"
|
||||
}
|
||||
|
||||
# Logger for internal use by the futures package.
|
||||
LOGGER = logging.getLogger("concurrent.futures")
|
||||
|
||||
class Error(Exception):
|
||||
"""Base class for all future-related exceptions."""
|
||||
pass
|
||||
|
||||
class CancelledError(Error):
|
||||
"""The Future was cancelled."""
|
||||
pass
|
||||
|
||||
class TimeoutError(Error):
|
||||
"""The operation exceeded the given deadline."""
|
||||
pass
|
||||
|
||||
class _Waiter(object):
|
||||
"""Provides the event that wait() and as_completed() block on."""
|
||||
def __init__(self):
|
||||
self.event = threading.Event()
|
||||
self.finished_futures = []
|
||||
|
||||
def add_result(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
def add_exception(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
def add_cancelled(self, future):
|
||||
self.finished_futures.append(future)
|
||||
|
||||
class _AsCompletedWaiter(_Waiter):
|
||||
"""Used by as_completed()."""
|
||||
|
||||
def __init__(self):
|
||||
super(_AsCompletedWaiter, self).__init__()
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def add_result(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_result(future)
|
||||
self.event.set()
|
||||
|
||||
def add_exception(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_exception(future)
|
||||
self.event.set()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
with self.lock:
|
||||
super(_AsCompletedWaiter, self).add_cancelled(future)
|
||||
self.event.set()
|
||||
|
||||
class _FirstCompletedWaiter(_Waiter):
|
||||
"""Used by wait(return_when=FIRST_COMPLETED)."""
|
||||
|
||||
def add_result(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_result(future)
|
||||
self.event.set()
|
||||
|
||||
def add_exception(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_exception(future)
|
||||
self.event.set()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
super(_FirstCompletedWaiter, self).add_cancelled(future)
|
||||
self.event.set()
|
||||
|
||||
class _AllCompletedWaiter(_Waiter):
|
||||
"""Used by wait(return_when=FIRST_EXCEPTION and ALL_COMPLETED)."""
|
||||
|
||||
def __init__(self, num_pending_calls, stop_on_exception):
|
||||
self.num_pending_calls = num_pending_calls
|
||||
self.stop_on_exception = stop_on_exception
|
||||
self.lock = threading.Lock()
|
||||
super(_AllCompletedWaiter, self).__init__()
|
||||
|
||||
def _decrement_pending_calls(self):
|
||||
with self.lock:
|
||||
self.num_pending_calls -= 1
|
||||
if not self.num_pending_calls:
|
||||
self.event.set()
|
||||
|
||||
def add_result(self, future):
|
||||
super(_AllCompletedWaiter, self).add_result(future)
|
||||
self._decrement_pending_calls()
|
||||
|
||||
def add_exception(self, future):
|
||||
super(_AllCompletedWaiter, self).add_exception(future)
|
||||
if self.stop_on_exception:
|
||||
self.event.set()
|
||||
else:
|
||||
self._decrement_pending_calls()
|
||||
|
||||
def add_cancelled(self, future):
|
||||
super(_AllCompletedWaiter, self).add_cancelled(future)
|
||||
self._decrement_pending_calls()
|
||||
|
||||
class _AcquireFutures(object):
|
||||
"""A context manager that does an ordered acquire of Future conditions."""
|
||||
|
||||
def __init__(self, futures):
|
||||
self.futures = sorted(futures, key=id)
|
||||
|
||||
def __enter__(self):
|
||||
for future in self.futures:
|
||||
future._condition.acquire()
|
||||
|
||||
def __exit__(self, *args):
|
||||
for future in self.futures:
|
||||
future._condition.release()
|
||||
|
||||
def _create_and_install_waiters(fs, return_when):
|
||||
if return_when == _AS_COMPLETED:
|
||||
waiter = _AsCompletedWaiter()
|
||||
elif return_when == FIRST_COMPLETED:
|
||||
waiter = _FirstCompletedWaiter()
|
||||
else:
|
||||
pending_count = sum(
|
||||
f._state not in [CANCELLED_AND_NOTIFIED, FINISHED] for f in fs)
|
||||
|
||||
if return_when == FIRST_EXCEPTION:
|
||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=True)
|
||||
elif return_when == ALL_COMPLETED:
|
||||
waiter = _AllCompletedWaiter(pending_count, stop_on_exception=False)
|
||||
else:
|
||||
raise ValueError("Invalid return condition: %r" % return_when)
|
||||
|
||||
for f in fs:
|
||||
f._waiters.append(waiter)
|
||||
|
||||
return waiter
|
||||
|
||||
def as_completed(fs, timeout=None):
|
||||
"""An iterator over the given futures that yields each as it completes.
|
||||
|
||||
Args:
|
||||
fs: The sequence of Futures (possibly created by different Executors) to
|
||||
iterate over.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
An iterator that yields the given Futures as they complete (finished or
|
||||
cancelled). If any given Futures are duplicated, they will be returned
|
||||
once.
|
||||
|
||||
Raises:
|
||||
TimeoutError: If the entire result iterator could not be generated
|
||||
before the given timeout.
|
||||
"""
|
||||
if timeout is not None:
|
||||
end_time = timeout + time.time()
|
||||
|
||||
fs = set(fs)
|
||||
with _AcquireFutures(fs):
|
||||
finished = set(
|
||||
f for f in fs
|
||||
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
|
||||
pending = fs - finished
|
||||
waiter = _create_and_install_waiters(fs, _AS_COMPLETED)
|
||||
|
||||
try:
|
||||
for future in finished:
|
||||
yield future
|
||||
|
||||
while pending:
|
||||
if timeout is None:
|
||||
wait_timeout = None
|
||||
else:
|
||||
wait_timeout = end_time - time.time()
|
||||
if wait_timeout < 0:
|
||||
raise TimeoutError(
|
||||
'%d (of %d) futures unfinished' % (
|
||||
len(pending), len(fs)))
|
||||
|
||||
waiter.event.wait(wait_timeout)
|
||||
|
||||
with waiter.lock:
|
||||
finished = waiter.finished_futures
|
||||
waiter.finished_futures = []
|
||||
waiter.event.clear()
|
||||
|
||||
for future in finished:
|
||||
yield future
|
||||
pending.remove(future)
|
||||
|
||||
finally:
|
||||
for f in fs:
|
||||
with f._condition:
|
||||
f._waiters.remove(waiter)
|
||||
|
||||
DoneAndNotDoneFutures = collections.namedtuple(
|
||||
'DoneAndNotDoneFutures', 'done not_done')
|
||||
def wait(fs, timeout=None, return_when=ALL_COMPLETED):
|
||||
"""Wait for the futures in the given sequence to complete.
|
||||
|
||||
Args:
|
||||
fs: The sequence of Futures (possibly created by different Executors) to
|
||||
wait upon.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
return_when: Indicates when this function should return. The options
|
||||
are:
|
||||
|
||||
FIRST_COMPLETED - Return when any future finishes or is
|
||||
cancelled.
|
||||
FIRST_EXCEPTION - Return when any future finishes by raising an
|
||||
exception. If no future raises an exception
|
||||
then it is equivalent to ALL_COMPLETED.
|
||||
ALL_COMPLETED - Return when all futures finish or are cancelled.
|
||||
|
||||
Returns:
|
||||
A named 2-tuple of sets. The first set, named 'done', contains the
|
||||
futures that completed (is finished or cancelled) before the wait
|
||||
completed. The second set, named 'not_done', contains uncompleted
|
||||
futures.
|
||||
"""
|
||||
with _AcquireFutures(fs):
|
||||
done = set(f for f in fs
|
||||
if f._state in [CANCELLED_AND_NOTIFIED, FINISHED])
|
||||
not_done = set(fs) - done
|
||||
|
||||
if (return_when == FIRST_COMPLETED) and done:
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
elif (return_when == FIRST_EXCEPTION) and done:
|
||||
if any(f for f in done
|
||||
if not f.cancelled() and f.exception() is not None):
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
|
||||
if len(done) == len(fs):
|
||||
return DoneAndNotDoneFutures(done, not_done)
|
||||
|
||||
waiter = _create_and_install_waiters(fs, return_when)
|
||||
|
||||
waiter.event.wait(timeout)
|
||||
for f in fs:
|
||||
with f._condition:
|
||||
f._waiters.remove(waiter)
|
||||
|
||||
done.update(waiter.finished_futures)
|
||||
return DoneAndNotDoneFutures(done, set(fs) - done)
|
||||
|
||||
class Future(object):
|
||||
"""Represents the result of an asynchronous computation."""
|
||||
|
||||
def __init__(self):
|
||||
"""Initializes the future. Should not be called by clients."""
|
||||
self._condition = threading.Condition()
|
||||
self._state = PENDING
|
||||
self._result = None
|
||||
self._exception = None
|
||||
self._traceback = None
|
||||
self._waiters = []
|
||||
self._done_callbacks = []
|
||||
|
||||
def _invoke_callbacks(self):
|
||||
for callback in self._done_callbacks:
|
||||
try:
|
||||
callback(self)
|
||||
except Exception:
|
||||
LOGGER.exception('exception calling callback for %r', self)
|
||||
|
||||
def __repr__(self):
|
||||
with self._condition:
|
||||
if self._state == FINISHED:
|
||||
if self._exception:
|
||||
return '<Future at %s state=%s raised %s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state],
|
||||
self._exception.__class__.__name__)
|
||||
else:
|
||||
return '<Future at %s state=%s returned %s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state],
|
||||
self._result.__class__.__name__)
|
||||
return '<Future at %s state=%s>' % (
|
||||
hex(id(self)),
|
||||
_STATE_TO_DESCRIPTION_MAP[self._state])
|
||||
|
||||
def cancel(self):
|
||||
"""Cancel the future if possible.
|
||||
|
||||
Returns True if the future was cancelled, False otherwise. A future
|
||||
cannot be cancelled if it is running or has already completed.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [RUNNING, FINISHED]:
|
||||
return False
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
return True
|
||||
|
||||
self._state = CANCELLED
|
||||
self._condition.notify_all()
|
||||
|
||||
self._invoke_callbacks()
|
||||
return True
|
||||
|
||||
def cancelled(self):
|
||||
"""Return True if the future has cancelled."""
|
||||
with self._condition:
|
||||
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]
|
||||
|
||||
def running(self):
|
||||
"""Return True if the future is currently executing."""
|
||||
with self._condition:
|
||||
return self._state == RUNNING
|
||||
|
||||
def done(self):
|
||||
"""Return True of the future was cancelled or finished executing."""
|
||||
with self._condition:
|
||||
return self._state in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]
|
||||
|
||||
def __get_result(self):
|
||||
if self._exception:
|
||||
raise type(self._exception), self._exception, self._traceback
|
||||
else:
|
||||
return self._result
|
||||
|
||||
def add_done_callback(self, fn):
|
||||
"""Attaches a callable that will be called when the future finishes.
|
||||
|
||||
Args:
|
||||
fn: A callable that will be called with this future as its only
|
||||
argument when the future completes or is cancelled. The callable
|
||||
will always be called by a thread in the same process in which
|
||||
it was added. If the future has already completed or been
|
||||
cancelled then the callable will be called immediately. These
|
||||
callables are called in the order that they were added.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state not in [CANCELLED, CANCELLED_AND_NOTIFIED, FINISHED]:
|
||||
self._done_callbacks.append(fn)
|
||||
return
|
||||
fn(self)
|
||||
|
||||
def result(self, timeout=None):
|
||||
"""Return the result of the call that the future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the result if the future
|
||||
isn't done. If None, then there is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
The result of the call that the future represents.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
Exception: If the call raised then that exception will be raised.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self.__get_result()
|
||||
|
||||
self._condition.wait(timeout)
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self.__get_result()
|
||||
else:
|
||||
raise TimeoutError()
|
||||
|
||||
def exception_info(self, timeout=None):
|
||||
"""Return a tuple of (exception, traceback) raised by the call that the
|
||||
future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the exception if the
|
||||
future isn't done. If None, then there is no limit on the wait
|
||||
time.
|
||||
|
||||
Returns:
|
||||
The exception raised by the call that the future represents or None
|
||||
if the call completed without raising.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self._exception, self._traceback
|
||||
|
||||
self._condition.wait(timeout)
|
||||
|
||||
if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
|
||||
raise CancelledError()
|
||||
elif self._state == FINISHED:
|
||||
return self._exception, self._traceback
|
||||
else:
|
||||
raise TimeoutError()
|
||||
|
||||
def exception(self, timeout=None):
|
||||
"""Return the exception raised by the call that the future represents.
|
||||
|
||||
Args:
|
||||
timeout: The number of seconds to wait for the exception if the
|
||||
future isn't done. If None, then there is no limit on the wait
|
||||
time.
|
||||
|
||||
Returns:
|
||||
The exception raised by the call that the future represents or None
|
||||
if the call completed without raising.
|
||||
|
||||
Raises:
|
||||
CancelledError: If the future was cancelled.
|
||||
TimeoutError: If the future didn't finish executing before the given
|
||||
timeout.
|
||||
"""
|
||||
return self.exception_info(timeout)[0]
|
||||
|
||||
# The following methods should only be used by Executors and in tests.
|
||||
def set_running_or_notify_cancel(self):
|
||||
"""Mark the future as running or process any cancel notifications.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
|
||||
If the future has been cancelled (cancel() was called and returned
|
||||
True) then any threads waiting on the future completing (though calls
|
||||
to as_completed() or wait()) are notified and False is returned.
|
||||
|
||||
If the future was not cancelled then it is put in the running state
|
||||
(future calls to running() will return True) and True is returned.
|
||||
|
||||
This method should be called by Executor implementations before
|
||||
executing the work associated with this future. If this method returns
|
||||
False then the work should not be executed.
|
||||
|
||||
Returns:
|
||||
False if the Future was cancelled, True otherwise.
|
||||
|
||||
Raises:
|
||||
RuntimeError: if this method was already called or if set_result()
|
||||
or set_exception() was called.
|
||||
"""
|
||||
with self._condition:
|
||||
if self._state == CANCELLED:
|
||||
self._state = CANCELLED_AND_NOTIFIED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_cancelled(self)
|
||||
# self._condition.notify_all() is not necessary because
|
||||
# self.cancel() triggers a notification.
|
||||
return False
|
||||
elif self._state == PENDING:
|
||||
self._state = RUNNING
|
||||
return True
|
||||
else:
|
||||
LOGGER.critical('Future %s in unexpected state: %s',
|
||||
id(self),
|
||||
self._state)
|
||||
raise RuntimeError('Future in unexpected state')
|
||||
|
||||
def set_result(self, result):
|
||||
"""Sets the return value of work associated with the future.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
with self._condition:
|
||||
self._result = result
|
||||
self._state = FINISHED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_result(self)
|
||||
self._condition.notify_all()
|
||||
self._invoke_callbacks()
|
||||
|
||||
def set_exception_info(self, exception, traceback):
|
||||
"""Sets the result of the future as being the given exception
|
||||
and traceback.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
with self._condition:
|
||||
self._exception = exception
|
||||
self._traceback = traceback
|
||||
self._state = FINISHED
|
||||
for waiter in self._waiters:
|
||||
waiter.add_exception(self)
|
||||
self._condition.notify_all()
|
||||
self._invoke_callbacks()
|
||||
|
||||
def set_exception(self, exception):
|
||||
"""Sets the result of the future as being the given exception.
|
||||
|
||||
Should only be used by Executor implementations and unit tests.
|
||||
"""
|
||||
self.set_exception_info(exception, None)
|
||||
|
||||
class Executor(object):
|
||||
"""This is an abstract base class for concrete asynchronous executors."""
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
"""Submits a callable to be executed with the given arguments.
|
||||
|
||||
Schedules the callable to be executed as fn(*args, **kwargs) and returns
|
||||
a Future instance representing the execution of the callable.
|
||||
|
||||
Returns:
|
||||
A Future representing the given call.
|
||||
"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def map(self, fn, *iterables, **kwargs):
|
||||
"""Returns a iterator equivalent to map(fn, iter).
|
||||
|
||||
Args:
|
||||
fn: A callable that will take as many arguments as there are
|
||||
passed iterables.
|
||||
timeout: The maximum number of seconds to wait. If None, then there
|
||||
is no limit on the wait time.
|
||||
|
||||
Returns:
|
||||
An iterator equivalent to: map(func, *iterables) but the calls may
|
||||
be evaluated out-of-order.
|
||||
|
||||
Raises:
|
||||
TimeoutError: If the entire result iterator could not be generated
|
||||
before the given timeout.
|
||||
Exception: If fn(*args) raises for any values.
|
||||
"""
|
||||
timeout = kwargs.get('timeout')
|
||||
if timeout is not None:
|
||||
end_time = timeout + time.time()
|
||||
|
||||
fs = [self.submit(fn, *args) for args in itertools.izip(*iterables)]
|
||||
|
||||
# Yield must be hidden in closure so that the futures are submitted
|
||||
# before the first iterator value is required.
|
||||
def result_iterator():
|
||||
try:
|
||||
for future in fs:
|
||||
if timeout is None:
|
||||
yield future.result()
|
||||
else:
|
||||
yield future.result(end_time - time.time())
|
||||
finally:
|
||||
for future in fs:
|
||||
future.cancel()
|
||||
return result_iterator()
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
"""Clean-up the resources associated with the Executor.
|
||||
|
||||
It is safe to call this method several times. Otherwise, no other
|
||||
methods can be called after this one.
|
||||
|
||||
Args:
|
||||
wait: If True then shutdown will not return until all running
|
||||
futures have finished executing and the resources used by the
|
||||
executor have been reclaimed.
|
||||
"""
|
||||
pass
|
||||
|
||||
def __enter__(self):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
self.shutdown(wait=True)
|
||||
return False
|
||||
@@ -1,359 +0,0 @@
|
||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Implements ProcessPoolExecutor.
|
||||
|
||||
The follow diagram and text describe the data-flow through the system:
|
||||
|
||||
|======================= In-process =====================|== Out-of-process ==|
|
||||
|
||||
+----------+ +----------+ +--------+ +-----------+ +---------+
|
||||
| | => | Work Ids | => | | => | Call Q | => | |
|
||||
| | +----------+ | | +-----------+ | |
|
||||
| | | ... | | | | ... | | |
|
||||
| | | 6 | | | | 5, call() | | |
|
||||
| | | 7 | | | | ... | | |
|
||||
| Process | | ... | | Local | +-----------+ | Process |
|
||||
| Pool | +----------+ | Worker | | #1..n |
|
||||
| Executor | | Thread | | |
|
||||
| | +----------- + | | +-----------+ | |
|
||||
| | <=> | Work Items | <=> | | <= | Result Q | <= | |
|
||||
| | +------------+ | | +-----------+ | |
|
||||
| | | 6: call() | | | | ... | | |
|
||||
| | | future | | | | 4, result | | |
|
||||
| | | ... | | | | 3, except | | |
|
||||
+----------+ +------------+ +--------+ +-----------+ +---------+
|
||||
|
||||
Executor.submit() called:
|
||||
- creates a uniquely numbered _WorkItem and adds it to the "Work Items" dict
|
||||
- adds the id of the _WorkItem to the "Work Ids" queue
|
||||
|
||||
Local worker thread:
|
||||
- reads work ids from the "Work Ids" queue and looks up the corresponding
|
||||
WorkItem from the "Work Items" dict: if the work item has been cancelled then
|
||||
it is simply removed from the dict, otherwise it is repackaged as a
|
||||
_CallItem and put in the "Call Q". New _CallItems are put in the "Call Q"
|
||||
until "Call Q" is full. NOTE: the size of the "Call Q" is kept small because
|
||||
calls placed in the "Call Q" can no longer be cancelled with Future.cancel().
|
||||
- reads _ResultItems from "Result Q", updates the future stored in the
|
||||
"Work Items" dict and deletes the dict entry
|
||||
|
||||
Process #1..n:
|
||||
- reads _CallItems from "Call Q", executes the calls, and puts the resulting
|
||||
_ResultItems in "Request Q"
|
||||
"""
|
||||
|
||||
import atexit
|
||||
from concurrent.futures import _base
|
||||
import Queue as queue
|
||||
import multiprocessing
|
||||
import threading
|
||||
import weakref
|
||||
import sys
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
# Workers are created as daemon threads and processes. This is done to allow the
|
||||
# interpreter to exit when there are still idle processes in a
|
||||
# ProcessPoolExecutor's process pool (i.e. shutdown() was not called). However,
|
||||
# allowing workers to die with the interpreter has two undesirable properties:
|
||||
# - The workers would still be running during interpretor shutdown,
|
||||
# meaning that they would fail in unpredictable ways.
|
||||
# - The workers could be killed while evaluating a work item, which could
|
||||
# be bad if the callable being evaluated has external side-effects e.g.
|
||||
# writing to a file.
|
||||
#
|
||||
# To work around this problem, an exit handler is installed which tells the
|
||||
# workers to exit when their work queues are empty and then waits until the
|
||||
# threads/processes finish.
|
||||
|
||||
_threads_queues = weakref.WeakKeyDictionary()
|
||||
_shutdown = False
|
||||
|
||||
def _python_exit():
|
||||
global _shutdown
|
||||
_shutdown = True
|
||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
||||
for t, q in items:
|
||||
q.put(None)
|
||||
for t, q in items:
|
||||
t.join(sys.maxint)
|
||||
|
||||
# Controls how many more calls than processes will be queued in the call queue.
|
||||
# A smaller number will mean that processes spend more time idle waiting for
|
||||
# work while a larger number will make Future.cancel() succeed less frequently
|
||||
# (Futures in the call queue cannot be cancelled).
|
||||
EXTRA_QUEUED_CALLS = 1
|
||||
|
||||
class _WorkItem(object):
|
||||
def __init__(self, future, fn, args, kwargs):
|
||||
self.future = future
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
class _ResultItem(object):
|
||||
def __init__(self, work_id, exception=None, result=None):
|
||||
self.work_id = work_id
|
||||
self.exception = exception
|
||||
self.result = result
|
||||
|
||||
class _CallItem(object):
|
||||
def __init__(self, work_id, fn, args, kwargs):
|
||||
self.work_id = work_id
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def _process_worker(call_queue, result_queue):
|
||||
"""Evaluates calls from call_queue and places the results in result_queue.
|
||||
|
||||
This worker is run in a separate process.
|
||||
|
||||
Args:
|
||||
call_queue: A multiprocessing.Queue of _CallItems that will be read and
|
||||
evaluated by the worker.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems that will written
|
||||
to by the worker.
|
||||
shutdown: A multiprocessing.Event that will be set as a signal to the
|
||||
worker that it should exit when call_queue is empty.
|
||||
"""
|
||||
while True:
|
||||
call_item = call_queue.get(block=True)
|
||||
if call_item is None:
|
||||
# Wake up queue management thread
|
||||
result_queue.put(None)
|
||||
return
|
||||
try:
|
||||
r = call_item.fn(*call_item.args, **call_item.kwargs)
|
||||
except BaseException:
|
||||
e = sys.exc_info()[1]
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
exception=e))
|
||||
else:
|
||||
result_queue.put(_ResultItem(call_item.work_id,
|
||||
result=r))
|
||||
|
||||
def _add_call_item_to_queue(pending_work_items,
|
||||
work_ids,
|
||||
call_queue):
|
||||
"""Fills call_queue with _WorkItems from pending_work_items.
|
||||
|
||||
This function never blocks.
|
||||
|
||||
Args:
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids: A queue.Queue of work ids e.g. Queue([5, 6, ...]). Work ids
|
||||
are consumed and the corresponding _WorkItems from
|
||||
pending_work_items are transformed into _CallItems and put in
|
||||
call_queue.
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems.
|
||||
"""
|
||||
while True:
|
||||
if call_queue.full():
|
||||
return
|
||||
try:
|
||||
work_id = work_ids.get(block=False)
|
||||
except queue.Empty:
|
||||
return
|
||||
else:
|
||||
work_item = pending_work_items[work_id]
|
||||
|
||||
if work_item.future.set_running_or_notify_cancel():
|
||||
call_queue.put(_CallItem(work_id,
|
||||
work_item.fn,
|
||||
work_item.args,
|
||||
work_item.kwargs),
|
||||
block=True)
|
||||
else:
|
||||
del pending_work_items[work_id]
|
||||
continue
|
||||
|
||||
def _queue_management_worker(executor_reference,
|
||||
processes,
|
||||
pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue,
|
||||
result_queue):
|
||||
"""Manages the communication between this process and the worker processes.
|
||||
|
||||
This function is run in a local thread.
|
||||
|
||||
Args:
|
||||
executor_reference: A weakref.ref to the ProcessPoolExecutor that owns
|
||||
this thread. Used to determine if the ProcessPoolExecutor has been
|
||||
garbage collected and that this function can exit.
|
||||
process: A list of the multiprocessing.Process instances used as
|
||||
workers.
|
||||
pending_work_items: A dict mapping work ids to _WorkItems e.g.
|
||||
{5: <_WorkItem...>, 6: <_WorkItem...>, ...}
|
||||
work_ids_queue: A queue.Queue of work ids e.g. Queue([5, 6, ...]).
|
||||
call_queue: A multiprocessing.Queue that will be filled with _CallItems
|
||||
derived from _WorkItems for processing by the process workers.
|
||||
result_queue: A multiprocessing.Queue of _ResultItems generated by the
|
||||
process workers.
|
||||
"""
|
||||
nb_shutdown_processes = [0]
|
||||
def shutdown_one_process():
|
||||
"""Tell a worker to terminate, which will in turn wake us again"""
|
||||
call_queue.put(None)
|
||||
nb_shutdown_processes[0] += 1
|
||||
while True:
|
||||
_add_call_item_to_queue(pending_work_items,
|
||||
work_ids_queue,
|
||||
call_queue)
|
||||
|
||||
result_item = result_queue.get(block=True)
|
||||
if result_item is not None:
|
||||
work_item = pending_work_items[result_item.work_id]
|
||||
del pending_work_items[result_item.work_id]
|
||||
|
||||
if result_item.exception:
|
||||
work_item.future.set_exception(result_item.exception)
|
||||
else:
|
||||
work_item.future.set_result(result_item.result)
|
||||
# Delete references to object. See issue16284
|
||||
del work_item
|
||||
# Check whether we should start shutting down.
|
||||
executor = executor_reference()
|
||||
# No more work items can be added if:
|
||||
# - The interpreter is shutting down OR
|
||||
# - The executor that owns this worker has been collected OR
|
||||
# - The executor that owns this worker has been shutdown.
|
||||
if _shutdown or executor is None or executor._shutdown_thread:
|
||||
# Since no new work items can be added, it is safe to shutdown
|
||||
# this thread if there are no pending work items.
|
||||
if not pending_work_items:
|
||||
while nb_shutdown_processes[0] < len(processes):
|
||||
shutdown_one_process()
|
||||
# If .join() is not called on the created processes then
|
||||
# some multiprocessing.Queue methods may deadlock on Mac OS
|
||||
# X.
|
||||
for p in processes:
|
||||
p.join()
|
||||
call_queue.close()
|
||||
return
|
||||
del executor
|
||||
|
||||
_system_limits_checked = False
|
||||
_system_limited = None
|
||||
def _check_system_limits():
|
||||
global _system_limits_checked, _system_limited
|
||||
if _system_limits_checked:
|
||||
if _system_limited:
|
||||
raise NotImplementedError(_system_limited)
|
||||
_system_limits_checked = True
|
||||
try:
|
||||
import os
|
||||
nsems_max = os.sysconf("SC_SEM_NSEMS_MAX")
|
||||
except (AttributeError, ValueError):
|
||||
# sysconf not available or setting not available
|
||||
return
|
||||
if nsems_max == -1:
|
||||
# indetermine limit, assume that limit is determined
|
||||
# by available memory only
|
||||
return
|
||||
if nsems_max >= 256:
|
||||
# minimum number of semaphores available
|
||||
# according to POSIX
|
||||
return
|
||||
_system_limited = "system provides too few semaphores (%d available, 256 necessary)" % nsems_max
|
||||
raise NotImplementedError(_system_limited)
|
||||
|
||||
class ProcessPoolExecutor(_base.Executor):
|
||||
def __init__(self, max_workers=None):
|
||||
"""Initializes a new ProcessPoolExecutor instance.
|
||||
|
||||
Args:
|
||||
max_workers: The maximum number of processes that can be used to
|
||||
execute the given calls. If None or not given then as many
|
||||
worker processes will be created as the machine has processors.
|
||||
"""
|
||||
_check_system_limits()
|
||||
|
||||
if max_workers is None:
|
||||
self._max_workers = multiprocessing.cpu_count()
|
||||
else:
|
||||
self._max_workers = max_workers
|
||||
|
||||
# Make the call queue slightly larger than the number of processes to
|
||||
# prevent the worker processes from idling. But don't make it too big
|
||||
# because futures in the call queue cannot be cancelled.
|
||||
self._call_queue = multiprocessing.Queue(self._max_workers +
|
||||
EXTRA_QUEUED_CALLS)
|
||||
self._result_queue = multiprocessing.Queue()
|
||||
self._work_ids = queue.Queue()
|
||||
self._queue_management_thread = None
|
||||
self._processes = set()
|
||||
|
||||
# Shutdown is a two-step process.
|
||||
self._shutdown_thread = False
|
||||
self._shutdown_lock = threading.Lock()
|
||||
self._queue_count = 0
|
||||
self._pending_work_items = {}
|
||||
|
||||
def _start_queue_management_thread(self):
|
||||
# When the executor gets lost, the weakref callback will wake up
|
||||
# the queue management thread.
|
||||
def weakref_cb(_, q=self._result_queue):
|
||||
q.put(None)
|
||||
if self._queue_management_thread is None:
|
||||
self._queue_management_thread = threading.Thread(
|
||||
target=_queue_management_worker,
|
||||
args=(weakref.ref(self, weakref_cb),
|
||||
self._processes,
|
||||
self._pending_work_items,
|
||||
self._work_ids,
|
||||
self._call_queue,
|
||||
self._result_queue))
|
||||
self._queue_management_thread.daemon = True
|
||||
self._queue_management_thread.start()
|
||||
_threads_queues[self._queue_management_thread] = self._result_queue
|
||||
|
||||
def _adjust_process_count(self):
|
||||
for _ in range(len(self._processes), self._max_workers):
|
||||
p = multiprocessing.Process(
|
||||
target=_process_worker,
|
||||
args=(self._call_queue,
|
||||
self._result_queue))
|
||||
p.start()
|
||||
self._processes.add(p)
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
with self._shutdown_lock:
|
||||
if self._shutdown_thread:
|
||||
raise RuntimeError('cannot schedule new futures after shutdown')
|
||||
|
||||
f = _base.Future()
|
||||
w = _WorkItem(f, fn, args, kwargs)
|
||||
|
||||
self._pending_work_items[self._queue_count] = w
|
||||
self._work_ids.put(self._queue_count)
|
||||
self._queue_count += 1
|
||||
# Wake up queue management thread
|
||||
self._result_queue.put(None)
|
||||
|
||||
self._start_queue_management_thread()
|
||||
self._adjust_process_count()
|
||||
return f
|
||||
submit.__doc__ = _base.Executor.submit.__doc__
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
with self._shutdown_lock:
|
||||
self._shutdown_thread = True
|
||||
if self._queue_management_thread:
|
||||
# Wake up queue management thread
|
||||
self._result_queue.put(None)
|
||||
if wait:
|
||||
self._queue_management_thread.join(sys.maxint)
|
||||
# To reduce the risk of openning too many files, remove references to
|
||||
# objects that use file descriptors.
|
||||
self._queue_management_thread = None
|
||||
self._call_queue = None
|
||||
self._result_queue = None
|
||||
self._processes = None
|
||||
shutdown.__doc__ = _base.Executor.shutdown.__doc__
|
||||
|
||||
atexit.register(_python_exit)
|
||||
@@ -1,134 +0,0 @@
|
||||
# Copyright 2009 Brian Quinlan. All Rights Reserved.
|
||||
# Licensed to PSF under a Contributor Agreement.
|
||||
|
||||
"""Implements ThreadPoolExecutor."""
|
||||
|
||||
import atexit
|
||||
from concurrent.futures import _base
|
||||
import Queue as queue
|
||||
import threading
|
||||
import weakref
|
||||
import sys
|
||||
|
||||
__author__ = 'Brian Quinlan (brian@sweetapp.com)'
|
||||
|
||||
# Workers are created as daemon threads. This is done to allow the interpreter
|
||||
# to exit when there are still idle threads in a ThreadPoolExecutor's thread
|
||||
# pool (i.e. shutdown() was not called). However, allowing workers to die with
|
||||
# the interpreter has two undesirable properties:
|
||||
# - The workers would still be running during interpretor shutdown,
|
||||
# meaning that they would fail in unpredictable ways.
|
||||
# - The workers could be killed while evaluating a work item, which could
|
||||
# be bad if the callable being evaluated has external side-effects e.g.
|
||||
# writing to a file.
|
||||
#
|
||||
# To work around this problem, an exit handler is installed which tells the
|
||||
# workers to exit when their work queues are empty and then waits until the
|
||||
# threads finish.
|
||||
|
||||
_threads_queues = weakref.WeakKeyDictionary()
|
||||
_shutdown = False
|
||||
|
||||
def _python_exit():
|
||||
global _shutdown
|
||||
_shutdown = True
|
||||
items = list(_threads_queues.items()) if _threads_queues else ()
|
||||
for t, q in items:
|
||||
q.put(None)
|
||||
for t, q in items:
|
||||
t.join(sys.maxint)
|
||||
|
||||
atexit.register(_python_exit)
|
||||
|
||||
class _WorkItem(object):
|
||||
def __init__(self, future, fn, args, kwargs):
|
||||
self.future = future
|
||||
self.fn = fn
|
||||
self.args = args
|
||||
self.kwargs = kwargs
|
||||
|
||||
def run(self):
|
||||
if not self.future.set_running_or_notify_cancel():
|
||||
return
|
||||
|
||||
try:
|
||||
result = self.fn(*self.args, **self.kwargs)
|
||||
except BaseException:
|
||||
e, tb = sys.exc_info()[1:]
|
||||
self.future.set_exception_info(e, tb)
|
||||
else:
|
||||
self.future.set_result(result)
|
||||
|
||||
def _worker(executor_reference, work_queue):
|
||||
try:
|
||||
while True:
|
||||
work_item = work_queue.get(block=True)
|
||||
if work_item is not None:
|
||||
work_item.run()
|
||||
# Delete references to object. See issue16284
|
||||
del work_item
|
||||
continue
|
||||
executor = executor_reference()
|
||||
# Exit if:
|
||||
# - The interpreter is shutting down OR
|
||||
# - The executor that owns the worker has been collected OR
|
||||
# - The executor that owns the worker has been shutdown.
|
||||
if _shutdown or executor is None or executor._shutdown:
|
||||
# Notice other workers
|
||||
work_queue.put(None)
|
||||
return
|
||||
del executor
|
||||
except BaseException:
|
||||
_base.LOGGER.critical('Exception in worker', exc_info=True)
|
||||
|
||||
class ThreadPoolExecutor(_base.Executor):
|
||||
def __init__(self, max_workers):
|
||||
"""Initializes a new ThreadPoolExecutor instance.
|
||||
|
||||
Args:
|
||||
max_workers: The maximum number of threads that can be used to
|
||||
execute the given calls.
|
||||
"""
|
||||
self._max_workers = max_workers
|
||||
self._work_queue = queue.Queue()
|
||||
self._threads = set()
|
||||
self._shutdown = False
|
||||
self._shutdown_lock = threading.Lock()
|
||||
|
||||
def submit(self, fn, *args, **kwargs):
|
||||
with self._shutdown_lock:
|
||||
if self._shutdown:
|
||||
raise RuntimeError('cannot schedule new futures after shutdown')
|
||||
|
||||
f = _base.Future()
|
||||
w = _WorkItem(f, fn, args, kwargs)
|
||||
|
||||
self._work_queue.put(w)
|
||||
self._adjust_thread_count()
|
||||
return f
|
||||
submit.__doc__ = _base.Executor.submit.__doc__
|
||||
|
||||
def _adjust_thread_count(self):
|
||||
# When the executor gets lost, the weakref callback will wake up
|
||||
# the worker threads.
|
||||
def weakref_cb(_, q=self._work_queue):
|
||||
q.put(None)
|
||||
# TODO(bquinlan): Should avoid creating new threads if there are more
|
||||
# idle threads than items in the work queue.
|
||||
if len(self._threads) < self._max_workers:
|
||||
t = threading.Thread(target=_worker,
|
||||
args=(weakref.ref(self, weakref_cb),
|
||||
self._work_queue))
|
||||
t.daemon = True
|
||||
t.start()
|
||||
self._threads.add(t)
|
||||
_threads_queues[t] = self._work_queue
|
||||
|
||||
def shutdown(self, wait=True):
|
||||
with self._shutdown_lock:
|
||||
self._shutdown = True
|
||||
self._work_queue.put(None)
|
||||
if wait:
|
||||
for t in self._threads:
|
||||
t.join(sys.maxint)
|
||||
shutdown.__doc__ = _base.Executor.shutdown.__doc__
|
||||
@@ -1,32 +0,0 @@
|
||||
Copyright (c) 2013, Ethan Furman.
|
||||
All rights reserved.
|
||||
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
Redistributions of source code must retain the above
|
||||
copyright notice, this list of conditions and the
|
||||
following disclaimer.
|
||||
|
||||
Redistributions in binary form must reproduce the above
|
||||
copyright notice, this list of conditions and the following
|
||||
disclaimer in the documentation and/or other materials
|
||||
provided with the distribution.
|
||||
|
||||
Neither the name Ethan Furman nor the names of any
|
||||
contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written
|
||||
permission.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
|
||||
CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
|
||||
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
|
||||
INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
|
||||
CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
|
||||
ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
|
||||
POSSIBILITY OF SUCH DAMAGE.
|
||||
@@ -1,3 +0,0 @@
|
||||
enum34 is the new Python stdlib enum module available in Python 3.4
|
||||
backported for previous versions of Python from 2.4 to 3.3.
|
||||
tested on 2.6, 2.7, and 3.3+
|
||||
@@ -1,837 +0,0 @@
|
||||
"""Python Enumerations"""
|
||||
|
||||
import sys as _sys
|
||||
|
||||
__all__ = ['Enum', 'IntEnum', 'unique']
|
||||
|
||||
version = 1, 1, 6
|
||||
|
||||
pyver = float('%s.%s' % _sys.version_info[:2])
|
||||
|
||||
try:
|
||||
any
|
||||
except NameError:
|
||||
def any(iterable):
|
||||
for element in iterable:
|
||||
if element:
|
||||
return True
|
||||
return False
|
||||
|
||||
try:
|
||||
from collections import OrderedDict
|
||||
except ImportError:
|
||||
OrderedDict = None
|
||||
|
||||
try:
|
||||
basestring
|
||||
except NameError:
|
||||
# In Python 2 basestring is the ancestor of both str and unicode
|
||||
# in Python 3 it's just str, but was missing in 3.1
|
||||
basestring = str
|
||||
|
||||
try:
|
||||
unicode
|
||||
except NameError:
|
||||
# In Python 3 unicode no longer exists (it's just str)
|
||||
unicode = str
|
||||
|
||||
class _RouteClassAttributeToGetattr(object):
|
||||
"""Route attribute access on a class to __getattr__.
|
||||
|
||||
This is a descriptor, used to define attributes that act differently when
|
||||
accessed through an instance and through a class. Instance access remains
|
||||
normal, but access to an attribute through a class will be routed to the
|
||||
class's __getattr__ method; this is done by raising AttributeError.
|
||||
|
||||
"""
|
||||
def __init__(self, fget=None):
|
||||
self.fget = fget
|
||||
|
||||
def __get__(self, instance, ownerclass=None):
|
||||
if instance is None:
|
||||
raise AttributeError()
|
||||
return self.fget(instance)
|
||||
|
||||
def __set__(self, instance, value):
|
||||
raise AttributeError("can't set attribute")
|
||||
|
||||
def __delete__(self, instance):
|
||||
raise AttributeError("can't delete attribute")
|
||||
|
||||
|
||||
def _is_descriptor(obj):
|
||||
"""Returns True if obj is a descriptor, False otherwise."""
|
||||
return (
|
||||
hasattr(obj, '__get__') or
|
||||
hasattr(obj, '__set__') or
|
||||
hasattr(obj, '__delete__'))
|
||||
|
||||
|
||||
def _is_dunder(name):
|
||||
"""Returns True if a __dunder__ name, False otherwise."""
|
||||
return (name[:2] == name[-2:] == '__' and
|
||||
name[2:3] != '_' and
|
||||
name[-3:-2] != '_' and
|
||||
len(name) > 4)
|
||||
|
||||
|
||||
def _is_sunder(name):
|
||||
"""Returns True if a _sunder_ name, False otherwise."""
|
||||
return (name[0] == name[-1] == '_' and
|
||||
name[1:2] != '_' and
|
||||
name[-2:-1] != '_' and
|
||||
len(name) > 2)
|
||||
|
||||
|
||||
def _make_class_unpicklable(cls):
|
||||
"""Make the given class un-picklable."""
|
||||
def _break_on_call_reduce(self, protocol=None):
|
||||
raise TypeError('%r cannot be pickled' % self)
|
||||
cls.__reduce_ex__ = _break_on_call_reduce
|
||||
cls.__module__ = '<unknown>'
|
||||
|
||||
|
||||
class _EnumDict(dict):
|
||||
"""Track enum member order and ensure member names are not reused.
|
||||
|
||||
EnumMeta will use the names found in self._member_names as the
|
||||
enumeration member names.
|
||||
|
||||
"""
|
||||
def __init__(self):
|
||||
super(_EnumDict, self).__init__()
|
||||
self._member_names = []
|
||||
|
||||
def __setitem__(self, key, value):
|
||||
"""Changes anything not dundered or not a descriptor.
|
||||
|
||||
If a descriptor is added with the same name as an enum member, the name
|
||||
is removed from _member_names (this may leave a hole in the numerical
|
||||
sequence of values).
|
||||
|
||||
If an enum member name is used twice, an error is raised; duplicate
|
||||
values are not checked for.
|
||||
|
||||
Single underscore (sunder) names are reserved.
|
||||
|
||||
Note: in 3.x __order__ is simply discarded as a not necessary piece
|
||||
leftover from 2.x
|
||||
|
||||
"""
|
||||
if pyver >= 3.0 and key in ('_order_', '__order__'):
|
||||
return
|
||||
elif key == '__order__':
|
||||
key = '_order_'
|
||||
if _is_sunder(key):
|
||||
if key != '_order_':
|
||||
raise ValueError('_names_ are reserved for future Enum use')
|
||||
elif _is_dunder(key):
|
||||
pass
|
||||
elif key in self._member_names:
|
||||
# descriptor overwriting an enum?
|
||||
raise TypeError('Attempted to reuse key: %r' % key)
|
||||
elif not _is_descriptor(value):
|
||||
if key in self:
|
||||
# enum overwriting a descriptor?
|
||||
raise TypeError('Key already defined as: %r' % self[key])
|
||||
self._member_names.append(key)
|
||||
super(_EnumDict, self).__setitem__(key, value)
|
||||
|
||||
|
||||
# Dummy value for Enum as EnumMeta explicity checks for it, but of course until
|
||||
# EnumMeta finishes running the first time the Enum class doesn't exist. This
|
||||
# is also why there are checks in EnumMeta like `if Enum is not None`
|
||||
Enum = None
|
||||
|
||||
|
||||
class EnumMeta(type):
|
||||
"""Metaclass for Enum"""
|
||||
@classmethod
|
||||
def __prepare__(metacls, cls, bases):
|
||||
return _EnumDict()
|
||||
|
||||
def __new__(metacls, cls, bases, classdict):
|
||||
# an Enum class is final once enumeration items have been defined; it
|
||||
# cannot be mixed with other types (int, float, etc.) if it has an
|
||||
# inherited __new__ unless a new __new__ is defined (or the resulting
|
||||
# class will fail).
|
||||
if type(classdict) is dict:
|
||||
original_dict = classdict
|
||||
classdict = _EnumDict()
|
||||
for k, v in original_dict.items():
|
||||
classdict[k] = v
|
||||
|
||||
member_type, first_enum = metacls._get_mixins_(bases)
|
||||
__new__, save_new, use_args = metacls._find_new_(classdict, member_type,
|
||||
first_enum)
|
||||
# save enum items into separate mapping so they don't get baked into
|
||||
# the new class
|
||||
members = dict((k, classdict[k]) for k in classdict._member_names)
|
||||
for name in classdict._member_names:
|
||||
del classdict[name]
|
||||
|
||||
# py2 support for definition order
|
||||
_order_ = classdict.get('_order_')
|
||||
if _order_ is None:
|
||||
if pyver < 3.0:
|
||||
try:
|
||||
_order_ = [name for (name, value) in sorted(members.items(), key=lambda item: item[1])]
|
||||
except TypeError:
|
||||
_order_ = [name for name in sorted(members.keys())]
|
||||
else:
|
||||
_order_ = classdict._member_names
|
||||
else:
|
||||
del classdict['_order_']
|
||||
if pyver < 3.0:
|
||||
_order_ = _order_.replace(',', ' ').split()
|
||||
aliases = [name for name in members if name not in _order_]
|
||||
_order_ += aliases
|
||||
|
||||
# check for illegal enum names (any others?)
|
||||
invalid_names = set(members) & set(['mro'])
|
||||
if invalid_names:
|
||||
raise ValueError('Invalid enum member name(s): %s' % (
|
||||
', '.join(invalid_names), ))
|
||||
|
||||
# save attributes from super classes so we know if we can take
|
||||
# the shortcut of storing members in the class dict
|
||||
base_attributes = set([a for b in bases for a in b.__dict__])
|
||||
# create our new Enum type
|
||||
enum_class = super(EnumMeta, metacls).__new__(metacls, cls, bases, classdict)
|
||||
enum_class._member_names_ = [] # names in random order
|
||||
if OrderedDict is not None:
|
||||
enum_class._member_map_ = OrderedDict()
|
||||
else:
|
||||
enum_class._member_map_ = {} # name->value map
|
||||
enum_class._member_type_ = member_type
|
||||
|
||||
# Reverse value->name map for hashable values.
|
||||
enum_class._value2member_map_ = {}
|
||||
|
||||
# instantiate them, checking for duplicates as we go
|
||||
# we instantiate first instead of checking for duplicates first in case
|
||||
# a custom __new__ is doing something funky with the values -- such as
|
||||
# auto-numbering ;)
|
||||
if __new__ is None:
|
||||
__new__ = enum_class.__new__
|
||||
for member_name in _order_:
|
||||
value = members[member_name]
|
||||
if not isinstance(value, tuple):
|
||||
args = (value, )
|
||||
else:
|
||||
args = value
|
||||
if member_type is tuple: # special case for tuple enums
|
||||
args = (args, ) # wrap it one more time
|
||||
if not use_args or not args:
|
||||
enum_member = __new__(enum_class)
|
||||
if not hasattr(enum_member, '_value_'):
|
||||
enum_member._value_ = value
|
||||
else:
|
||||
enum_member = __new__(enum_class, *args)
|
||||
if not hasattr(enum_member, '_value_'):
|
||||
enum_member._value_ = member_type(*args)
|
||||
value = enum_member._value_
|
||||
enum_member._name_ = member_name
|
||||
enum_member.__objclass__ = enum_class
|
||||
enum_member.__init__(*args)
|
||||
# If another member with the same value was already defined, the
|
||||
# new member becomes an alias to the existing one.
|
||||
for name, canonical_member in enum_class._member_map_.items():
|
||||
if canonical_member.value == enum_member._value_:
|
||||
enum_member = canonical_member
|
||||
break
|
||||
else:
|
||||
# Aliases don't appear in member names (only in __members__).
|
||||
enum_class._member_names_.append(member_name)
|
||||
# performance boost for any member that would not shadow
|
||||
# a DynamicClassAttribute (aka _RouteClassAttributeToGetattr)
|
||||
if member_name not in base_attributes:
|
||||
setattr(enum_class, member_name, enum_member)
|
||||
# now add to _member_map_
|
||||
enum_class._member_map_[member_name] = enum_member
|
||||
try:
|
||||
# This may fail if value is not hashable. We can't add the value
|
||||
# to the map, and by-value lookups for this value will be
|
||||
# linear.
|
||||
enum_class._value2member_map_[value] = enum_member
|
||||
except TypeError:
|
||||
pass
|
||||
|
||||
|
||||
# If a custom type is mixed into the Enum, and it does not know how
|
||||
# to pickle itself, pickle.dumps will succeed but pickle.loads will
|
||||
# fail. Rather than have the error show up later and possibly far
|
||||
# from the source, sabotage the pickle protocol for this class so
|
||||
# that pickle.dumps also fails.
|
||||
#
|
||||
# However, if the new class implements its own __reduce_ex__, do not
|
||||
# sabotage -- it's on them to make sure it works correctly. We use
|
||||
# __reduce_ex__ instead of any of the others as it is preferred by
|
||||
# pickle over __reduce__, and it handles all pickle protocols.
|
||||
unpicklable = False
|
||||
if '__reduce_ex__' not in classdict:
|
||||
if member_type is not object:
|
||||
methods = ('__getnewargs_ex__', '__getnewargs__',
|
||||
'__reduce_ex__', '__reduce__')
|
||||
if not any(m in member_type.__dict__ for m in methods):
|
||||
_make_class_unpicklable(enum_class)
|
||||
unpicklable = True
|
||||
|
||||
|
||||
# double check that repr and friends are not the mixin's or various
|
||||
# things break (such as pickle)
|
||||
for name in ('__repr__', '__str__', '__format__', '__reduce_ex__'):
|
||||
class_method = getattr(enum_class, name)
|
||||
obj_method = getattr(member_type, name, None)
|
||||
enum_method = getattr(first_enum, name, None)
|
||||
if name not in classdict and class_method is not enum_method:
|
||||
if name == '__reduce_ex__' and unpicklable:
|
||||
continue
|
||||
setattr(enum_class, name, enum_method)
|
||||
|
||||
# method resolution and int's are not playing nice
|
||||
# Python's less than 2.6 use __cmp__
|
||||
|
||||
if pyver < 2.6:
|
||||
|
||||
if issubclass(enum_class, int):
|
||||
setattr(enum_class, '__cmp__', getattr(int, '__cmp__'))
|
||||
|
||||
elif pyver < 3.0:
|
||||
|
||||
if issubclass(enum_class, int):
|
||||
for method in (
|
||||
'__le__',
|
||||
'__lt__',
|
||||
'__gt__',
|
||||
'__ge__',
|
||||
'__eq__',
|
||||
'__ne__',
|
||||
'__hash__',
|
||||
):
|
||||
setattr(enum_class, method, getattr(int, method))
|
||||
|
||||
# replace any other __new__ with our own (as long as Enum is not None,
|
||||
# anyway) -- again, this is to support pickle
|
||||
if Enum is not None:
|
||||
# if the user defined their own __new__, save it before it gets
|
||||
# clobbered in case they subclass later
|
||||
if save_new:
|
||||
setattr(enum_class, '__member_new__', enum_class.__dict__['__new__'])
|
||||
setattr(enum_class, '__new__', Enum.__dict__['__new__'])
|
||||
return enum_class
|
||||
|
||||
def __bool__(cls):
|
||||
"""
|
||||
classes/types should always be True.
|
||||
"""
|
||||
return True
|
||||
|
||||
def __call__(cls, value, names=None, module=None, type=None, start=1):
|
||||
"""Either returns an existing member, or creates a new enum class.
|
||||
|
||||
This method is used both when an enum class is given a value to match
|
||||
to an enumeration member (i.e. Color(3)) and for the functional API
|
||||
(i.e. Color = Enum('Color', names='red green blue')).
|
||||
|
||||
When used for the functional API: `module`, if set, will be stored in
|
||||
the new class' __module__ attribute; `type`, if set, will be mixed in
|
||||
as the first base class.
|
||||
|
||||
Note: if `module` is not set this routine will attempt to discover the
|
||||
calling module by walking the frame stack; if this is unsuccessful
|
||||
the resulting class will not be pickleable.
|
||||
|
||||
"""
|
||||
if names is None: # simple value lookup
|
||||
return cls.__new__(cls, value)
|
||||
# otherwise, functional API: we're creating a new Enum type
|
||||
return cls._create_(value, names, module=module, type=type, start=start)
|
||||
|
||||
def __contains__(cls, member):
|
||||
return isinstance(member, cls) and member.name in cls._member_map_
|
||||
|
||||
def __delattr__(cls, attr):
|
||||
# nicer error message when someone tries to delete an attribute
|
||||
# (see issue19025).
|
||||
if attr in cls._member_map_:
|
||||
raise AttributeError(
|
||||
"%s: cannot delete Enum member." % cls.__name__)
|
||||
super(EnumMeta, cls).__delattr__(attr)
|
||||
|
||||
def __dir__(self):
|
||||
return (['__class__', '__doc__', '__members__', '__module__'] +
|
||||
self._member_names_)
|
||||
|
||||
@property
|
||||
def __members__(cls):
|
||||
"""Returns a mapping of member name->value.
|
||||
|
||||
This mapping lists all enum members, including aliases. Note that this
|
||||
is a copy of the internal mapping.
|
||||
|
||||
"""
|
||||
return cls._member_map_.copy()
|
||||
|
||||
def __getattr__(cls, name):
|
||||
"""Return the enum member matching `name`
|
||||
|
||||
We use __getattr__ instead of descriptors or inserting into the enum
|
||||
class' __dict__ in order to support `name` and `value` being both
|
||||
properties for enum members (which live in the class' __dict__) and
|
||||
enum members themselves.
|
||||
|
||||
"""
|
||||
if _is_dunder(name):
|
||||
raise AttributeError(name)
|
||||
try:
|
||||
return cls._member_map_[name]
|
||||
except KeyError:
|
||||
raise AttributeError(name)
|
||||
|
||||
def __getitem__(cls, name):
|
||||
return cls._member_map_[name]
|
||||
|
||||
def __iter__(cls):
|
||||
return (cls._member_map_[name] for name in cls._member_names_)
|
||||
|
||||
def __reversed__(cls):
|
||||
return (cls._member_map_[name] for name in reversed(cls._member_names_))
|
||||
|
||||
def __len__(cls):
|
||||
return len(cls._member_names_)
|
||||
|
||||
__nonzero__ = __bool__
|
||||
|
||||
def __repr__(cls):
|
||||
return "<enum %r>" % cls.__name__
|
||||
|
||||
def __setattr__(cls, name, value):
|
||||
"""Block attempts to reassign Enum members.
|
||||
|
||||
A simple assignment to the class namespace only changes one of the
|
||||
several possible ways to get an Enum member from the Enum class,
|
||||
resulting in an inconsistent Enumeration.
|
||||
|
||||
"""
|
||||
member_map = cls.__dict__.get('_member_map_', {})
|
||||
if name in member_map:
|
||||
raise AttributeError('Cannot reassign members.')
|
||||
super(EnumMeta, cls).__setattr__(name, value)
|
||||
|
||||
def _create_(cls, class_name, names=None, module=None, type=None, start=1):
|
||||
"""Convenience method to create a new Enum class.
|
||||
|
||||
`names` can be:
|
||||
|
||||
* A string containing member names, separated either with spaces or
|
||||
commas. Values are auto-numbered from 1.
|
||||
* An iterable of member names. Values are auto-numbered from 1.
|
||||
* An iterable of (member name, value) pairs.
|
||||
* A mapping of member name -> value.
|
||||
|
||||
"""
|
||||
if pyver < 3.0:
|
||||
# if class_name is unicode, attempt a conversion to ASCII
|
||||
if isinstance(class_name, unicode):
|
||||
try:
|
||||
class_name = class_name.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
raise TypeError('%r is not representable in ASCII' % class_name)
|
||||
metacls = cls.__class__
|
||||
if type is None:
|
||||
bases = (cls, )
|
||||
else:
|
||||
bases = (type, cls)
|
||||
classdict = metacls.__prepare__(class_name, bases)
|
||||
_order_ = []
|
||||
|
||||
# special processing needed for names?
|
||||
if isinstance(names, basestring):
|
||||
names = names.replace(',', ' ').split()
|
||||
if isinstance(names, (tuple, list)) and isinstance(names[0], basestring):
|
||||
names = [(e, i+start) for (i, e) in enumerate(names)]
|
||||
|
||||
# Here, names is either an iterable of (name, value) or a mapping.
|
||||
item = None # in case names is empty
|
||||
for item in names:
|
||||
if isinstance(item, basestring):
|
||||
member_name, member_value = item, names[item]
|
||||
else:
|
||||
member_name, member_value = item
|
||||
classdict[member_name] = member_value
|
||||
_order_.append(member_name)
|
||||
# only set _order_ in classdict if name/value was not from a mapping
|
||||
if not isinstance(item, basestring):
|
||||
classdict['_order_'] = ' '.join(_order_)
|
||||
enum_class = metacls.__new__(metacls, class_name, bases, classdict)
|
||||
|
||||
# TODO: replace the frame hack if a blessed way to know the calling
|
||||
# module is ever developed
|
||||
if module is None:
|
||||
try:
|
||||
module = _sys._getframe(2).f_globals['__name__']
|
||||
except (AttributeError, ValueError):
|
||||
pass
|
||||
if module is None:
|
||||
_make_class_unpicklable(enum_class)
|
||||
else:
|
||||
enum_class.__module__ = module
|
||||
|
||||
return enum_class
|
||||
|
||||
@staticmethod
|
||||
def _get_mixins_(bases):
|
||||
"""Returns the type for creating enum members, and the first inherited
|
||||
enum class.
|
||||
|
||||
bases: the tuple of bases that was given to __new__
|
||||
|
||||
"""
|
||||
if not bases or Enum is None:
|
||||
return object, Enum
|
||||
|
||||
|
||||
# double check that we are not subclassing a class with existing
|
||||
# enumeration members; while we're at it, see if any other data
|
||||
# type has been mixed in so we can use the correct __new__
|
||||
member_type = first_enum = None
|
||||
for base in bases:
|
||||
if (base is not Enum and
|
||||
issubclass(base, Enum) and
|
||||
base._member_names_):
|
||||
raise TypeError("Cannot extend enumerations")
|
||||
# base is now the last base in bases
|
||||
if not issubclass(base, Enum):
|
||||
raise TypeError("new enumerations must be created as "
|
||||
"`ClassName([mixin_type,] enum_type)`")
|
||||
|
||||
# get correct mix-in type (either mix-in type of Enum subclass, or
|
||||
# first base if last base is Enum)
|
||||
if not issubclass(bases[0], Enum):
|
||||
member_type = bases[0] # first data type
|
||||
first_enum = bases[-1] # enum type
|
||||
else:
|
||||
for base in bases[0].__mro__:
|
||||
# most common: (IntEnum, int, Enum, object)
|
||||
# possible: (<Enum 'AutoIntEnum'>, <Enum 'IntEnum'>,
|
||||
# <class 'int'>, <Enum 'Enum'>,
|
||||
# <class 'object'>)
|
||||
if issubclass(base, Enum):
|
||||
if first_enum is None:
|
||||
first_enum = base
|
||||
else:
|
||||
if member_type is None:
|
||||
member_type = base
|
||||
|
||||
return member_type, first_enum
|
||||
|
||||
if pyver < 3.0:
|
||||
@staticmethod
|
||||
def _find_new_(classdict, member_type, first_enum):
|
||||
"""Returns the __new__ to be used for creating the enum members.
|
||||
|
||||
classdict: the class dictionary given to __new__
|
||||
member_type: the data type whose __new__ will be used by default
|
||||
first_enum: enumeration to check for an overriding __new__
|
||||
|
||||
"""
|
||||
# now find the correct __new__, checking to see of one was defined
|
||||
# by the user; also check earlier enum classes in case a __new__ was
|
||||
# saved as __member_new__
|
||||
__new__ = classdict.get('__new__', None)
|
||||
if __new__:
|
||||
return None, True, True # __new__, save_new, use_args
|
||||
|
||||
N__new__ = getattr(None, '__new__')
|
||||
O__new__ = getattr(object, '__new__')
|
||||
if Enum is None:
|
||||
E__new__ = N__new__
|
||||
else:
|
||||
E__new__ = Enum.__dict__['__new__']
|
||||
# check all possibles for __member_new__ before falling back to
|
||||
# __new__
|
||||
for method in ('__member_new__', '__new__'):
|
||||
for possible in (member_type, first_enum):
|
||||
try:
|
||||
target = possible.__dict__[method]
|
||||
except (AttributeError, KeyError):
|
||||
target = getattr(possible, method, None)
|
||||
if target not in [
|
||||
None,
|
||||
N__new__,
|
||||
O__new__,
|
||||
E__new__,
|
||||
]:
|
||||
if method == '__member_new__':
|
||||
classdict['__new__'] = target
|
||||
return None, False, True
|
||||
if isinstance(target, staticmethod):
|
||||
target = target.__get__(member_type)
|
||||
__new__ = target
|
||||
break
|
||||
if __new__ is not None:
|
||||
break
|
||||
else:
|
||||
__new__ = object.__new__
|
||||
|
||||
# if a non-object.__new__ is used then whatever value/tuple was
|
||||
# assigned to the enum member name will be passed to __new__ and to the
|
||||
# new enum member's __init__
|
||||
if __new__ is object.__new__:
|
||||
use_args = False
|
||||
else:
|
||||
use_args = True
|
||||
|
||||
return __new__, False, use_args
|
||||
else:
|
||||
@staticmethod
|
||||
def _find_new_(classdict, member_type, first_enum):
|
||||
"""Returns the __new__ to be used for creating the enum members.
|
||||
|
||||
classdict: the class dictionary given to __new__
|
||||
member_type: the data type whose __new__ will be used by default
|
||||
first_enum: enumeration to check for an overriding __new__
|
||||
|
||||
"""
|
||||
# now find the correct __new__, checking to see of one was defined
|
||||
# by the user; also check earlier enum classes in case a __new__ was
|
||||
# saved as __member_new__
|
||||
__new__ = classdict.get('__new__', None)
|
||||
|
||||
# should __new__ be saved as __member_new__ later?
|
||||
save_new = __new__ is not None
|
||||
|
||||
if __new__ is None:
|
||||
# check all possibles for __member_new__ before falling back to
|
||||
# __new__
|
||||
for method in ('__member_new__', '__new__'):
|
||||
for possible in (member_type, first_enum):
|
||||
target = getattr(possible, method, None)
|
||||
if target not in (
|
||||
None,
|
||||
None.__new__,
|
||||
object.__new__,
|
||||
Enum.__new__,
|
||||
):
|
||||
__new__ = target
|
||||
break
|
||||
if __new__ is not None:
|
||||
break
|
||||
else:
|
||||
__new__ = object.__new__
|
||||
|
||||
# if a non-object.__new__ is used then whatever value/tuple was
|
||||
# assigned to the enum member name will be passed to __new__ and to the
|
||||
# new enum member's __init__
|
||||
if __new__ is object.__new__:
|
||||
use_args = False
|
||||
else:
|
||||
use_args = True
|
||||
|
||||
return __new__, save_new, use_args
|
||||
|
||||
|
||||
########################################################
|
||||
# In order to support Python 2 and 3 with a single
|
||||
# codebase we have to create the Enum methods separately
|
||||
# and then use the `type(name, bases, dict)` method to
|
||||
# create the class.
|
||||
########################################################
|
||||
temp_enum_dict = {}
|
||||
temp_enum_dict['__doc__'] = "Generic enumeration.\n\n Derive from this class to define new enumerations.\n\n"
|
||||
|
||||
def __new__(cls, value):
|
||||
# all enum instances are actually created during class construction
|
||||
# without calling this method; this method is called by the metaclass'
|
||||
# __call__ (i.e. Color(3) ), and by pickle
|
||||
if type(value) is cls:
|
||||
# For lookups like Color(Color.red)
|
||||
value = value.value
|
||||
#return value
|
||||
# by-value search for a matching enum member
|
||||
# see if it's in the reverse mapping (for hashable values)
|
||||
try:
|
||||
if value in cls._value2member_map_:
|
||||
return cls._value2member_map_[value]
|
||||
except TypeError:
|
||||
# not there, now do long search -- O(n) behavior
|
||||
for member in cls._member_map_.values():
|
||||
if member.value == value:
|
||||
return member
|
||||
raise ValueError("%s is not a valid %s" % (value, cls.__name__))
|
||||
temp_enum_dict['__new__'] = __new__
|
||||
del __new__
|
||||
|
||||
def __repr__(self):
|
||||
return "<%s.%s: %r>" % (
|
||||
self.__class__.__name__, self._name_, self._value_)
|
||||
temp_enum_dict['__repr__'] = __repr__
|
||||
del __repr__
|
||||
|
||||
def __str__(self):
|
||||
return "%s.%s" % (self.__class__.__name__, self._name_)
|
||||
temp_enum_dict['__str__'] = __str__
|
||||
del __str__
|
||||
|
||||
if pyver >= 3.0:
|
||||
def __dir__(self):
|
||||
added_behavior = [
|
||||
m
|
||||
for cls in self.__class__.mro()
|
||||
for m in cls.__dict__
|
||||
if m[0] != '_' and m not in self._member_map_
|
||||
]
|
||||
return (['__class__', '__doc__', '__module__', ] + added_behavior)
|
||||
temp_enum_dict['__dir__'] = __dir__
|
||||
del __dir__
|
||||
|
||||
def __format__(self, format_spec):
|
||||
# mixed-in Enums should use the mixed-in type's __format__, otherwise
|
||||
# we can get strange results with the Enum name showing up instead of
|
||||
# the value
|
||||
|
||||
# pure Enum branch
|
||||
if self._member_type_ is object:
|
||||
cls = str
|
||||
val = str(self)
|
||||
# mix-in branch
|
||||
else:
|
||||
cls = self._member_type_
|
||||
val = self.value
|
||||
return cls.__format__(val, format_spec)
|
||||
temp_enum_dict['__format__'] = __format__
|
||||
del __format__
|
||||
|
||||
|
||||
####################################
|
||||
# Python's less than 2.6 use __cmp__
|
||||
|
||||
if pyver < 2.6:
|
||||
|
||||
def __cmp__(self, other):
|
||||
if type(other) is self.__class__:
|
||||
if self is other:
|
||||
return 0
|
||||
return -1
|
||||
return NotImplemented
|
||||
raise TypeError("unorderable types: %s() and %s()" % (self.__class__.__name__, other.__class__.__name__))
|
||||
temp_enum_dict['__cmp__'] = __cmp__
|
||||
del __cmp__
|
||||
|
||||
else:
|
||||
|
||||
def __le__(self, other):
|
||||
raise TypeError("unorderable types: %s() <= %s()" % (self.__class__.__name__, other.__class__.__name__))
|
||||
temp_enum_dict['__le__'] = __le__
|
||||
del __le__
|
||||
|
||||
def __lt__(self, other):
|
||||
raise TypeError("unorderable types: %s() < %s()" % (self.__class__.__name__, other.__class__.__name__))
|
||||
temp_enum_dict['__lt__'] = __lt__
|
||||
del __lt__
|
||||
|
||||
def __ge__(self, other):
|
||||
raise TypeError("unorderable types: %s() >= %s()" % (self.__class__.__name__, other.__class__.__name__))
|
||||
temp_enum_dict['__ge__'] = __ge__
|
||||
del __ge__
|
||||
|
||||
def __gt__(self, other):
|
||||
raise TypeError("unorderable types: %s() > %s()" % (self.__class__.__name__, other.__class__.__name__))
|
||||
temp_enum_dict['__gt__'] = __gt__
|
||||
del __gt__
|
||||
|
||||
|
||||
def __eq__(self, other):
|
||||
if type(other) is self.__class__:
|
||||
return self is other
|
||||
return NotImplemented
|
||||
temp_enum_dict['__eq__'] = __eq__
|
||||
del __eq__
|
||||
|
||||
def __ne__(self, other):
|
||||
if type(other) is self.__class__:
|
||||
return self is not other
|
||||
return NotImplemented
|
||||
temp_enum_dict['__ne__'] = __ne__
|
||||
del __ne__
|
||||
|
||||
def __hash__(self):
|
||||
return hash(self._name_)
|
||||
temp_enum_dict['__hash__'] = __hash__
|
||||
del __hash__
|
||||
|
||||
def __reduce_ex__(self, proto):
|
||||
return self.__class__, (self._value_, )
|
||||
temp_enum_dict['__reduce_ex__'] = __reduce_ex__
|
||||
del __reduce_ex__
|
||||
|
||||
# _RouteClassAttributeToGetattr is used to provide access to the `name`
|
||||
# and `value` properties of enum members while keeping some measure of
|
||||
# protection from modification, while still allowing for an enumeration
|
||||
# to have members named `name` and `value`. This works because enumeration
|
||||
# members are not set directly on the enum class -- __getattr__ is
|
||||
# used to look them up.
|
||||
|
||||
@_RouteClassAttributeToGetattr
|
||||
def name(self):
|
||||
return self._name_
|
||||
temp_enum_dict['name'] = name
|
||||
del name
|
||||
|
||||
@_RouteClassAttributeToGetattr
|
||||
def value(self):
|
||||
return self._value_
|
||||
temp_enum_dict['value'] = value
|
||||
del value
|
||||
|
||||
@classmethod
|
||||
def _convert(cls, name, module, filter, source=None):
|
||||
"""
|
||||
Create a new Enum subclass that replaces a collection of global constants
|
||||
"""
|
||||
# convert all constants from source (or module) that pass filter() to
|
||||
# a new Enum called name, and export the enum and its members back to
|
||||
# module;
|
||||
# also, replace the __reduce_ex__ method so unpickling works in
|
||||
# previous Python versions
|
||||
module_globals = vars(_sys.modules[module])
|
||||
if source:
|
||||
source = vars(source)
|
||||
else:
|
||||
source = module_globals
|
||||
members = dict((name, value) for name, value in source.items() if filter(name))
|
||||
cls = cls(name, members, module=module)
|
||||
cls.__reduce_ex__ = _reduce_ex_by_name
|
||||
module_globals.update(cls.__members__)
|
||||
module_globals[name] = cls
|
||||
return cls
|
||||
temp_enum_dict['_convert'] = _convert
|
||||
del _convert
|
||||
|
||||
Enum = EnumMeta('Enum', (object, ), temp_enum_dict)
|
||||
del temp_enum_dict
|
||||
|
||||
# Enum has now been created
|
||||
###########################
|
||||
|
||||
class IntEnum(int, Enum):
|
||||
"""Enum where members are also (and must be) ints"""
|
||||
|
||||
def _reduce_ex_by_name(self, proto):
|
||||
return self.name
|
||||
|
||||
def unique(enumeration):
|
||||
"""Class decorator that ensures only unique members exist in an enumeration."""
|
||||
duplicates = []
|
||||
for name, member in enumeration.__members__.items():
|
||||
if name != member.name:
|
||||
duplicates.append((name, member.name))
|
||||
if duplicates:
|
||||
duplicate_names = ', '.join(
|
||||
["%s -> %s" % (alias, name) for (alias, name) in duplicates]
|
||||
)
|
||||
raise ValueError('duplicate names found in %r: %s' %
|
||||
(enumeration, duplicate_names)
|
||||
)
|
||||
return enumeration
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,735 +0,0 @@
|
||||
``enum`` --- support for enumerations
|
||||
========================================
|
||||
|
||||
.. :synopsis: enumerations are sets of symbolic names bound to unique, constant
|
||||
values.
|
||||
.. :moduleauthor:: Ethan Furman <ethan@stoneleaf.us>
|
||||
.. :sectionauthor:: Barry Warsaw <barry@python.org>,
|
||||
.. :sectionauthor:: Eli Bendersky <eliben@gmail.com>,
|
||||
.. :sectionauthor:: Ethan Furman <ethan@stoneleaf.us>
|
||||
|
||||
----------------
|
||||
|
||||
An enumeration is a set of symbolic names (members) bound to unique, constant
|
||||
values. Within an enumeration, the members can be compared by identity, and
|
||||
the enumeration itself can be iterated over.
|
||||
|
||||
|
||||
Module Contents
|
||||
---------------
|
||||
|
||||
This module defines two enumeration classes that can be used to define unique
|
||||
sets of names and values: ``Enum`` and ``IntEnum``. It also defines
|
||||
one decorator, ``unique``.
|
||||
|
||||
``Enum``
|
||||
|
||||
Base class for creating enumerated constants. See section `Functional API`_
|
||||
for an alternate construction syntax.
|
||||
|
||||
``IntEnum``
|
||||
|
||||
Base class for creating enumerated constants that are also subclasses of ``int``.
|
||||
|
||||
``unique``
|
||||
|
||||
Enum class decorator that ensures only one name is bound to any one value.
|
||||
|
||||
|
||||
Creating an Enum
|
||||
----------------
|
||||
|
||||
Enumerations are created using the ``class`` syntax, which makes them
|
||||
easy to read and write. An alternative creation method is described in
|
||||
`Functional API`_. To define an enumeration, subclass ``Enum`` as
|
||||
follows::
|
||||
|
||||
>>> from enum import Enum
|
||||
>>> class Color(Enum):
|
||||
... red = 1
|
||||
... green = 2
|
||||
... blue = 3
|
||||
|
||||
Note: Nomenclature
|
||||
|
||||
- The class ``Color`` is an *enumeration* (or *enum*)
|
||||
- The attributes ``Color.red``, ``Color.green``, etc., are
|
||||
*enumeration members* (or *enum members*).
|
||||
- The enum members have *names* and *values* (the name of
|
||||
``Color.red`` is ``red``, the value of ``Color.blue`` is
|
||||
``3``, etc.)
|
||||
|
||||
Note:
|
||||
|
||||
Even though we use the ``class`` syntax to create Enums, Enums
|
||||
are not normal Python classes. See `How are Enums different?`_ for
|
||||
more details.
|
||||
|
||||
Enumeration members have human readable string representations::
|
||||
|
||||
>>> print(Color.red)
|
||||
Color.red
|
||||
|
||||
...while their ``repr`` has more information::
|
||||
|
||||
>>> print(repr(Color.red))
|
||||
<Color.red: 1>
|
||||
|
||||
The *type* of an enumeration member is the enumeration it belongs to::
|
||||
|
||||
>>> type(Color.red)
|
||||
<enum 'Color'>
|
||||
>>> isinstance(Color.green, Color)
|
||||
True
|
||||
>>>
|
||||
|
||||
Enum members also have a property that contains just their item name::
|
||||
|
||||
>>> print(Color.red.name)
|
||||
red
|
||||
|
||||
Enumerations support iteration. In Python 3.x definition order is used; in
|
||||
Python 2.x the definition order is not available, but class attribute
|
||||
``__order__`` is supported; otherwise, value order is used::
|
||||
|
||||
>>> class Shake(Enum):
|
||||
... __order__ = 'vanilla chocolate cookies mint' # only needed in 2.x
|
||||
... vanilla = 7
|
||||
... chocolate = 4
|
||||
... cookies = 9
|
||||
... mint = 3
|
||||
...
|
||||
>>> for shake in Shake:
|
||||
... print(shake)
|
||||
...
|
||||
Shake.vanilla
|
||||
Shake.chocolate
|
||||
Shake.cookies
|
||||
Shake.mint
|
||||
|
||||
The ``__order__`` attribute is always removed, and in 3.x it is also ignored
|
||||
(order is definition order); however, in the stdlib version it will be ignored
|
||||
but not removed.
|
||||
|
||||
Enumeration members are hashable, so they can be used in dictionaries and sets::
|
||||
|
||||
>>> apples = {}
|
||||
>>> apples[Color.red] = 'red delicious'
|
||||
>>> apples[Color.green] = 'granny smith'
|
||||
>>> apples == {Color.red: 'red delicious', Color.green: 'granny smith'}
|
||||
True
|
||||
|
||||
|
||||
Programmatic access to enumeration members and their attributes
|
||||
---------------------------------------------------------------
|
||||
|
||||
Sometimes it's useful to access members in enumerations programmatically (i.e.
|
||||
situations where ``Color.red`` won't do because the exact color is not known
|
||||
at program-writing time). ``Enum`` allows such access::
|
||||
|
||||
>>> Color(1)
|
||||
<Color.red: 1>
|
||||
>>> Color(3)
|
||||
<Color.blue: 3>
|
||||
|
||||
If you want to access enum members by *name*, use item access::
|
||||
|
||||
>>> Color['red']
|
||||
<Color.red: 1>
|
||||
>>> Color['green']
|
||||
<Color.green: 2>
|
||||
|
||||
If have an enum member and need its ``name`` or ``value``::
|
||||
|
||||
>>> member = Color.red
|
||||
>>> member.name
|
||||
'red'
|
||||
>>> member.value
|
||||
1
|
||||
|
||||
|
||||
Duplicating enum members and values
|
||||
-----------------------------------
|
||||
|
||||
Having two enum members (or any other attribute) with the same name is invalid;
|
||||
in Python 3.x this would raise an error, but in Python 2.x the second member
|
||||
simply overwrites the first::
|
||||
|
||||
>>> # python 2.x
|
||||
>>> class Shape(Enum):
|
||||
... square = 2
|
||||
... square = 3
|
||||
...
|
||||
>>> Shape.square
|
||||
<Shape.square: 3>
|
||||
|
||||
>>> # python 3.x
|
||||
>>> class Shape(Enum):
|
||||
... square = 2
|
||||
... square = 3
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
TypeError: Attempted to reuse key: 'square'
|
||||
|
||||
However, two enum members are allowed to have the same value. Given two members
|
||||
A and B with the same value (and A defined first), B is an alias to A. By-value
|
||||
lookup of the value of A and B will return A. By-name lookup of B will also
|
||||
return A::
|
||||
|
||||
>>> class Shape(Enum):
|
||||
... __order__ = 'square diamond circle alias_for_square' # only needed in 2.x
|
||||
... square = 2
|
||||
... diamond = 1
|
||||
... circle = 3
|
||||
... alias_for_square = 2
|
||||
...
|
||||
>>> Shape.square
|
||||
<Shape.square: 2>
|
||||
>>> Shape.alias_for_square
|
||||
<Shape.square: 2>
|
||||
>>> Shape(2)
|
||||
<Shape.square: 2>
|
||||
|
||||
|
||||
Allowing aliases is not always desirable. ``unique`` can be used to ensure
|
||||
that none exist in a particular enumeration::
|
||||
|
||||
>>> from enum import unique
|
||||
>>> @unique
|
||||
... class Mistake(Enum):
|
||||
... __order__ = 'one two three four' # only needed in 2.x
|
||||
... one = 1
|
||||
... two = 2
|
||||
... three = 3
|
||||
... four = 3
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: duplicate names found in <enum 'Mistake'>: four -> three
|
||||
|
||||
Iterating over the members of an enum does not provide the aliases::
|
||||
|
||||
>>> list(Shape)
|
||||
[<Shape.square: 2>, <Shape.diamond: 1>, <Shape.circle: 3>]
|
||||
|
||||
The special attribute ``__members__`` is a dictionary mapping names to members.
|
||||
It includes all names defined in the enumeration, including the aliases::
|
||||
|
||||
>>> for name, member in sorted(Shape.__members__.items()):
|
||||
... name, member
|
||||
...
|
||||
('alias_for_square', <Shape.square: 2>)
|
||||
('circle', <Shape.circle: 3>)
|
||||
('diamond', <Shape.diamond: 1>)
|
||||
('square', <Shape.square: 2>)
|
||||
|
||||
The ``__members__`` attribute can be used for detailed programmatic access to
|
||||
the enumeration members. For example, finding all the aliases::
|
||||
|
||||
>>> [name for name, member in Shape.__members__.items() if member.name != name]
|
||||
['alias_for_square']
|
||||
|
||||
Comparisons
|
||||
-----------
|
||||
|
||||
Enumeration members are compared by identity::
|
||||
|
||||
>>> Color.red is Color.red
|
||||
True
|
||||
>>> Color.red is Color.blue
|
||||
False
|
||||
>>> Color.red is not Color.blue
|
||||
True
|
||||
|
||||
Ordered comparisons between enumeration values are *not* supported. Enum
|
||||
members are not integers (but see `IntEnum`_ below)::
|
||||
|
||||
>>> Color.red < Color.blue
|
||||
Traceback (most recent call last):
|
||||
File "<stdin>", line 1, in <module>
|
||||
TypeError: unorderable types: Color() < Color()
|
||||
|
||||
.. warning::
|
||||
|
||||
In Python 2 *everything* is ordered, even though the ordering may not
|
||||
make sense. If you want your enumerations to have a sensible ordering
|
||||
check out the `OrderedEnum`_ recipe below.
|
||||
|
||||
|
||||
Equality comparisons are defined though::
|
||||
|
||||
>>> Color.blue == Color.red
|
||||
False
|
||||
>>> Color.blue != Color.red
|
||||
True
|
||||
>>> Color.blue == Color.blue
|
||||
True
|
||||
|
||||
Comparisons against non-enumeration values will always compare not equal
|
||||
(again, ``IntEnum`` was explicitly designed to behave differently, see
|
||||
below)::
|
||||
|
||||
>>> Color.blue == 2
|
||||
False
|
||||
|
||||
|
||||
Allowed members and attributes of enumerations
|
||||
----------------------------------------------
|
||||
|
||||
The examples above use integers for enumeration values. Using integers is
|
||||
short and handy (and provided by default by the `Functional API`_), but not
|
||||
strictly enforced. In the vast majority of use-cases, one doesn't care what
|
||||
the actual value of an enumeration is. But if the value *is* important,
|
||||
enumerations can have arbitrary values.
|
||||
|
||||
Enumerations are Python classes, and can have methods and special methods as
|
||||
usual. If we have this enumeration::
|
||||
|
||||
>>> class Mood(Enum):
|
||||
... funky = 1
|
||||
... happy = 3
|
||||
...
|
||||
... def describe(self):
|
||||
... # self is the member here
|
||||
... return self.name, self.value
|
||||
...
|
||||
... def __str__(self):
|
||||
... return 'my custom str! {0}'.format(self.value)
|
||||
...
|
||||
... @classmethod
|
||||
... def favorite_mood(cls):
|
||||
... # cls here is the enumeration
|
||||
... return cls.happy
|
||||
|
||||
Then::
|
||||
|
||||
>>> Mood.favorite_mood()
|
||||
<Mood.happy: 3>
|
||||
>>> Mood.happy.describe()
|
||||
('happy', 3)
|
||||
>>> str(Mood.funky)
|
||||
'my custom str! 1'
|
||||
|
||||
The rules for what is allowed are as follows: _sunder_ names (starting and
|
||||
ending with a single underscore) are reserved by enum and cannot be used;
|
||||
all other attributes defined within an enumeration will become members of this
|
||||
enumeration, with the exception of *__dunder__* names and descriptors (methods
|
||||
are also descriptors).
|
||||
|
||||
Note:
|
||||
|
||||
If your enumeration defines ``__new__`` and/or ``__init__`` then
|
||||
whatever value(s) were given to the enum member will be passed into
|
||||
those methods. See `Planet`_ for an example.
|
||||
|
||||
|
||||
Restricted subclassing of enumerations
|
||||
--------------------------------------
|
||||
|
||||
Subclassing an enumeration is allowed only if the enumeration does not define
|
||||
any members. So this is forbidden::
|
||||
|
||||
>>> class MoreColor(Color):
|
||||
... pink = 17
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
TypeError: Cannot extend enumerations
|
||||
|
||||
But this is allowed::
|
||||
|
||||
>>> class Foo(Enum):
|
||||
... def some_behavior(self):
|
||||
... pass
|
||||
...
|
||||
>>> class Bar(Foo):
|
||||
... happy = 1
|
||||
... sad = 2
|
||||
...
|
||||
|
||||
Allowing subclassing of enums that define members would lead to a violation of
|
||||
some important invariants of types and instances. On the other hand, it makes
|
||||
sense to allow sharing some common behavior between a group of enumerations.
|
||||
(See `OrderedEnum`_ for an example.)
|
||||
|
||||
|
||||
Pickling
|
||||
--------
|
||||
|
||||
Enumerations can be pickled and unpickled::
|
||||
|
||||
>>> from enum.test_enum import Fruit
|
||||
>>> from pickle import dumps, loads
|
||||
>>> Fruit.tomato is loads(dumps(Fruit.tomato, 2))
|
||||
True
|
||||
|
||||
The usual restrictions for pickling apply: picklable enums must be defined in
|
||||
the top level of a module, since unpickling requires them to be importable
|
||||
from that module.
|
||||
|
||||
Note:
|
||||
|
||||
With pickle protocol version 4 (introduced in Python 3.4) it is possible
|
||||
to easily pickle enums nested in other classes.
|
||||
|
||||
|
||||
|
||||
Functional API
|
||||
--------------
|
||||
|
||||
The ``Enum`` class is callable, providing the following functional API::
|
||||
|
||||
>>> Animal = Enum('Animal', 'ant bee cat dog')
|
||||
>>> Animal
|
||||
<enum 'Animal'>
|
||||
>>> Animal.ant
|
||||
<Animal.ant: 1>
|
||||
>>> Animal.ant.value
|
||||
1
|
||||
>>> list(Animal)
|
||||
[<Animal.ant: 1>, <Animal.bee: 2>, <Animal.cat: 3>, <Animal.dog: 4>]
|
||||
|
||||
The semantics of this API resemble ``namedtuple``. The first argument
|
||||
of the call to ``Enum`` is the name of the enumeration.
|
||||
|
||||
The second argument is the *source* of enumeration member names. It can be a
|
||||
whitespace-separated string of names, a sequence of names, a sequence of
|
||||
2-tuples with key/value pairs, or a mapping (e.g. dictionary) of names to
|
||||
values. The last two options enable assigning arbitrary values to
|
||||
enumerations; the others auto-assign increasing integers starting with 1. A
|
||||
new class derived from ``Enum`` is returned. In other words, the above
|
||||
assignment to ``Animal`` is equivalent to::
|
||||
|
||||
>>> class Animals(Enum):
|
||||
... ant = 1
|
||||
... bee = 2
|
||||
... cat = 3
|
||||
... dog = 4
|
||||
|
||||
Pickling enums created with the functional API can be tricky as frame stack
|
||||
implementation details are used to try and figure out which module the
|
||||
enumeration is being created in (e.g. it will fail if you use a utility
|
||||
function in separate module, and also may not work on IronPython or Jython).
|
||||
The solution is to specify the module name explicitly as follows::
|
||||
|
||||
>>> Animals = Enum('Animals', 'ant bee cat dog', module=__name__)
|
||||
|
||||
Derived Enumerations
|
||||
--------------------
|
||||
|
||||
IntEnum
|
||||
^^^^^^^
|
||||
|
||||
A variation of ``Enum`` is provided which is also a subclass of
|
||||
``int``. Members of an ``IntEnum`` can be compared to integers;
|
||||
by extension, integer enumerations of different types can also be compared
|
||||
to each other::
|
||||
|
||||
>>> from enum import IntEnum
|
||||
>>> class Shape(IntEnum):
|
||||
... circle = 1
|
||||
... square = 2
|
||||
...
|
||||
>>> class Request(IntEnum):
|
||||
... post = 1
|
||||
... get = 2
|
||||
...
|
||||
>>> Shape == 1
|
||||
False
|
||||
>>> Shape.circle == 1
|
||||
True
|
||||
>>> Shape.circle == Request.post
|
||||
True
|
||||
|
||||
However, they still can't be compared to standard ``Enum`` enumerations::
|
||||
|
||||
>>> class Shape(IntEnum):
|
||||
... circle = 1
|
||||
... square = 2
|
||||
...
|
||||
>>> class Color(Enum):
|
||||
... red = 1
|
||||
... green = 2
|
||||
...
|
||||
>>> Shape.circle == Color.red
|
||||
False
|
||||
|
||||
``IntEnum`` values behave like integers in other ways you'd expect::
|
||||
|
||||
>>> int(Shape.circle)
|
||||
1
|
||||
>>> ['a', 'b', 'c'][Shape.circle]
|
||||
'b'
|
||||
>>> [i for i in range(Shape.square)]
|
||||
[0, 1]
|
||||
|
||||
For the vast majority of code, ``Enum`` is strongly recommended,
|
||||
since ``IntEnum`` breaks some semantic promises of an enumeration (by
|
||||
being comparable to integers, and thus by transitivity to other
|
||||
unrelated enumerations). It should be used only in special cases where
|
||||
there's no other choice; for example, when integer constants are
|
||||
replaced with enumerations and backwards compatibility is required with code
|
||||
that still expects integers.
|
||||
|
||||
|
||||
Others
|
||||
^^^^^^
|
||||
|
||||
While ``IntEnum`` is part of the ``enum`` module, it would be very
|
||||
simple to implement independently::
|
||||
|
||||
class IntEnum(int, Enum):
|
||||
pass
|
||||
|
||||
This demonstrates how similar derived enumerations can be defined; for example
|
||||
a ``StrEnum`` that mixes in ``str`` instead of ``int``.
|
||||
|
||||
Some rules:
|
||||
|
||||
1. When subclassing ``Enum``, mix-in types must appear before
|
||||
``Enum`` itself in the sequence of bases, as in the ``IntEnum``
|
||||
example above.
|
||||
2. While ``Enum`` can have members of any type, once you mix in an
|
||||
additional type, all the members must have values of that type, e.g.
|
||||
``int`` above. This restriction does not apply to mix-ins which only
|
||||
add methods and don't specify another data type such as ``int`` or
|
||||
``str``.
|
||||
3. When another data type is mixed in, the ``value`` attribute is *not the
|
||||
same* as the enum member itself, although it is equivalant and will compare
|
||||
equal.
|
||||
4. %-style formatting: ``%s`` and ``%r`` call ``Enum``'s ``__str__`` and
|
||||
``__repr__`` respectively; other codes (such as ``%i`` or ``%h`` for
|
||||
IntEnum) treat the enum member as its mixed-in type.
|
||||
|
||||
Note: Prior to Python 3.4 there is a bug in ``str``'s %-formatting: ``int``
|
||||
subclasses are printed as strings and not numbers when the ``%d``, ``%i``,
|
||||
or ``%u`` codes are used.
|
||||
5. ``str.__format__`` (or ``format``) will use the mixed-in
|
||||
type's ``__format__``. If the ``Enum``'s ``str`` or
|
||||
``repr`` is desired use the ``!s`` or ``!r`` ``str`` format codes.
|
||||
|
||||
|
||||
Decorators
|
||||
----------
|
||||
|
||||
unique
|
||||
^^^^^^
|
||||
|
||||
A ``class`` decorator specifically for enumerations. It searches an
|
||||
enumeration's ``__members__`` gathering any aliases it finds; if any are
|
||||
found ``ValueError`` is raised with the details::
|
||||
|
||||
>>> @unique
|
||||
... class NoDupes(Enum):
|
||||
... first = 'one'
|
||||
... second = 'two'
|
||||
... third = 'two'
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: duplicate names found in <enum 'NoDupes'>: third -> second
|
||||
|
||||
|
||||
Interesting examples
|
||||
--------------------
|
||||
|
||||
While ``Enum`` and ``IntEnum`` are expected to cover the majority of
|
||||
use-cases, they cannot cover them all. Here are recipes for some different
|
||||
types of enumerations that can be used directly, or as examples for creating
|
||||
one's own.
|
||||
|
||||
|
||||
AutoNumber
|
||||
^^^^^^^^^^
|
||||
|
||||
Avoids having to specify the value for each enumeration member::
|
||||
|
||||
>>> class AutoNumber(Enum):
|
||||
... def __new__(cls):
|
||||
... value = len(cls.__members__) + 1
|
||||
... obj = object.__new__(cls)
|
||||
... obj._value_ = value
|
||||
... return obj
|
||||
...
|
||||
>>> class Color(AutoNumber):
|
||||
... __order__ = "red green blue" # only needed in 2.x
|
||||
... red = ()
|
||||
... green = ()
|
||||
... blue = ()
|
||||
...
|
||||
>>> Color.green.value == 2
|
||||
True
|
||||
|
||||
Note:
|
||||
|
||||
The `__new__` method, if defined, is used during creation of the Enum
|
||||
members; it is then replaced by Enum's `__new__` which is used after
|
||||
class creation for lookup of existing members. Due to the way Enums are
|
||||
supposed to behave, there is no way to customize Enum's `__new__`.
|
||||
|
||||
|
||||
UniqueEnum
|
||||
^^^^^^^^^^
|
||||
|
||||
Raises an error if a duplicate member name is found instead of creating an
|
||||
alias::
|
||||
|
||||
>>> class UniqueEnum(Enum):
|
||||
... def __init__(self, *args):
|
||||
... cls = self.__class__
|
||||
... if any(self.value == e.value for e in cls):
|
||||
... a = self.name
|
||||
... e = cls(self.value).name
|
||||
... raise ValueError(
|
||||
... "aliases not allowed in UniqueEnum: %r --> %r"
|
||||
... % (a, e))
|
||||
...
|
||||
>>> class Color(UniqueEnum):
|
||||
... red = 1
|
||||
... green = 2
|
||||
... blue = 3
|
||||
... grene = 2
|
||||
Traceback (most recent call last):
|
||||
...
|
||||
ValueError: aliases not allowed in UniqueEnum: 'grene' --> 'green'
|
||||
|
||||
|
||||
OrderedEnum
|
||||
^^^^^^^^^^^
|
||||
|
||||
An ordered enumeration that is not based on ``IntEnum`` and so maintains
|
||||
the normal ``Enum`` invariants (such as not being comparable to other
|
||||
enumerations)::
|
||||
|
||||
>>> class OrderedEnum(Enum):
|
||||
... def __ge__(self, other):
|
||||
... if self.__class__ is other.__class__:
|
||||
... return self._value_ >= other._value_
|
||||
... return NotImplemented
|
||||
... def __gt__(self, other):
|
||||
... if self.__class__ is other.__class__:
|
||||
... return self._value_ > other._value_
|
||||
... return NotImplemented
|
||||
... def __le__(self, other):
|
||||
... if self.__class__ is other.__class__:
|
||||
... return self._value_ <= other._value_
|
||||
... return NotImplemented
|
||||
... def __lt__(self, other):
|
||||
... if self.__class__ is other.__class__:
|
||||
... return self._value_ < other._value_
|
||||
... return NotImplemented
|
||||
...
|
||||
>>> class Grade(OrderedEnum):
|
||||
... __ordered__ = 'A B C D F'
|
||||
... A = 5
|
||||
... B = 4
|
||||
... C = 3
|
||||
... D = 2
|
||||
... F = 1
|
||||
...
|
||||
>>> Grade.C < Grade.A
|
||||
True
|
||||
|
||||
|
||||
Planet
|
||||
^^^^^^
|
||||
|
||||
If ``__new__`` or ``__init__`` is defined the value of the enum member
|
||||
will be passed to those methods::
|
||||
|
||||
>>> class Planet(Enum):
|
||||
... MERCURY = (3.303e+23, 2.4397e6)
|
||||
... VENUS = (4.869e+24, 6.0518e6)
|
||||
... EARTH = (5.976e+24, 6.37814e6)
|
||||
... MARS = (6.421e+23, 3.3972e6)
|
||||
... JUPITER = (1.9e+27, 7.1492e7)
|
||||
... SATURN = (5.688e+26, 6.0268e7)
|
||||
... URANUS = (8.686e+25, 2.5559e7)
|
||||
... NEPTUNE = (1.024e+26, 2.4746e7)
|
||||
... def __init__(self, mass, radius):
|
||||
... self.mass = mass # in kilograms
|
||||
... self.radius = radius # in meters
|
||||
... @property
|
||||
... def surface_gravity(self):
|
||||
... # universal gravitational constant (m3 kg-1 s-2)
|
||||
... G = 6.67300E-11
|
||||
... return G * self.mass / (self.radius * self.radius)
|
||||
...
|
||||
>>> Planet.EARTH.value
|
||||
(5.976e+24, 6378140.0)
|
||||
>>> Planet.EARTH.surface_gravity
|
||||
9.802652743337129
|
||||
|
||||
|
||||
How are Enums different?
|
||||
------------------------
|
||||
|
||||
Enums have a custom metaclass that affects many aspects of both derived Enum
|
||||
classes and their instances (members).
|
||||
|
||||
|
||||
Enum Classes
|
||||
^^^^^^^^^^^^
|
||||
|
||||
The ``EnumMeta`` metaclass is responsible for providing the
|
||||
``__contains__``, ``__dir__``, ``__iter__`` and other methods that
|
||||
allow one to do things with an ``Enum`` class that fail on a typical
|
||||
class, such as ``list(Color)`` or ``some_var in Color``. ``EnumMeta`` is
|
||||
responsible for ensuring that various other methods on the final ``Enum``
|
||||
class are correct (such as ``__new__``, ``__getnewargs__``,
|
||||
``__str__`` and ``__repr__``).
|
||||
|
||||
.. note::
|
||||
|
||||
``__dir__`` is not changed in the Python 2 line as it messes up some
|
||||
of the decorators included in the stdlib.
|
||||
|
||||
|
||||
Enum Members (aka instances)
|
||||
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
The most interesting thing about Enum members is that they are singletons.
|
||||
``EnumMeta`` creates them all while it is creating the ``Enum``
|
||||
class itself, and then puts a custom ``__new__`` in place to ensure
|
||||
that no new ones are ever instantiated by returning only the existing
|
||||
member instances.
|
||||
|
||||
|
||||
Finer Points
|
||||
^^^^^^^^^^^^
|
||||
|
||||
``Enum`` members are instances of an ``Enum`` class, and even though they
|
||||
are accessible as `EnumClass.member1.member2`, they should not be
|
||||
accessed directly from the member as that lookup may fail or, worse,
|
||||
return something besides the ``Enum`` member you were looking for
|
||||
(changed in version 1.1.1)::
|
||||
|
||||
>>> class FieldTypes(Enum):
|
||||
... name = 1
|
||||
... value = 2
|
||||
... size = 3
|
||||
...
|
||||
>>> FieldTypes.value.size
|
||||
<FieldTypes.size: 3>
|
||||
>>> FieldTypes.size.value
|
||||
3
|
||||
|
||||
The ``__members__`` attribute is only available on the class.
|
||||
|
||||
In Python 3.x ``__members__`` is always an ``OrderedDict``, with the order being
|
||||
the definition order. In Python 2.7 ``__members__`` is an ``OrderedDict`` if
|
||||
``__order__`` was specified, and a plain ``dict`` otherwise. In all other Python
|
||||
2.x versions ``__members__`` is a plain ``dict`` even if ``__order__`` was specified
|
||||
as the ``OrderedDict`` type didn't exist yet.
|
||||
|
||||
If you give your ``Enum`` subclass extra methods, like the `Planet`_
|
||||
class above, those methods will show up in a `dir` of the member,
|
||||
but not of the class::
|
||||
|
||||
>>> dir(Planet)
|
||||
['EARTH', 'JUPITER', 'MARS', 'MERCURY', 'NEPTUNE', 'SATURN', 'URANUS',
|
||||
'VENUS', '__class__', '__doc__', '__members__', '__module__']
|
||||
>>> dir(Planet.EARTH)
|
||||
['__class__', '__doc__', '__module__', 'name', 'surface_gravity', 'value']
|
||||
|
||||
A ``__new__`` method will only be used for the creation of the
|
||||
``Enum`` members -- after that it is replaced. This means if you wish to
|
||||
change how ``Enum`` members are looked up you either have to write a
|
||||
helper function or a ``classmethod``.
|
||||
1820
libs2/enum/test.py
1820
libs2/enum/test.py
File diff suppressed because it is too large
Load Diff
@@ -1,406 +0,0 @@
|
||||
|
||||
from error import *
|
||||
|
||||
from tokens import *
|
||||
from events import *
|
||||
from nodes import *
|
||||
|
||||
from loader import *
|
||||
from dumper import *
|
||||
|
||||
__version__ = '5.1'
|
||||
|
||||
try:
|
||||
from cyaml import *
|
||||
__with_libyaml__ = True
|
||||
except ImportError:
|
||||
__with_libyaml__ = False
|
||||
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
# Warnings control
|
||||
#------------------------------------------------------------------------------
|
||||
|
||||
# 'Global' warnings state:
|
||||
_warnings_enabled = {
|
||||
'YAMLLoadWarning': True,
|
||||
}
|
||||
|
||||
# Get or set global warnings' state
|
||||
def warnings(settings=None):
|
||||
if settings is None:
|
||||
return _warnings_enabled
|
||||
|
||||
if type(settings) is dict:
|
||||
for key in settings:
|
||||
if key in _warnings_enabled:
|
||||
_warnings_enabled[key] = settings[key]
|
||||
|
||||
# Warn when load() is called without Loader=...
|
||||
class YAMLLoadWarning(RuntimeWarning):
|
||||
pass
|
||||
|
||||
def load_warning(method):
|
||||
if _warnings_enabled['YAMLLoadWarning'] is False:
|
||||
return
|
||||
|
||||
import warnings
|
||||
|
||||
message = (
|
||||
"calling yaml.%s() without Loader=... is deprecated, as the "
|
||||
"default Loader is unsafe. Please read "
|
||||
"https://msg.pyyaml.org/load for full details."
|
||||
) % method
|
||||
|
||||
warnings.warn(message, YAMLLoadWarning, stacklevel=3)
|
||||
|
||||
#------------------------------------------------------------------------------
|
||||
def scan(stream, Loader=Loader):
|
||||
"""
|
||||
Scan a YAML stream and produce scanning tokens.
|
||||
"""
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
while loader.check_token():
|
||||
yield loader.get_token()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def parse(stream, Loader=Loader):
|
||||
"""
|
||||
Parse a YAML stream and produce parsing events.
|
||||
"""
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
while loader.check_event():
|
||||
yield loader.get_event()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def compose(stream, Loader=Loader):
|
||||
"""
|
||||
Parse the first YAML document in a stream
|
||||
and produce the corresponding representation tree.
|
||||
"""
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
return loader.get_single_node()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def compose_all(stream, Loader=Loader):
|
||||
"""
|
||||
Parse all YAML documents in a stream
|
||||
and produce corresponding representation trees.
|
||||
"""
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
while loader.check_node():
|
||||
yield loader.get_node()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def load(stream, Loader=None):
|
||||
"""
|
||||
Parse the first YAML document in a stream
|
||||
and produce the corresponding Python object.
|
||||
"""
|
||||
if Loader is None:
|
||||
load_warning('load')
|
||||
Loader = FullLoader
|
||||
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
return loader.get_single_data()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def load_all(stream, Loader=None):
|
||||
"""
|
||||
Parse all YAML documents in a stream
|
||||
and produce corresponding Python objects.
|
||||
"""
|
||||
if Loader is None:
|
||||
load_warning('load_all')
|
||||
Loader = FullLoader
|
||||
|
||||
loader = Loader(stream)
|
||||
try:
|
||||
while loader.check_data():
|
||||
yield loader.get_data()
|
||||
finally:
|
||||
loader.dispose()
|
||||
|
||||
def full_load(stream):
|
||||
"""
|
||||
Parse the first YAML document in a stream
|
||||
and produce the corresponding Python object.
|
||||
|
||||
Resolve all tags except those known to be
|
||||
unsafe on untrusted input.
|
||||
"""
|
||||
return load(stream, FullLoader)
|
||||
|
||||
def full_load_all(stream):
|
||||
"""
|
||||
Parse all YAML documents in a stream
|
||||
and produce corresponding Python objects.
|
||||
|
||||
Resolve all tags except those known to be
|
||||
unsafe on untrusted input.
|
||||
"""
|
||||
return load_all(stream, FullLoader)
|
||||
|
||||
def safe_load(stream):
|
||||
"""
|
||||
Parse the first YAML document in a stream
|
||||
and produce the corresponding Python object.
|
||||
|
||||
Resolve only basic YAML tags. This is known
|
||||
to be safe for untrusted input.
|
||||
"""
|
||||
return load(stream, SafeLoader)
|
||||
|
||||
def safe_load_all(stream):
|
||||
"""
|
||||
Parse all YAML documents in a stream
|
||||
and produce corresponding Python objects.
|
||||
|
||||
Resolve only basic YAML tags. This is known
|
||||
to be safe for untrusted input.
|
||||
"""
|
||||
return load_all(stream, SafeLoader)
|
||||
|
||||
def unsafe_load(stream):
|
||||
"""
|
||||
Parse the first YAML document in a stream
|
||||
and produce the corresponding Python object.
|
||||
|
||||
Resolve all tags, even those known to be
|
||||
unsafe on untrusted input.
|
||||
"""
|
||||
return load(stream, UnsafeLoader)
|
||||
|
||||
def unsafe_load_all(stream):
|
||||
"""
|
||||
Parse all YAML documents in a stream
|
||||
and produce corresponding Python objects.
|
||||
|
||||
Resolve all tags, even those known to be
|
||||
unsafe on untrusted input.
|
||||
"""
|
||||
return load_all(stream, UnsafeLoader)
|
||||
|
||||
def emit(events, stream=None, Dumper=Dumper,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None):
|
||||
"""
|
||||
Emit YAML parsing events into a stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
from StringIO import StringIO
|
||||
stream = StringIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break)
|
||||
try:
|
||||
for event in events:
|
||||
dumper.emit(event)
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def serialize_all(nodes, stream=None, Dumper=Dumper,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding='utf-8', explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None):
|
||||
"""
|
||||
Serialize a sequence of representation trees into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
if encoding is None:
|
||||
from StringIO import StringIO
|
||||
else:
|
||||
from cStringIO import StringIO
|
||||
stream = StringIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
encoding=encoding, version=version, tags=tags,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end)
|
||||
try:
|
||||
dumper.open()
|
||||
for node in nodes:
|
||||
dumper.serialize(node)
|
||||
dumper.close()
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def serialize(node, stream=None, Dumper=Dumper, **kwds):
|
||||
"""
|
||||
Serialize a representation tree into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return serialize_all([node], stream, Dumper=Dumper, **kwds)
|
||||
|
||||
def dump_all(documents, stream=None, Dumper=Dumper,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding='utf-8', explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
"""
|
||||
Serialize a sequence of Python objects into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
getvalue = None
|
||||
if stream is None:
|
||||
if encoding is None:
|
||||
from StringIO import StringIO
|
||||
else:
|
||||
from cStringIO import StringIO
|
||||
stream = StringIO()
|
||||
getvalue = stream.getvalue
|
||||
dumper = Dumper(stream, default_style=default_style,
|
||||
default_flow_style=default_flow_style,
|
||||
canonical=canonical, indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
encoding=encoding, version=version, tags=tags,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end, sort_keys=sort_keys)
|
||||
try:
|
||||
dumper.open()
|
||||
for data in documents:
|
||||
dumper.represent(data)
|
||||
dumper.close()
|
||||
finally:
|
||||
dumper.dispose()
|
||||
if getvalue:
|
||||
return getvalue()
|
||||
|
||||
def dump(data, stream=None, Dumper=Dumper, **kwds):
|
||||
"""
|
||||
Serialize a Python object into a YAML stream.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all([data], stream, Dumper=Dumper, **kwds)
|
||||
|
||||
def safe_dump_all(documents, stream=None, **kwds):
|
||||
"""
|
||||
Serialize a sequence of Python objects into a YAML stream.
|
||||
Produce only basic YAML tags.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all(documents, stream, Dumper=SafeDumper, **kwds)
|
||||
|
||||
def safe_dump(data, stream=None, **kwds):
|
||||
"""
|
||||
Serialize a Python object into a YAML stream.
|
||||
Produce only basic YAML tags.
|
||||
If stream is None, return the produced string instead.
|
||||
"""
|
||||
return dump_all([data], stream, Dumper=SafeDumper, **kwds)
|
||||
|
||||
def add_implicit_resolver(tag, regexp, first=None,
|
||||
Loader=Loader, Dumper=Dumper):
|
||||
"""
|
||||
Add an implicit scalar detector.
|
||||
If an implicit scalar value matches the given regexp,
|
||||
the corresponding tag is assigned to the scalar.
|
||||
first is a sequence of possible initial characters or None.
|
||||
"""
|
||||
Loader.add_implicit_resolver(tag, regexp, first)
|
||||
Dumper.add_implicit_resolver(tag, regexp, first)
|
||||
|
||||
def add_path_resolver(tag, path, kind=None, Loader=Loader, Dumper=Dumper):
|
||||
"""
|
||||
Add a path based resolver for the given tag.
|
||||
A path is a list of keys that forms a path
|
||||
to a node in the representation tree.
|
||||
Keys can be string values, integers, or None.
|
||||
"""
|
||||
Loader.add_path_resolver(tag, path, kind)
|
||||
Dumper.add_path_resolver(tag, path, kind)
|
||||
|
||||
def add_constructor(tag, constructor, Loader=Loader):
|
||||
"""
|
||||
Add a constructor for the given tag.
|
||||
Constructor is a function that accepts a Loader instance
|
||||
and a node object and produces the corresponding Python object.
|
||||
"""
|
||||
Loader.add_constructor(tag, constructor)
|
||||
|
||||
def add_multi_constructor(tag_prefix, multi_constructor, Loader=Loader):
|
||||
"""
|
||||
Add a multi-constructor for the given tag prefix.
|
||||
Multi-constructor is called for a node if its tag starts with tag_prefix.
|
||||
Multi-constructor accepts a Loader instance, a tag suffix,
|
||||
and a node object and produces the corresponding Python object.
|
||||
"""
|
||||
Loader.add_multi_constructor(tag_prefix, multi_constructor)
|
||||
|
||||
def add_representer(data_type, representer, Dumper=Dumper):
|
||||
"""
|
||||
Add a representer for the given type.
|
||||
Representer is a function accepting a Dumper instance
|
||||
and an instance of the given data type
|
||||
and producing the corresponding representation node.
|
||||
"""
|
||||
Dumper.add_representer(data_type, representer)
|
||||
|
||||
def add_multi_representer(data_type, multi_representer, Dumper=Dumper):
|
||||
"""
|
||||
Add a representer for the given type.
|
||||
Multi-representer is a function accepting a Dumper instance
|
||||
and an instance of the given data type or subtype
|
||||
and producing the corresponding representation node.
|
||||
"""
|
||||
Dumper.add_multi_representer(data_type, multi_representer)
|
||||
|
||||
class YAMLObjectMetaclass(type):
|
||||
"""
|
||||
The metaclass for YAMLObject.
|
||||
"""
|
||||
def __init__(cls, name, bases, kwds):
|
||||
super(YAMLObjectMetaclass, cls).__init__(name, bases, kwds)
|
||||
if 'yaml_tag' in kwds and kwds['yaml_tag'] is not None:
|
||||
cls.yaml_loader.add_constructor(cls.yaml_tag, cls.from_yaml)
|
||||
cls.yaml_dumper.add_representer(cls, cls.to_yaml)
|
||||
|
||||
class YAMLObject(object):
|
||||
"""
|
||||
An object that can dump itself to a YAML stream
|
||||
and load itself from a YAML stream.
|
||||
"""
|
||||
|
||||
__metaclass__ = YAMLObjectMetaclass
|
||||
__slots__ = () # no direct instantiation, so allow immutable subclasses
|
||||
|
||||
yaml_loader = Loader
|
||||
yaml_dumper = Dumper
|
||||
|
||||
yaml_tag = None
|
||||
yaml_flow_style = None
|
||||
|
||||
def from_yaml(cls, loader, node):
|
||||
"""
|
||||
Convert a representation node to a Python object.
|
||||
"""
|
||||
return loader.construct_yaml_object(node, cls)
|
||||
from_yaml = classmethod(from_yaml)
|
||||
|
||||
def to_yaml(cls, dumper, data):
|
||||
"""
|
||||
Convert a Python object to a representation node.
|
||||
"""
|
||||
return dumper.represent_yaml_object(cls.yaml_tag, data, cls,
|
||||
flow_style=cls.yaml_flow_style)
|
||||
to_yaml = classmethod(to_yaml)
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
|
||||
__all__ = ['Composer', 'ComposerError']
|
||||
|
||||
from error import MarkedYAMLError
|
||||
from events import *
|
||||
from nodes import *
|
||||
|
||||
class ComposerError(MarkedYAMLError):
|
||||
pass
|
||||
|
||||
class Composer(object):
|
||||
|
||||
def __init__(self):
|
||||
self.anchors = {}
|
||||
|
||||
def check_node(self):
|
||||
# Drop the STREAM-START event.
|
||||
if self.check_event(StreamStartEvent):
|
||||
self.get_event()
|
||||
|
||||
# If there are more documents available?
|
||||
return not self.check_event(StreamEndEvent)
|
||||
|
||||
def get_node(self):
|
||||
# Get the root node of the next document.
|
||||
if not self.check_event(StreamEndEvent):
|
||||
return self.compose_document()
|
||||
|
||||
def get_single_node(self):
|
||||
# Drop the STREAM-START event.
|
||||
self.get_event()
|
||||
|
||||
# Compose a document if the stream is not empty.
|
||||
document = None
|
||||
if not self.check_event(StreamEndEvent):
|
||||
document = self.compose_document()
|
||||
|
||||
# Ensure that the stream contains no more documents.
|
||||
if not self.check_event(StreamEndEvent):
|
||||
event = self.get_event()
|
||||
raise ComposerError("expected a single document in the stream",
|
||||
document.start_mark, "but found another document",
|
||||
event.start_mark)
|
||||
|
||||
# Drop the STREAM-END event.
|
||||
self.get_event()
|
||||
|
||||
return document
|
||||
|
||||
def compose_document(self):
|
||||
# Drop the DOCUMENT-START event.
|
||||
self.get_event()
|
||||
|
||||
# Compose the root node.
|
||||
node = self.compose_node(None, None)
|
||||
|
||||
# Drop the DOCUMENT-END event.
|
||||
self.get_event()
|
||||
|
||||
self.anchors = {}
|
||||
return node
|
||||
|
||||
def compose_node(self, parent, index):
|
||||
if self.check_event(AliasEvent):
|
||||
event = self.get_event()
|
||||
anchor = event.anchor
|
||||
if anchor not in self.anchors:
|
||||
raise ComposerError(None, None, "found undefined alias %r"
|
||||
% anchor.encode('utf-8'), event.start_mark)
|
||||
return self.anchors[anchor]
|
||||
event = self.peek_event()
|
||||
anchor = event.anchor
|
||||
if anchor is not None:
|
||||
if anchor in self.anchors:
|
||||
raise ComposerError("found duplicate anchor %r; first occurrence"
|
||||
% anchor.encode('utf-8'), self.anchors[anchor].start_mark,
|
||||
"second occurrence", event.start_mark)
|
||||
self.descend_resolver(parent, index)
|
||||
if self.check_event(ScalarEvent):
|
||||
node = self.compose_scalar_node(anchor)
|
||||
elif self.check_event(SequenceStartEvent):
|
||||
node = self.compose_sequence_node(anchor)
|
||||
elif self.check_event(MappingStartEvent):
|
||||
node = self.compose_mapping_node(anchor)
|
||||
self.ascend_resolver()
|
||||
return node
|
||||
|
||||
def compose_scalar_node(self, anchor):
|
||||
event = self.get_event()
|
||||
tag = event.tag
|
||||
if tag is None or tag == u'!':
|
||||
tag = self.resolve(ScalarNode, event.value, event.implicit)
|
||||
node = ScalarNode(tag, event.value,
|
||||
event.start_mark, event.end_mark, style=event.style)
|
||||
if anchor is not None:
|
||||
self.anchors[anchor] = node
|
||||
return node
|
||||
|
||||
def compose_sequence_node(self, anchor):
|
||||
start_event = self.get_event()
|
||||
tag = start_event.tag
|
||||
if tag is None or tag == u'!':
|
||||
tag = self.resolve(SequenceNode, None, start_event.implicit)
|
||||
node = SequenceNode(tag, [],
|
||||
start_event.start_mark, None,
|
||||
flow_style=start_event.flow_style)
|
||||
if anchor is not None:
|
||||
self.anchors[anchor] = node
|
||||
index = 0
|
||||
while not self.check_event(SequenceEndEvent):
|
||||
node.value.append(self.compose_node(node, index))
|
||||
index += 1
|
||||
end_event = self.get_event()
|
||||
node.end_mark = end_event.end_mark
|
||||
return node
|
||||
|
||||
def compose_mapping_node(self, anchor):
|
||||
start_event = self.get_event()
|
||||
tag = start_event.tag
|
||||
if tag is None or tag == u'!':
|
||||
tag = self.resolve(MappingNode, None, start_event.implicit)
|
||||
node = MappingNode(tag, [],
|
||||
start_event.start_mark, None,
|
||||
flow_style=start_event.flow_style)
|
||||
if anchor is not None:
|
||||
self.anchors[anchor] = node
|
||||
while not self.check_event(MappingEndEvent):
|
||||
#key_event = self.peek_event()
|
||||
item_key = self.compose_node(node, None)
|
||||
#if item_key in node.value:
|
||||
# raise ComposerError("while composing a mapping", start_event.start_mark,
|
||||
# "found duplicate key", key_event.start_mark)
|
||||
item_value = self.compose_node(node, item_key)
|
||||
#node.value[item_key] = item_value
|
||||
node.value.append((item_key, item_value))
|
||||
end_event = self.get_event()
|
||||
node.end_mark = end_event.end_mark
|
||||
return node
|
||||
|
||||
@@ -1,709 +0,0 @@
|
||||
|
||||
__all__ = [
|
||||
'BaseConstructor',
|
||||
'SafeConstructor',
|
||||
'FullConstructor',
|
||||
'UnsafeConstructor',
|
||||
'Constructor',
|
||||
'ConstructorError'
|
||||
]
|
||||
|
||||
from error import *
|
||||
from nodes import *
|
||||
|
||||
import datetime
|
||||
|
||||
import binascii, re, sys, types
|
||||
|
||||
class ConstructorError(MarkedYAMLError):
|
||||
pass
|
||||
|
||||
class BaseConstructor(object):
|
||||
|
||||
yaml_constructors = {}
|
||||
yaml_multi_constructors = {}
|
||||
|
||||
def __init__(self):
|
||||
self.constructed_objects = {}
|
||||
self.recursive_objects = {}
|
||||
self.state_generators = []
|
||||
self.deep_construct = False
|
||||
|
||||
def check_data(self):
|
||||
# If there are more documents available?
|
||||
return self.check_node()
|
||||
|
||||
def get_data(self):
|
||||
# Construct and return the next document.
|
||||
if self.check_node():
|
||||
return self.construct_document(self.get_node())
|
||||
|
||||
def get_single_data(self):
|
||||
# Ensure that the stream contains a single document and construct it.
|
||||
node = self.get_single_node()
|
||||
if node is not None:
|
||||
return self.construct_document(node)
|
||||
return None
|
||||
|
||||
def construct_document(self, node):
|
||||
data = self.construct_object(node)
|
||||
while self.state_generators:
|
||||
state_generators = self.state_generators
|
||||
self.state_generators = []
|
||||
for generator in state_generators:
|
||||
for dummy in generator:
|
||||
pass
|
||||
self.constructed_objects = {}
|
||||
self.recursive_objects = {}
|
||||
self.deep_construct = False
|
||||
return data
|
||||
|
||||
def construct_object(self, node, deep=False):
|
||||
if node in self.constructed_objects:
|
||||
return self.constructed_objects[node]
|
||||
if deep:
|
||||
old_deep = self.deep_construct
|
||||
self.deep_construct = True
|
||||
if node in self.recursive_objects:
|
||||
raise ConstructorError(None, None,
|
||||
"found unconstructable recursive node", node.start_mark)
|
||||
self.recursive_objects[node] = None
|
||||
constructor = None
|
||||
tag_suffix = None
|
||||
if node.tag in self.yaml_constructors:
|
||||
constructor = self.yaml_constructors[node.tag]
|
||||
else:
|
||||
for tag_prefix in self.yaml_multi_constructors:
|
||||
if node.tag.startswith(tag_prefix):
|
||||
tag_suffix = node.tag[len(tag_prefix):]
|
||||
constructor = self.yaml_multi_constructors[tag_prefix]
|
||||
break
|
||||
else:
|
||||
if None in self.yaml_multi_constructors:
|
||||
tag_suffix = node.tag
|
||||
constructor = self.yaml_multi_constructors[None]
|
||||
elif None in self.yaml_constructors:
|
||||
constructor = self.yaml_constructors[None]
|
||||
elif isinstance(node, ScalarNode):
|
||||
constructor = self.__class__.construct_scalar
|
||||
elif isinstance(node, SequenceNode):
|
||||
constructor = self.__class__.construct_sequence
|
||||
elif isinstance(node, MappingNode):
|
||||
constructor = self.__class__.construct_mapping
|
||||
if tag_suffix is None:
|
||||
data = constructor(self, node)
|
||||
else:
|
||||
data = constructor(self, tag_suffix, node)
|
||||
if isinstance(data, types.GeneratorType):
|
||||
generator = data
|
||||
data = generator.next()
|
||||
if self.deep_construct:
|
||||
for dummy in generator:
|
||||
pass
|
||||
else:
|
||||
self.state_generators.append(generator)
|
||||
self.constructed_objects[node] = data
|
||||
del self.recursive_objects[node]
|
||||
if deep:
|
||||
self.deep_construct = old_deep
|
||||
return data
|
||||
|
||||
def construct_scalar(self, node):
|
||||
if not isinstance(node, ScalarNode):
|
||||
raise ConstructorError(None, None,
|
||||
"expected a scalar node, but found %s" % node.id,
|
||||
node.start_mark)
|
||||
return node.value
|
||||
|
||||
def construct_sequence(self, node, deep=False):
|
||||
if not isinstance(node, SequenceNode):
|
||||
raise ConstructorError(None, None,
|
||||
"expected a sequence node, but found %s" % node.id,
|
||||
node.start_mark)
|
||||
return [self.construct_object(child, deep=deep)
|
||||
for child in node.value]
|
||||
|
||||
def construct_mapping(self, node, deep=False):
|
||||
if not isinstance(node, MappingNode):
|
||||
raise ConstructorError(None, None,
|
||||
"expected a mapping node, but found %s" % node.id,
|
||||
node.start_mark)
|
||||
mapping = {}
|
||||
for key_node, value_node in node.value:
|
||||
key = self.construct_object(key_node, deep=deep)
|
||||
try:
|
||||
hash(key)
|
||||
except TypeError, exc:
|
||||
raise ConstructorError("while constructing a mapping", node.start_mark,
|
||||
"found unacceptable key (%s)" % exc, key_node.start_mark)
|
||||
value = self.construct_object(value_node, deep=deep)
|
||||
mapping[key] = value
|
||||
return mapping
|
||||
|
||||
def construct_pairs(self, node, deep=False):
|
||||
if not isinstance(node, MappingNode):
|
||||
raise ConstructorError(None, None,
|
||||
"expected a mapping node, but found %s" % node.id,
|
||||
node.start_mark)
|
||||
pairs = []
|
||||
for key_node, value_node in node.value:
|
||||
key = self.construct_object(key_node, deep=deep)
|
||||
value = self.construct_object(value_node, deep=deep)
|
||||
pairs.append((key, value))
|
||||
return pairs
|
||||
|
||||
def add_constructor(cls, tag, constructor):
|
||||
if not 'yaml_constructors' in cls.__dict__:
|
||||
cls.yaml_constructors = cls.yaml_constructors.copy()
|
||||
cls.yaml_constructors[tag] = constructor
|
||||
add_constructor = classmethod(add_constructor)
|
||||
|
||||
def add_multi_constructor(cls, tag_prefix, multi_constructor):
|
||||
if not 'yaml_multi_constructors' in cls.__dict__:
|
||||
cls.yaml_multi_constructors = cls.yaml_multi_constructors.copy()
|
||||
cls.yaml_multi_constructors[tag_prefix] = multi_constructor
|
||||
add_multi_constructor = classmethod(add_multi_constructor)
|
||||
|
||||
class SafeConstructor(BaseConstructor):
|
||||
|
||||
def construct_scalar(self, node):
|
||||
if isinstance(node, MappingNode):
|
||||
for key_node, value_node in node.value:
|
||||
if key_node.tag == u'tag:yaml.org,2002:value':
|
||||
return self.construct_scalar(value_node)
|
||||
return BaseConstructor.construct_scalar(self, node)
|
||||
|
||||
def flatten_mapping(self, node):
|
||||
merge = []
|
||||
index = 0
|
||||
while index < len(node.value):
|
||||
key_node, value_node = node.value[index]
|
||||
if key_node.tag == u'tag:yaml.org,2002:merge':
|
||||
del node.value[index]
|
||||
if isinstance(value_node, MappingNode):
|
||||
self.flatten_mapping(value_node)
|
||||
merge.extend(value_node.value)
|
||||
elif isinstance(value_node, SequenceNode):
|
||||
submerge = []
|
||||
for subnode in value_node.value:
|
||||
if not isinstance(subnode, MappingNode):
|
||||
raise ConstructorError("while constructing a mapping",
|
||||
node.start_mark,
|
||||
"expected a mapping for merging, but found %s"
|
||||
% subnode.id, subnode.start_mark)
|
||||
self.flatten_mapping(subnode)
|
||||
submerge.append(subnode.value)
|
||||
submerge.reverse()
|
||||
for value in submerge:
|
||||
merge.extend(value)
|
||||
else:
|
||||
raise ConstructorError("while constructing a mapping", node.start_mark,
|
||||
"expected a mapping or list of mappings for merging, but found %s"
|
||||
% value_node.id, value_node.start_mark)
|
||||
elif key_node.tag == u'tag:yaml.org,2002:value':
|
||||
key_node.tag = u'tag:yaml.org,2002:str'
|
||||
index += 1
|
||||
else:
|
||||
index += 1
|
||||
if merge:
|
||||
node.value = merge + node.value
|
||||
|
||||
def construct_mapping(self, node, deep=False):
|
||||
if isinstance(node, MappingNode):
|
||||
self.flatten_mapping(node)
|
||||
return BaseConstructor.construct_mapping(self, node, deep=deep)
|
||||
|
||||
def construct_yaml_null(self, node):
|
||||
self.construct_scalar(node)
|
||||
return None
|
||||
|
||||
bool_values = {
|
||||
u'yes': True,
|
||||
u'no': False,
|
||||
u'true': True,
|
||||
u'false': False,
|
||||
u'on': True,
|
||||
u'off': False,
|
||||
}
|
||||
|
||||
def construct_yaml_bool(self, node):
|
||||
value = self.construct_scalar(node)
|
||||
return self.bool_values[value.lower()]
|
||||
|
||||
def construct_yaml_int(self, node):
|
||||
value = str(self.construct_scalar(node))
|
||||
value = value.replace('_', '')
|
||||
sign = +1
|
||||
if value[0] == '-':
|
||||
sign = -1
|
||||
if value[0] in '+-':
|
||||
value = value[1:]
|
||||
if value == '0':
|
||||
return 0
|
||||
elif value.startswith('0b'):
|
||||
return sign*int(value[2:], 2)
|
||||
elif value.startswith('0x'):
|
||||
return sign*int(value[2:], 16)
|
||||
elif value[0] == '0':
|
||||
return sign*int(value, 8)
|
||||
elif ':' in value:
|
||||
digits = [int(part) for part in value.split(':')]
|
||||
digits.reverse()
|
||||
base = 1
|
||||
value = 0
|
||||
for digit in digits:
|
||||
value += digit*base
|
||||
base *= 60
|
||||
return sign*value
|
||||
else:
|
||||
return sign*int(value)
|
||||
|
||||
inf_value = 1e300
|
||||
while inf_value != inf_value*inf_value:
|
||||
inf_value *= inf_value
|
||||
nan_value = -inf_value/inf_value # Trying to make a quiet NaN (like C99).
|
||||
|
||||
def construct_yaml_float(self, node):
|
||||
value = str(self.construct_scalar(node))
|
||||
value = value.replace('_', '').lower()
|
||||
sign = +1
|
||||
if value[0] == '-':
|
||||
sign = -1
|
||||
if value[0] in '+-':
|
||||
value = value[1:]
|
||||
if value == '.inf':
|
||||
return sign*self.inf_value
|
||||
elif value == '.nan':
|
||||
return self.nan_value
|
||||
elif ':' in value:
|
||||
digits = [float(part) for part in value.split(':')]
|
||||
digits.reverse()
|
||||
base = 1
|
||||
value = 0.0
|
||||
for digit in digits:
|
||||
value += digit*base
|
||||
base *= 60
|
||||
return sign*value
|
||||
else:
|
||||
return sign*float(value)
|
||||
|
||||
def construct_yaml_binary(self, node):
|
||||
value = self.construct_scalar(node)
|
||||
try:
|
||||
return str(value).decode('base64')
|
||||
except (binascii.Error, UnicodeEncodeError), exc:
|
||||
raise ConstructorError(None, None,
|
||||
"failed to decode base64 data: %s" % exc, node.start_mark)
|
||||
|
||||
timestamp_regexp = re.compile(
|
||||
ur'''^(?P<year>[0-9][0-9][0-9][0-9])
|
||||
-(?P<month>[0-9][0-9]?)
|
||||
-(?P<day>[0-9][0-9]?)
|
||||
(?:(?:[Tt]|[ \t]+)
|
||||
(?P<hour>[0-9][0-9]?)
|
||||
:(?P<minute>[0-9][0-9])
|
||||
:(?P<second>[0-9][0-9])
|
||||
(?:\.(?P<fraction>[0-9]*))?
|
||||
(?:[ \t]*(?P<tz>Z|(?P<tz_sign>[-+])(?P<tz_hour>[0-9][0-9]?)
|
||||
(?::(?P<tz_minute>[0-9][0-9]))?))?)?$''', re.X)
|
||||
|
||||
def construct_yaml_timestamp(self, node):
|
||||
value = self.construct_scalar(node)
|
||||
match = self.timestamp_regexp.match(node.value)
|
||||
values = match.groupdict()
|
||||
year = int(values['year'])
|
||||
month = int(values['month'])
|
||||
day = int(values['day'])
|
||||
if not values['hour']:
|
||||
return datetime.date(year, month, day)
|
||||
hour = int(values['hour'])
|
||||
minute = int(values['minute'])
|
||||
second = int(values['second'])
|
||||
fraction = 0
|
||||
if values['fraction']:
|
||||
fraction = values['fraction'][:6]
|
||||
while len(fraction) < 6:
|
||||
fraction += '0'
|
||||
fraction = int(fraction)
|
||||
delta = None
|
||||
if values['tz_sign']:
|
||||
tz_hour = int(values['tz_hour'])
|
||||
tz_minute = int(values['tz_minute'] or 0)
|
||||
delta = datetime.timedelta(hours=tz_hour, minutes=tz_minute)
|
||||
if values['tz_sign'] == '-':
|
||||
delta = -delta
|
||||
data = datetime.datetime(year, month, day, hour, minute, second, fraction)
|
||||
if delta:
|
||||
data -= delta
|
||||
return data
|
||||
|
||||
def construct_yaml_omap(self, node):
|
||||
# Note: we do not check for duplicate keys, because it's too
|
||||
# CPU-expensive.
|
||||
omap = []
|
||||
yield omap
|
||||
if not isinstance(node, SequenceNode):
|
||||
raise ConstructorError("while constructing an ordered map", node.start_mark,
|
||||
"expected a sequence, but found %s" % node.id, node.start_mark)
|
||||
for subnode in node.value:
|
||||
if not isinstance(subnode, MappingNode):
|
||||
raise ConstructorError("while constructing an ordered map", node.start_mark,
|
||||
"expected a mapping of length 1, but found %s" % subnode.id,
|
||||
subnode.start_mark)
|
||||
if len(subnode.value) != 1:
|
||||
raise ConstructorError("while constructing an ordered map", node.start_mark,
|
||||
"expected a single mapping item, but found %d items" % len(subnode.value),
|
||||
subnode.start_mark)
|
||||
key_node, value_node = subnode.value[0]
|
||||
key = self.construct_object(key_node)
|
||||
value = self.construct_object(value_node)
|
||||
omap.append((key, value))
|
||||
|
||||
def construct_yaml_pairs(self, node):
|
||||
# Note: the same code as `construct_yaml_omap`.
|
||||
pairs = []
|
||||
yield pairs
|
||||
if not isinstance(node, SequenceNode):
|
||||
raise ConstructorError("while constructing pairs", node.start_mark,
|
||||
"expected a sequence, but found %s" % node.id, node.start_mark)
|
||||
for subnode in node.value:
|
||||
if not isinstance(subnode, MappingNode):
|
||||
raise ConstructorError("while constructing pairs", node.start_mark,
|
||||
"expected a mapping of length 1, but found %s" % subnode.id,
|
||||
subnode.start_mark)
|
||||
if len(subnode.value) != 1:
|
||||
raise ConstructorError("while constructing pairs", node.start_mark,
|
||||
"expected a single mapping item, but found %d items" % len(subnode.value),
|
||||
subnode.start_mark)
|
||||
key_node, value_node = subnode.value[0]
|
||||
key = self.construct_object(key_node)
|
||||
value = self.construct_object(value_node)
|
||||
pairs.append((key, value))
|
||||
|
||||
def construct_yaml_set(self, node):
|
||||
data = set()
|
||||
yield data
|
||||
value = self.construct_mapping(node)
|
||||
data.update(value)
|
||||
|
||||
def construct_yaml_str(self, node):
|
||||
value = self.construct_scalar(node)
|
||||
try:
|
||||
return value.encode('ascii')
|
||||
except UnicodeEncodeError:
|
||||
return value
|
||||
|
||||
def construct_yaml_seq(self, node):
|
||||
data = []
|
||||
yield data
|
||||
data.extend(self.construct_sequence(node))
|
||||
|
||||
def construct_yaml_map(self, node):
|
||||
data = {}
|
||||
yield data
|
||||
value = self.construct_mapping(node)
|
||||
data.update(value)
|
||||
|
||||
def construct_yaml_object(self, node, cls):
|
||||
data = cls.__new__(cls)
|
||||
yield data
|
||||
if hasattr(data, '__setstate__'):
|
||||
state = self.construct_mapping(node, deep=True)
|
||||
data.__setstate__(state)
|
||||
else:
|
||||
state = self.construct_mapping(node)
|
||||
data.__dict__.update(state)
|
||||
|
||||
def construct_undefined(self, node):
|
||||
raise ConstructorError(None, None,
|
||||
"could not determine a constructor for the tag %r" % node.tag.encode('utf-8'),
|
||||
node.start_mark)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:null',
|
||||
SafeConstructor.construct_yaml_null)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:bool',
|
||||
SafeConstructor.construct_yaml_bool)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:int',
|
||||
SafeConstructor.construct_yaml_int)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:float',
|
||||
SafeConstructor.construct_yaml_float)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:binary',
|
||||
SafeConstructor.construct_yaml_binary)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:timestamp',
|
||||
SafeConstructor.construct_yaml_timestamp)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:omap',
|
||||
SafeConstructor.construct_yaml_omap)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:pairs',
|
||||
SafeConstructor.construct_yaml_pairs)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:set',
|
||||
SafeConstructor.construct_yaml_set)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:str',
|
||||
SafeConstructor.construct_yaml_str)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:seq',
|
||||
SafeConstructor.construct_yaml_seq)
|
||||
|
||||
SafeConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:map',
|
||||
SafeConstructor.construct_yaml_map)
|
||||
|
||||
SafeConstructor.add_constructor(None,
|
||||
SafeConstructor.construct_undefined)
|
||||
|
||||
class FullConstructor(SafeConstructor):
|
||||
|
||||
def construct_python_str(self, node):
|
||||
return self.construct_scalar(node).encode('utf-8')
|
||||
|
||||
def construct_python_unicode(self, node):
|
||||
return self.construct_scalar(node)
|
||||
|
||||
def construct_python_long(self, node):
|
||||
return long(self.construct_yaml_int(node))
|
||||
|
||||
def construct_python_complex(self, node):
|
||||
return complex(self.construct_scalar(node))
|
||||
|
||||
def construct_python_tuple(self, node):
|
||||
return tuple(self.construct_sequence(node))
|
||||
|
||||
def find_python_module(self, name, mark, unsafe=False):
|
||||
if not name:
|
||||
raise ConstructorError("while constructing a Python module", mark,
|
||||
"expected non-empty name appended to the tag", mark)
|
||||
if unsafe:
|
||||
try:
|
||||
__import__(name)
|
||||
except ImportError, exc:
|
||||
raise ConstructorError("while constructing a Python module", mark,
|
||||
"cannot find module %r (%s)" % (name.encode('utf-8'), exc), mark)
|
||||
if not name in sys.modules:
|
||||
raise ConstructorError("while constructing a Python module", mark,
|
||||
"module %r is not imported" % name.encode('utf-8'), mark)
|
||||
return sys.modules[name]
|
||||
|
||||
def find_python_name(self, name, mark, unsafe=False):
|
||||
if not name:
|
||||
raise ConstructorError("while constructing a Python object", mark,
|
||||
"expected non-empty name appended to the tag", mark)
|
||||
if u'.' in name:
|
||||
module_name, object_name = name.rsplit('.', 1)
|
||||
else:
|
||||
module_name = '__builtin__'
|
||||
object_name = name
|
||||
if unsafe:
|
||||
try:
|
||||
__import__(module_name)
|
||||
except ImportError, exc:
|
||||
raise ConstructorError("while constructing a Python object", mark,
|
||||
"cannot find module %r (%s)" % (module_name.encode('utf-8'), exc), mark)
|
||||
if not module_name in sys.modules:
|
||||
raise ConstructorError("while constructing a Python object", mark,
|
||||
"module %r is not imported" % module_name.encode('utf-8'), mark)
|
||||
module = sys.modules[module_name]
|
||||
if not hasattr(module, object_name):
|
||||
raise ConstructorError("while constructing a Python object", mark,
|
||||
"cannot find %r in the module %r" % (object_name.encode('utf-8'),
|
||||
module.__name__), mark)
|
||||
return getattr(module, object_name)
|
||||
|
||||
def construct_python_name(self, suffix, node):
|
||||
value = self.construct_scalar(node)
|
||||
if value:
|
||||
raise ConstructorError("while constructing a Python name", node.start_mark,
|
||||
"expected the empty value, but found %r" % value.encode('utf-8'),
|
||||
node.start_mark)
|
||||
return self.find_python_name(suffix, node.start_mark)
|
||||
|
||||
def construct_python_module(self, suffix, node):
|
||||
value = self.construct_scalar(node)
|
||||
if value:
|
||||
raise ConstructorError("while constructing a Python module", node.start_mark,
|
||||
"expected the empty value, but found %r" % value.encode('utf-8'),
|
||||
node.start_mark)
|
||||
return self.find_python_module(suffix, node.start_mark)
|
||||
|
||||
class classobj: pass
|
||||
|
||||
def make_python_instance(self, suffix, node,
|
||||
args=None, kwds=None, newobj=False, unsafe=False):
|
||||
if not args:
|
||||
args = []
|
||||
if not kwds:
|
||||
kwds = {}
|
||||
cls = self.find_python_name(suffix, node.start_mark)
|
||||
if not (unsafe or isinstance(cls, type) or isinstance(cls, type(self.classobj))):
|
||||
raise ConstructorError("while constructing a Python instance", node.start_mark,
|
||||
"expected a class, but found %r" % type(cls),
|
||||
node.start_mark)
|
||||
if newobj and isinstance(cls, type(self.classobj)) \
|
||||
and not args and not kwds:
|
||||
instance = self.classobj()
|
||||
instance.__class__ = cls
|
||||
return instance
|
||||
elif newobj and isinstance(cls, type):
|
||||
return cls.__new__(cls, *args, **kwds)
|
||||
else:
|
||||
return cls(*args, **kwds)
|
||||
|
||||
def set_python_instance_state(self, instance, state):
|
||||
if hasattr(instance, '__setstate__'):
|
||||
instance.__setstate__(state)
|
||||
else:
|
||||
slotstate = {}
|
||||
if isinstance(state, tuple) and len(state) == 2:
|
||||
state, slotstate = state
|
||||
if hasattr(instance, '__dict__'):
|
||||
instance.__dict__.update(state)
|
||||
elif state:
|
||||
slotstate.update(state)
|
||||
for key, value in slotstate.items():
|
||||
setattr(object, key, value)
|
||||
|
||||
def construct_python_object(self, suffix, node):
|
||||
# Format:
|
||||
# !!python/object:module.name { ... state ... }
|
||||
instance = self.make_python_instance(suffix, node, newobj=True)
|
||||
yield instance
|
||||
deep = hasattr(instance, '__setstate__')
|
||||
state = self.construct_mapping(node, deep=deep)
|
||||
self.set_python_instance_state(instance, state)
|
||||
|
||||
def construct_python_object_apply(self, suffix, node, newobj=False):
|
||||
# Format:
|
||||
# !!python/object/apply # (or !!python/object/new)
|
||||
# args: [ ... arguments ... ]
|
||||
# kwds: { ... keywords ... }
|
||||
# state: ... state ...
|
||||
# listitems: [ ... listitems ... ]
|
||||
# dictitems: { ... dictitems ... }
|
||||
# or short format:
|
||||
# !!python/object/apply [ ... arguments ... ]
|
||||
# The difference between !!python/object/apply and !!python/object/new
|
||||
# is how an object is created, check make_python_instance for details.
|
||||
if isinstance(node, SequenceNode):
|
||||
args = self.construct_sequence(node, deep=True)
|
||||
kwds = {}
|
||||
state = {}
|
||||
listitems = []
|
||||
dictitems = {}
|
||||
else:
|
||||
value = self.construct_mapping(node, deep=True)
|
||||
args = value.get('args', [])
|
||||
kwds = value.get('kwds', {})
|
||||
state = value.get('state', {})
|
||||
listitems = value.get('listitems', [])
|
||||
dictitems = value.get('dictitems', {})
|
||||
instance = self.make_python_instance(suffix, node, args, kwds, newobj)
|
||||
if state:
|
||||
self.set_python_instance_state(instance, state)
|
||||
if listitems:
|
||||
instance.extend(listitems)
|
||||
if dictitems:
|
||||
for key in dictitems:
|
||||
instance[key] = dictitems[key]
|
||||
return instance
|
||||
|
||||
def construct_python_object_new(self, suffix, node):
|
||||
return self.construct_python_object_apply(suffix, node, newobj=True)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/none',
|
||||
FullConstructor.construct_yaml_null)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/bool',
|
||||
FullConstructor.construct_yaml_bool)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/str',
|
||||
FullConstructor.construct_python_str)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/unicode',
|
||||
FullConstructor.construct_python_unicode)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/int',
|
||||
FullConstructor.construct_yaml_int)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/long',
|
||||
FullConstructor.construct_python_long)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/float',
|
||||
FullConstructor.construct_yaml_float)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/complex',
|
||||
FullConstructor.construct_python_complex)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/list',
|
||||
FullConstructor.construct_yaml_seq)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/tuple',
|
||||
FullConstructor.construct_python_tuple)
|
||||
|
||||
FullConstructor.add_constructor(
|
||||
u'tag:yaml.org,2002:python/dict',
|
||||
FullConstructor.construct_yaml_map)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/name:',
|
||||
FullConstructor.construct_python_name)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/module:',
|
||||
FullConstructor.construct_python_module)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object:',
|
||||
FullConstructor.construct_python_object)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object/apply:',
|
||||
FullConstructor.construct_python_object_apply)
|
||||
|
||||
FullConstructor.add_multi_constructor(
|
||||
u'tag:yaml.org,2002:python/object/new:',
|
||||
FullConstructor.construct_python_object_new)
|
||||
|
||||
class UnsafeConstructor(FullConstructor):
|
||||
|
||||
def find_python_module(self, name, mark):
|
||||
return super(UnsafeConstructor, self).find_python_module(name, mark, unsafe=True)
|
||||
|
||||
def find_python_name(self, name, mark):
|
||||
return super(UnsafeConstructor, self).find_python_name(name, mark, unsafe=True)
|
||||
|
||||
def make_python_instance(self, suffix, node, args=None, kwds=None, newobj=False):
|
||||
return super(UnsafeConstructor, self).make_python_instance(
|
||||
suffix, node, args, kwds, newobj, unsafe=True)
|
||||
|
||||
# Constructor is same as UnsafeConstructor. Need to leave this in place in case
|
||||
# people have extended it directly.
|
||||
class Constructor(UnsafeConstructor):
|
||||
pass
|
||||
@@ -1,101 +0,0 @@
|
||||
|
||||
__all__ = [
|
||||
'CBaseLoader', 'CSafeLoader', 'CFullLoader', 'CUnsafeLoader', 'CLoader',
|
||||
'CBaseDumper', 'CSafeDumper', 'CDumper'
|
||||
]
|
||||
|
||||
from _yaml import CParser, CEmitter
|
||||
|
||||
from constructor import *
|
||||
|
||||
from serializer import *
|
||||
from representer import *
|
||||
|
||||
from resolver import *
|
||||
|
||||
class CBaseLoader(CParser, BaseConstructor, BaseResolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
CParser.__init__(self, stream)
|
||||
BaseConstructor.__init__(self)
|
||||
BaseResolver.__init__(self)
|
||||
|
||||
class CSafeLoader(CParser, SafeConstructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
CParser.__init__(self, stream)
|
||||
SafeConstructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CFullLoader(CParser, FullConstructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
CParser.__init__(self, stream)
|
||||
FullConstructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CUnsafeLoader(CParser, UnsafeConstructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
CParser.__init__(self, stream)
|
||||
UnsafeConstructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CLoader(CParser, Constructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
CParser.__init__(self, stream)
|
||||
Constructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CBaseDumper(CEmitter, BaseRepresenter, BaseResolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
CEmitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width, encoding=encoding,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
Representer.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CSafeDumper(CEmitter, SafeRepresenter, Resolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
CEmitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width, encoding=encoding,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
SafeRepresenter.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class CDumper(CEmitter, Serializer, Representer, Resolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
CEmitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width, encoding=encoding,
|
||||
allow_unicode=allow_unicode, line_break=line_break,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
Representer.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
@@ -1,62 +0,0 @@
|
||||
|
||||
__all__ = ['BaseDumper', 'SafeDumper', 'Dumper']
|
||||
|
||||
from emitter import *
|
||||
from serializer import *
|
||||
from representer import *
|
||||
from resolver import *
|
||||
|
||||
class BaseDumper(Emitter, Serializer, BaseRepresenter, BaseResolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
Emitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break)
|
||||
Serializer.__init__(self, encoding=encoding,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
Representer.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class SafeDumper(Emitter, Serializer, SafeRepresenter, Resolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
Emitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break)
|
||||
Serializer.__init__(self, encoding=encoding,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
SafeRepresenter.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class Dumper(Emitter, Serializer, Representer, Resolver):
|
||||
|
||||
def __init__(self, stream,
|
||||
default_style=None, default_flow_style=False,
|
||||
canonical=None, indent=None, width=None,
|
||||
allow_unicode=None, line_break=None,
|
||||
encoding=None, explicit_start=None, explicit_end=None,
|
||||
version=None, tags=None, sort_keys=True):
|
||||
Emitter.__init__(self, stream, canonical=canonical,
|
||||
indent=indent, width=width,
|
||||
allow_unicode=allow_unicode, line_break=line_break)
|
||||
Serializer.__init__(self, encoding=encoding,
|
||||
explicit_start=explicit_start, explicit_end=explicit_end,
|
||||
version=version, tags=tags)
|
||||
Representer.__init__(self, default_style=default_style,
|
||||
default_flow_style=default_flow_style, sort_keys=sort_keys)
|
||||
Resolver.__init__(self)
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,75 +0,0 @@
|
||||
|
||||
__all__ = ['Mark', 'YAMLError', 'MarkedYAMLError']
|
||||
|
||||
class Mark(object):
|
||||
|
||||
def __init__(self, name, index, line, column, buffer, pointer):
|
||||
self.name = name
|
||||
self.index = index
|
||||
self.line = line
|
||||
self.column = column
|
||||
self.buffer = buffer
|
||||
self.pointer = pointer
|
||||
|
||||
def get_snippet(self, indent=4, max_length=75):
|
||||
if self.buffer is None:
|
||||
return None
|
||||
head = ''
|
||||
start = self.pointer
|
||||
while start > 0 and self.buffer[start-1] not in u'\0\r\n\x85\u2028\u2029':
|
||||
start -= 1
|
||||
if self.pointer-start > max_length/2-1:
|
||||
head = ' ... '
|
||||
start += 5
|
||||
break
|
||||
tail = ''
|
||||
end = self.pointer
|
||||
while end < len(self.buffer) and self.buffer[end] not in u'\0\r\n\x85\u2028\u2029':
|
||||
end += 1
|
||||
if end-self.pointer > max_length/2-1:
|
||||
tail = ' ... '
|
||||
end -= 5
|
||||
break
|
||||
snippet = self.buffer[start:end].encode('utf-8')
|
||||
return ' '*indent + head + snippet + tail + '\n' \
|
||||
+ ' '*(indent+self.pointer-start+len(head)) + '^'
|
||||
|
||||
def __str__(self):
|
||||
snippet = self.get_snippet()
|
||||
where = " in \"%s\", line %d, column %d" \
|
||||
% (self.name, self.line+1, self.column+1)
|
||||
if snippet is not None:
|
||||
where += ":\n"+snippet
|
||||
return where
|
||||
|
||||
class YAMLError(Exception):
|
||||
pass
|
||||
|
||||
class MarkedYAMLError(YAMLError):
|
||||
|
||||
def __init__(self, context=None, context_mark=None,
|
||||
problem=None, problem_mark=None, note=None):
|
||||
self.context = context
|
||||
self.context_mark = context_mark
|
||||
self.problem = problem
|
||||
self.problem_mark = problem_mark
|
||||
self.note = note
|
||||
|
||||
def __str__(self):
|
||||
lines = []
|
||||
if self.context is not None:
|
||||
lines.append(self.context)
|
||||
if self.context_mark is not None \
|
||||
and (self.problem is None or self.problem_mark is None
|
||||
or self.context_mark.name != self.problem_mark.name
|
||||
or self.context_mark.line != self.problem_mark.line
|
||||
or self.context_mark.column != self.problem_mark.column):
|
||||
lines.append(str(self.context_mark))
|
||||
if self.problem is not None:
|
||||
lines.append(self.problem)
|
||||
if self.problem_mark is not None:
|
||||
lines.append(str(self.problem_mark))
|
||||
if self.note is not None:
|
||||
lines.append(self.note)
|
||||
return '\n'.join(lines)
|
||||
|
||||
@@ -1,86 +0,0 @@
|
||||
|
||||
# Abstract classes.
|
||||
|
||||
class Event(object):
|
||||
def __init__(self, start_mark=None, end_mark=None):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
def __repr__(self):
|
||||
attributes = [key for key in ['anchor', 'tag', 'implicit', 'value']
|
||||
if hasattr(self, key)]
|
||||
arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
|
||||
for key in attributes])
|
||||
return '%s(%s)' % (self.__class__.__name__, arguments)
|
||||
|
||||
class NodeEvent(Event):
|
||||
def __init__(self, anchor, start_mark=None, end_mark=None):
|
||||
self.anchor = anchor
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
|
||||
class CollectionStartEvent(NodeEvent):
|
||||
def __init__(self, anchor, tag, implicit, start_mark=None, end_mark=None,
|
||||
flow_style=None):
|
||||
self.anchor = anchor
|
||||
self.tag = tag
|
||||
self.implicit = implicit
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.flow_style = flow_style
|
||||
|
||||
class CollectionEndEvent(Event):
|
||||
pass
|
||||
|
||||
# Implementations.
|
||||
|
||||
class StreamStartEvent(Event):
|
||||
def __init__(self, start_mark=None, end_mark=None, encoding=None):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.encoding = encoding
|
||||
|
||||
class StreamEndEvent(Event):
|
||||
pass
|
||||
|
||||
class DocumentStartEvent(Event):
|
||||
def __init__(self, start_mark=None, end_mark=None,
|
||||
explicit=None, version=None, tags=None):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.explicit = explicit
|
||||
self.version = version
|
||||
self.tags = tags
|
||||
|
||||
class DocumentEndEvent(Event):
|
||||
def __init__(self, start_mark=None, end_mark=None,
|
||||
explicit=None):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.explicit = explicit
|
||||
|
||||
class AliasEvent(NodeEvent):
|
||||
pass
|
||||
|
||||
class ScalarEvent(NodeEvent):
|
||||
def __init__(self, anchor, tag, implicit, value,
|
||||
start_mark=None, end_mark=None, style=None):
|
||||
self.anchor = anchor
|
||||
self.tag = tag
|
||||
self.implicit = implicit
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.style = style
|
||||
|
||||
class SequenceStartEvent(CollectionStartEvent):
|
||||
pass
|
||||
|
||||
class SequenceEndEvent(CollectionEndEvent):
|
||||
pass
|
||||
|
||||
class MappingStartEvent(CollectionStartEvent):
|
||||
pass
|
||||
|
||||
class MappingEndEvent(CollectionEndEvent):
|
||||
pass
|
||||
|
||||
@@ -1,63 +0,0 @@
|
||||
|
||||
__all__ = ['BaseLoader', 'FullLoader', 'SafeLoader', 'Loader', 'UnsafeLoader']
|
||||
|
||||
from reader import *
|
||||
from scanner import *
|
||||
from parser import *
|
||||
from composer import *
|
||||
from constructor import *
|
||||
from resolver import *
|
||||
|
||||
class BaseLoader(Reader, Scanner, Parser, Composer, BaseConstructor, BaseResolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
Reader.__init__(self, stream)
|
||||
Scanner.__init__(self)
|
||||
Parser.__init__(self)
|
||||
Composer.__init__(self)
|
||||
BaseConstructor.__init__(self)
|
||||
BaseResolver.__init__(self)
|
||||
|
||||
class FullLoader(Reader, Scanner, Parser, Composer, FullConstructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
Reader.__init__(self, stream)
|
||||
Scanner.__init__(self)
|
||||
Parser.__init__(self)
|
||||
Composer.__init__(self)
|
||||
FullConstructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class SafeLoader(Reader, Scanner, Parser, Composer, SafeConstructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
Reader.__init__(self, stream)
|
||||
Scanner.__init__(self)
|
||||
Parser.__init__(self)
|
||||
Composer.__init__(self)
|
||||
SafeConstructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
class Loader(Reader, Scanner, Parser, Composer, Constructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
Reader.__init__(self, stream)
|
||||
Scanner.__init__(self)
|
||||
Parser.__init__(self)
|
||||
Composer.__init__(self)
|
||||
Constructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
|
||||
# UnsafeLoader is the same as Loader (which is and was always unsafe on
|
||||
# untrusted input). Use of either Loader or UnsafeLoader should be rare, since
|
||||
# FullLoad should be able to load almost all YAML safely. Loader is left intact
|
||||
# to ensure backwards compatability.
|
||||
class UnsafeLoader(Reader, Scanner, Parser, Composer, Constructor, Resolver):
|
||||
|
||||
def __init__(self, stream):
|
||||
Reader.__init__(self, stream)
|
||||
Scanner.__init__(self)
|
||||
Parser.__init__(self)
|
||||
Composer.__init__(self)
|
||||
Constructor.__init__(self)
|
||||
Resolver.__init__(self)
|
||||
@@ -1,49 +0,0 @@
|
||||
|
||||
class Node(object):
|
||||
def __init__(self, tag, value, start_mark, end_mark):
|
||||
self.tag = tag
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
def __repr__(self):
|
||||
value = self.value
|
||||
#if isinstance(value, list):
|
||||
# if len(value) == 0:
|
||||
# value = '<empty>'
|
||||
# elif len(value) == 1:
|
||||
# value = '<1 item>'
|
||||
# else:
|
||||
# value = '<%d items>' % len(value)
|
||||
#else:
|
||||
# if len(value) > 75:
|
||||
# value = repr(value[:70]+u' ... ')
|
||||
# else:
|
||||
# value = repr(value)
|
||||
value = repr(value)
|
||||
return '%s(tag=%r, value=%s)' % (self.__class__.__name__, self.tag, value)
|
||||
|
||||
class ScalarNode(Node):
|
||||
id = 'scalar'
|
||||
def __init__(self, tag, value,
|
||||
start_mark=None, end_mark=None, style=None):
|
||||
self.tag = tag
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.style = style
|
||||
|
||||
class CollectionNode(Node):
|
||||
def __init__(self, tag, value,
|
||||
start_mark=None, end_mark=None, flow_style=None):
|
||||
self.tag = tag
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.flow_style = flow_style
|
||||
|
||||
class SequenceNode(CollectionNode):
|
||||
id = 'sequence'
|
||||
|
||||
class MappingNode(CollectionNode):
|
||||
id = 'mapping'
|
||||
|
||||
@@ -1,589 +0,0 @@
|
||||
|
||||
# The following YAML grammar is LL(1) and is parsed by a recursive descent
|
||||
# parser.
|
||||
#
|
||||
# stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
|
||||
# implicit_document ::= block_node DOCUMENT-END*
|
||||
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
|
||||
# block_node_or_indentless_sequence ::=
|
||||
# ALIAS
|
||||
# | properties (block_content | indentless_block_sequence)?
|
||||
# | block_content
|
||||
# | indentless_block_sequence
|
||||
# block_node ::= ALIAS
|
||||
# | properties block_content?
|
||||
# | block_content
|
||||
# flow_node ::= ALIAS
|
||||
# | properties flow_content?
|
||||
# | flow_content
|
||||
# properties ::= TAG ANCHOR? | ANCHOR TAG?
|
||||
# block_content ::= block_collection | flow_collection | SCALAR
|
||||
# flow_content ::= flow_collection | SCALAR
|
||||
# block_collection ::= block_sequence | block_mapping
|
||||
# flow_collection ::= flow_sequence | flow_mapping
|
||||
# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
|
||||
# indentless_sequence ::= (BLOCK-ENTRY block_node?)+
|
||||
# block_mapping ::= BLOCK-MAPPING_START
|
||||
# ((KEY block_node_or_indentless_sequence?)?
|
||||
# (VALUE block_node_or_indentless_sequence?)?)*
|
||||
# BLOCK-END
|
||||
# flow_sequence ::= FLOW-SEQUENCE-START
|
||||
# (flow_sequence_entry FLOW-ENTRY)*
|
||||
# flow_sequence_entry?
|
||||
# FLOW-SEQUENCE-END
|
||||
# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
|
||||
# flow_mapping ::= FLOW-MAPPING-START
|
||||
# (flow_mapping_entry FLOW-ENTRY)*
|
||||
# flow_mapping_entry?
|
||||
# FLOW-MAPPING-END
|
||||
# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
|
||||
#
|
||||
# FIRST sets:
|
||||
#
|
||||
# stream: { STREAM-START }
|
||||
# explicit_document: { DIRECTIVE DOCUMENT-START }
|
||||
# implicit_document: FIRST(block_node)
|
||||
# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
|
||||
# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
|
||||
# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
|
||||
# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
|
||||
# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
|
||||
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
|
||||
# block_sequence: { BLOCK-SEQUENCE-START }
|
||||
# block_mapping: { BLOCK-MAPPING-START }
|
||||
# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
|
||||
# indentless_sequence: { ENTRY }
|
||||
# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
|
||||
# flow_sequence: { FLOW-SEQUENCE-START }
|
||||
# flow_mapping: { FLOW-MAPPING-START }
|
||||
# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
|
||||
# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
|
||||
|
||||
__all__ = ['Parser', 'ParserError']
|
||||
|
||||
from error import MarkedYAMLError
|
||||
from tokens import *
|
||||
from events import *
|
||||
from scanner import *
|
||||
|
||||
class ParserError(MarkedYAMLError):
|
||||
pass
|
||||
|
||||
class Parser(object):
|
||||
# Since writing a recursive-descendant parser is a straightforward task, we
|
||||
# do not give many comments here.
|
||||
|
||||
DEFAULT_TAGS = {
|
||||
u'!': u'!',
|
||||
u'!!': u'tag:yaml.org,2002:',
|
||||
}
|
||||
|
||||
def __init__(self):
|
||||
self.current_event = None
|
||||
self.yaml_version = None
|
||||
self.tag_handles = {}
|
||||
self.states = []
|
||||
self.marks = []
|
||||
self.state = self.parse_stream_start
|
||||
|
||||
def dispose(self):
|
||||
# Reset the state attributes (to clear self-references)
|
||||
self.states = []
|
||||
self.state = None
|
||||
|
||||
def check_event(self, *choices):
|
||||
# Check the type of the next event.
|
||||
if self.current_event is None:
|
||||
if self.state:
|
||||
self.current_event = self.state()
|
||||
if self.current_event is not None:
|
||||
if not choices:
|
||||
return True
|
||||
for choice in choices:
|
||||
if isinstance(self.current_event, choice):
|
||||
return True
|
||||
return False
|
||||
|
||||
def peek_event(self):
|
||||
# Get the next event.
|
||||
if self.current_event is None:
|
||||
if self.state:
|
||||
self.current_event = self.state()
|
||||
return self.current_event
|
||||
|
||||
def get_event(self):
|
||||
# Get the next event and proceed further.
|
||||
if self.current_event is None:
|
||||
if self.state:
|
||||
self.current_event = self.state()
|
||||
value = self.current_event
|
||||
self.current_event = None
|
||||
return value
|
||||
|
||||
# stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
|
||||
# implicit_document ::= block_node DOCUMENT-END*
|
||||
# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
|
||||
|
||||
def parse_stream_start(self):
|
||||
|
||||
# Parse the stream start.
|
||||
token = self.get_token()
|
||||
event = StreamStartEvent(token.start_mark, token.end_mark,
|
||||
encoding=token.encoding)
|
||||
|
||||
# Prepare the next state.
|
||||
self.state = self.parse_implicit_document_start
|
||||
|
||||
return event
|
||||
|
||||
def parse_implicit_document_start(self):
|
||||
|
||||
# Parse an implicit document.
|
||||
if not self.check_token(DirectiveToken, DocumentStartToken,
|
||||
StreamEndToken):
|
||||
self.tag_handles = self.DEFAULT_TAGS
|
||||
token = self.peek_token()
|
||||
start_mark = end_mark = token.start_mark
|
||||
event = DocumentStartEvent(start_mark, end_mark,
|
||||
explicit=False)
|
||||
|
||||
# Prepare the next state.
|
||||
self.states.append(self.parse_document_end)
|
||||
self.state = self.parse_block_node
|
||||
|
||||
return event
|
||||
|
||||
else:
|
||||
return self.parse_document_start()
|
||||
|
||||
def parse_document_start(self):
|
||||
|
||||
# Parse any extra document end indicators.
|
||||
while self.check_token(DocumentEndToken):
|
||||
self.get_token()
|
||||
|
||||
# Parse an explicit document.
|
||||
if not self.check_token(StreamEndToken):
|
||||
token = self.peek_token()
|
||||
start_mark = token.start_mark
|
||||
version, tags = self.process_directives()
|
||||
if not self.check_token(DocumentStartToken):
|
||||
raise ParserError(None, None,
|
||||
"expected '<document start>', but found %r"
|
||||
% self.peek_token().id,
|
||||
self.peek_token().start_mark)
|
||||
token = self.get_token()
|
||||
end_mark = token.end_mark
|
||||
event = DocumentStartEvent(start_mark, end_mark,
|
||||
explicit=True, version=version, tags=tags)
|
||||
self.states.append(self.parse_document_end)
|
||||
self.state = self.parse_document_content
|
||||
else:
|
||||
# Parse the end of the stream.
|
||||
token = self.get_token()
|
||||
event = StreamEndEvent(token.start_mark, token.end_mark)
|
||||
assert not self.states
|
||||
assert not self.marks
|
||||
self.state = None
|
||||
return event
|
||||
|
||||
def parse_document_end(self):
|
||||
|
||||
# Parse the document end.
|
||||
token = self.peek_token()
|
||||
start_mark = end_mark = token.start_mark
|
||||
explicit = False
|
||||
if self.check_token(DocumentEndToken):
|
||||
token = self.get_token()
|
||||
end_mark = token.end_mark
|
||||
explicit = True
|
||||
event = DocumentEndEvent(start_mark, end_mark,
|
||||
explicit=explicit)
|
||||
|
||||
# Prepare the next state.
|
||||
self.state = self.parse_document_start
|
||||
|
||||
return event
|
||||
|
||||
def parse_document_content(self):
|
||||
if self.check_token(DirectiveToken,
|
||||
DocumentStartToken, DocumentEndToken, StreamEndToken):
|
||||
event = self.process_empty_scalar(self.peek_token().start_mark)
|
||||
self.state = self.states.pop()
|
||||
return event
|
||||
else:
|
||||
return self.parse_block_node()
|
||||
|
||||
def process_directives(self):
|
||||
self.yaml_version = None
|
||||
self.tag_handles = {}
|
||||
while self.check_token(DirectiveToken):
|
||||
token = self.get_token()
|
||||
if token.name == u'YAML':
|
||||
if self.yaml_version is not None:
|
||||
raise ParserError(None, None,
|
||||
"found duplicate YAML directive", token.start_mark)
|
||||
major, minor = token.value
|
||||
if major != 1:
|
||||
raise ParserError(None, None,
|
||||
"found incompatible YAML document (version 1.* is required)",
|
||||
token.start_mark)
|
||||
self.yaml_version = token.value
|
||||
elif token.name == u'TAG':
|
||||
handle, prefix = token.value
|
||||
if handle in self.tag_handles:
|
||||
raise ParserError(None, None,
|
||||
"duplicate tag handle %r" % handle.encode('utf-8'),
|
||||
token.start_mark)
|
||||
self.tag_handles[handle] = prefix
|
||||
if self.tag_handles:
|
||||
value = self.yaml_version, self.tag_handles.copy()
|
||||
else:
|
||||
value = self.yaml_version, None
|
||||
for key in self.DEFAULT_TAGS:
|
||||
if key not in self.tag_handles:
|
||||
self.tag_handles[key] = self.DEFAULT_TAGS[key]
|
||||
return value
|
||||
|
||||
# block_node_or_indentless_sequence ::= ALIAS
|
||||
# | properties (block_content | indentless_block_sequence)?
|
||||
# | block_content
|
||||
# | indentless_block_sequence
|
||||
# block_node ::= ALIAS
|
||||
# | properties block_content?
|
||||
# | block_content
|
||||
# flow_node ::= ALIAS
|
||||
# | properties flow_content?
|
||||
# | flow_content
|
||||
# properties ::= TAG ANCHOR? | ANCHOR TAG?
|
||||
# block_content ::= block_collection | flow_collection | SCALAR
|
||||
# flow_content ::= flow_collection | SCALAR
|
||||
# block_collection ::= block_sequence | block_mapping
|
||||
# flow_collection ::= flow_sequence | flow_mapping
|
||||
|
||||
def parse_block_node(self):
|
||||
return self.parse_node(block=True)
|
||||
|
||||
def parse_flow_node(self):
|
||||
return self.parse_node()
|
||||
|
||||
def parse_block_node_or_indentless_sequence(self):
|
||||
return self.parse_node(block=True, indentless_sequence=True)
|
||||
|
||||
def parse_node(self, block=False, indentless_sequence=False):
|
||||
if self.check_token(AliasToken):
|
||||
token = self.get_token()
|
||||
event = AliasEvent(token.value, token.start_mark, token.end_mark)
|
||||
self.state = self.states.pop()
|
||||
else:
|
||||
anchor = None
|
||||
tag = None
|
||||
start_mark = end_mark = tag_mark = None
|
||||
if self.check_token(AnchorToken):
|
||||
token = self.get_token()
|
||||
start_mark = token.start_mark
|
||||
end_mark = token.end_mark
|
||||
anchor = token.value
|
||||
if self.check_token(TagToken):
|
||||
token = self.get_token()
|
||||
tag_mark = token.start_mark
|
||||
end_mark = token.end_mark
|
||||
tag = token.value
|
||||
elif self.check_token(TagToken):
|
||||
token = self.get_token()
|
||||
start_mark = tag_mark = token.start_mark
|
||||
end_mark = token.end_mark
|
||||
tag = token.value
|
||||
if self.check_token(AnchorToken):
|
||||
token = self.get_token()
|
||||
end_mark = token.end_mark
|
||||
anchor = token.value
|
||||
if tag is not None:
|
||||
handle, suffix = tag
|
||||
if handle is not None:
|
||||
if handle not in self.tag_handles:
|
||||
raise ParserError("while parsing a node", start_mark,
|
||||
"found undefined tag handle %r" % handle.encode('utf-8'),
|
||||
tag_mark)
|
||||
tag = self.tag_handles[handle]+suffix
|
||||
else:
|
||||
tag = suffix
|
||||
#if tag == u'!':
|
||||
# raise ParserError("while parsing a node", start_mark,
|
||||
# "found non-specific tag '!'", tag_mark,
|
||||
# "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
|
||||
if start_mark is None:
|
||||
start_mark = end_mark = self.peek_token().start_mark
|
||||
event = None
|
||||
implicit = (tag is None or tag == u'!')
|
||||
if indentless_sequence and self.check_token(BlockEntryToken):
|
||||
end_mark = self.peek_token().end_mark
|
||||
event = SequenceStartEvent(anchor, tag, implicit,
|
||||
start_mark, end_mark)
|
||||
self.state = self.parse_indentless_sequence_entry
|
||||
else:
|
||||
if self.check_token(ScalarToken):
|
||||
token = self.get_token()
|
||||
end_mark = token.end_mark
|
||||
if (token.plain and tag is None) or tag == u'!':
|
||||
implicit = (True, False)
|
||||
elif tag is None:
|
||||
implicit = (False, True)
|
||||
else:
|
||||
implicit = (False, False)
|
||||
event = ScalarEvent(anchor, tag, implicit, token.value,
|
||||
start_mark, end_mark, style=token.style)
|
||||
self.state = self.states.pop()
|
||||
elif self.check_token(FlowSequenceStartToken):
|
||||
end_mark = self.peek_token().end_mark
|
||||
event = SequenceStartEvent(anchor, tag, implicit,
|
||||
start_mark, end_mark, flow_style=True)
|
||||
self.state = self.parse_flow_sequence_first_entry
|
||||
elif self.check_token(FlowMappingStartToken):
|
||||
end_mark = self.peek_token().end_mark
|
||||
event = MappingStartEvent(anchor, tag, implicit,
|
||||
start_mark, end_mark, flow_style=True)
|
||||
self.state = self.parse_flow_mapping_first_key
|
||||
elif block and self.check_token(BlockSequenceStartToken):
|
||||
end_mark = self.peek_token().start_mark
|
||||
event = SequenceStartEvent(anchor, tag, implicit,
|
||||
start_mark, end_mark, flow_style=False)
|
||||
self.state = self.parse_block_sequence_first_entry
|
||||
elif block and self.check_token(BlockMappingStartToken):
|
||||
end_mark = self.peek_token().start_mark
|
||||
event = MappingStartEvent(anchor, tag, implicit,
|
||||
start_mark, end_mark, flow_style=False)
|
||||
self.state = self.parse_block_mapping_first_key
|
||||
elif anchor is not None or tag is not None:
|
||||
# Empty scalars are allowed even if a tag or an anchor is
|
||||
# specified.
|
||||
event = ScalarEvent(anchor, tag, (implicit, False), u'',
|
||||
start_mark, end_mark)
|
||||
self.state = self.states.pop()
|
||||
else:
|
||||
if block:
|
||||
node = 'block'
|
||||
else:
|
||||
node = 'flow'
|
||||
token = self.peek_token()
|
||||
raise ParserError("while parsing a %s node" % node, start_mark,
|
||||
"expected the node content, but found %r" % token.id,
|
||||
token.start_mark)
|
||||
return event
|
||||
|
||||
# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
|
||||
|
||||
def parse_block_sequence_first_entry(self):
|
||||
token = self.get_token()
|
||||
self.marks.append(token.start_mark)
|
||||
return self.parse_block_sequence_entry()
|
||||
|
||||
def parse_block_sequence_entry(self):
|
||||
if self.check_token(BlockEntryToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(BlockEntryToken, BlockEndToken):
|
||||
self.states.append(self.parse_block_sequence_entry)
|
||||
return self.parse_block_node()
|
||||
else:
|
||||
self.state = self.parse_block_sequence_entry
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
if not self.check_token(BlockEndToken):
|
||||
token = self.peek_token()
|
||||
raise ParserError("while parsing a block collection", self.marks[-1],
|
||||
"expected <block end>, but found %r" % token.id, token.start_mark)
|
||||
token = self.get_token()
|
||||
event = SequenceEndEvent(token.start_mark, token.end_mark)
|
||||
self.state = self.states.pop()
|
||||
self.marks.pop()
|
||||
return event
|
||||
|
||||
# indentless_sequence ::= (BLOCK-ENTRY block_node?)+
|
||||
|
||||
def parse_indentless_sequence_entry(self):
|
||||
if self.check_token(BlockEntryToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(BlockEntryToken,
|
||||
KeyToken, ValueToken, BlockEndToken):
|
||||
self.states.append(self.parse_indentless_sequence_entry)
|
||||
return self.parse_block_node()
|
||||
else:
|
||||
self.state = self.parse_indentless_sequence_entry
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
token = self.peek_token()
|
||||
event = SequenceEndEvent(token.start_mark, token.start_mark)
|
||||
self.state = self.states.pop()
|
||||
return event
|
||||
|
||||
# block_mapping ::= BLOCK-MAPPING_START
|
||||
# ((KEY block_node_or_indentless_sequence?)?
|
||||
# (VALUE block_node_or_indentless_sequence?)?)*
|
||||
# BLOCK-END
|
||||
|
||||
def parse_block_mapping_first_key(self):
|
||||
token = self.get_token()
|
||||
self.marks.append(token.start_mark)
|
||||
return self.parse_block_mapping_key()
|
||||
|
||||
def parse_block_mapping_key(self):
|
||||
if self.check_token(KeyToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(KeyToken, ValueToken, BlockEndToken):
|
||||
self.states.append(self.parse_block_mapping_value)
|
||||
return self.parse_block_node_or_indentless_sequence()
|
||||
else:
|
||||
self.state = self.parse_block_mapping_value
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
if not self.check_token(BlockEndToken):
|
||||
token = self.peek_token()
|
||||
raise ParserError("while parsing a block mapping", self.marks[-1],
|
||||
"expected <block end>, but found %r" % token.id, token.start_mark)
|
||||
token = self.get_token()
|
||||
event = MappingEndEvent(token.start_mark, token.end_mark)
|
||||
self.state = self.states.pop()
|
||||
self.marks.pop()
|
||||
return event
|
||||
|
||||
def parse_block_mapping_value(self):
|
||||
if self.check_token(ValueToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(KeyToken, ValueToken, BlockEndToken):
|
||||
self.states.append(self.parse_block_mapping_key)
|
||||
return self.parse_block_node_or_indentless_sequence()
|
||||
else:
|
||||
self.state = self.parse_block_mapping_key
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
else:
|
||||
self.state = self.parse_block_mapping_key
|
||||
token = self.peek_token()
|
||||
return self.process_empty_scalar(token.start_mark)
|
||||
|
||||
# flow_sequence ::= FLOW-SEQUENCE-START
|
||||
# (flow_sequence_entry FLOW-ENTRY)*
|
||||
# flow_sequence_entry?
|
||||
# FLOW-SEQUENCE-END
|
||||
# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
|
||||
#
|
||||
# Note that while production rules for both flow_sequence_entry and
|
||||
# flow_mapping_entry are equal, their interpretations are different.
|
||||
# For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
|
||||
# generate an inline mapping (set syntax).
|
||||
|
||||
def parse_flow_sequence_first_entry(self):
|
||||
token = self.get_token()
|
||||
self.marks.append(token.start_mark)
|
||||
return self.parse_flow_sequence_entry(first=True)
|
||||
|
||||
def parse_flow_sequence_entry(self, first=False):
|
||||
if not self.check_token(FlowSequenceEndToken):
|
||||
if not first:
|
||||
if self.check_token(FlowEntryToken):
|
||||
self.get_token()
|
||||
else:
|
||||
token = self.peek_token()
|
||||
raise ParserError("while parsing a flow sequence", self.marks[-1],
|
||||
"expected ',' or ']', but got %r" % token.id, token.start_mark)
|
||||
|
||||
if self.check_token(KeyToken):
|
||||
token = self.peek_token()
|
||||
event = MappingStartEvent(None, None, True,
|
||||
token.start_mark, token.end_mark,
|
||||
flow_style=True)
|
||||
self.state = self.parse_flow_sequence_entry_mapping_key
|
||||
return event
|
||||
elif not self.check_token(FlowSequenceEndToken):
|
||||
self.states.append(self.parse_flow_sequence_entry)
|
||||
return self.parse_flow_node()
|
||||
token = self.get_token()
|
||||
event = SequenceEndEvent(token.start_mark, token.end_mark)
|
||||
self.state = self.states.pop()
|
||||
self.marks.pop()
|
||||
return event
|
||||
|
||||
def parse_flow_sequence_entry_mapping_key(self):
|
||||
token = self.get_token()
|
||||
if not self.check_token(ValueToken,
|
||||
FlowEntryToken, FlowSequenceEndToken):
|
||||
self.states.append(self.parse_flow_sequence_entry_mapping_value)
|
||||
return self.parse_flow_node()
|
||||
else:
|
||||
self.state = self.parse_flow_sequence_entry_mapping_value
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
|
||||
def parse_flow_sequence_entry_mapping_value(self):
|
||||
if self.check_token(ValueToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
|
||||
self.states.append(self.parse_flow_sequence_entry_mapping_end)
|
||||
return self.parse_flow_node()
|
||||
else:
|
||||
self.state = self.parse_flow_sequence_entry_mapping_end
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
else:
|
||||
self.state = self.parse_flow_sequence_entry_mapping_end
|
||||
token = self.peek_token()
|
||||
return self.process_empty_scalar(token.start_mark)
|
||||
|
||||
def parse_flow_sequence_entry_mapping_end(self):
|
||||
self.state = self.parse_flow_sequence_entry
|
||||
token = self.peek_token()
|
||||
return MappingEndEvent(token.start_mark, token.start_mark)
|
||||
|
||||
# flow_mapping ::= FLOW-MAPPING-START
|
||||
# (flow_mapping_entry FLOW-ENTRY)*
|
||||
# flow_mapping_entry?
|
||||
# FLOW-MAPPING-END
|
||||
# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
|
||||
|
||||
def parse_flow_mapping_first_key(self):
|
||||
token = self.get_token()
|
||||
self.marks.append(token.start_mark)
|
||||
return self.parse_flow_mapping_key(first=True)
|
||||
|
||||
def parse_flow_mapping_key(self, first=False):
|
||||
if not self.check_token(FlowMappingEndToken):
|
||||
if not first:
|
||||
if self.check_token(FlowEntryToken):
|
||||
self.get_token()
|
||||
else:
|
||||
token = self.peek_token()
|
||||
raise ParserError("while parsing a flow mapping", self.marks[-1],
|
||||
"expected ',' or '}', but got %r" % token.id, token.start_mark)
|
||||
if self.check_token(KeyToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(ValueToken,
|
||||
FlowEntryToken, FlowMappingEndToken):
|
||||
self.states.append(self.parse_flow_mapping_value)
|
||||
return self.parse_flow_node()
|
||||
else:
|
||||
self.state = self.parse_flow_mapping_value
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
elif not self.check_token(FlowMappingEndToken):
|
||||
self.states.append(self.parse_flow_mapping_empty_value)
|
||||
return self.parse_flow_node()
|
||||
token = self.get_token()
|
||||
event = MappingEndEvent(token.start_mark, token.end_mark)
|
||||
self.state = self.states.pop()
|
||||
self.marks.pop()
|
||||
return event
|
||||
|
||||
def parse_flow_mapping_value(self):
|
||||
if self.check_token(ValueToken):
|
||||
token = self.get_token()
|
||||
if not self.check_token(FlowEntryToken, FlowMappingEndToken):
|
||||
self.states.append(self.parse_flow_mapping_key)
|
||||
return self.parse_flow_node()
|
||||
else:
|
||||
self.state = self.parse_flow_mapping_key
|
||||
return self.process_empty_scalar(token.end_mark)
|
||||
else:
|
||||
self.state = self.parse_flow_mapping_key
|
||||
token = self.peek_token()
|
||||
return self.process_empty_scalar(token.start_mark)
|
||||
|
||||
def parse_flow_mapping_empty_value(self):
|
||||
self.state = self.parse_flow_mapping_key
|
||||
return self.process_empty_scalar(self.peek_token().start_mark)
|
||||
|
||||
def process_empty_scalar(self, mark):
|
||||
return ScalarEvent(None, None, (True, False), u'', mark, mark)
|
||||
|
||||
@@ -1,188 +0,0 @@
|
||||
# This module contains abstractions for the input stream. You don't have to
|
||||
# looks further, there are no pretty code.
|
||||
#
|
||||
# We define two classes here.
|
||||
#
|
||||
# Mark(source, line, column)
|
||||
# It's just a record and its only use is producing nice error messages.
|
||||
# Parser does not use it for any other purposes.
|
||||
#
|
||||
# Reader(source, data)
|
||||
# Reader determines the encoding of `data` and converts it to unicode.
|
||||
# Reader provides the following methods and attributes:
|
||||
# reader.peek(length=1) - return the next `length` characters
|
||||
# reader.forward(length=1) - move the current position to `length` characters.
|
||||
# reader.index - the number of the current character.
|
||||
# reader.line, stream.column - the line and the column of the current character.
|
||||
|
||||
__all__ = ['Reader', 'ReaderError']
|
||||
|
||||
from error import YAMLError, Mark
|
||||
|
||||
import codecs, re, sys
|
||||
|
||||
has_ucs4 = sys.maxunicode > 0xffff
|
||||
|
||||
class ReaderError(YAMLError):
|
||||
|
||||
def __init__(self, name, position, character, encoding, reason):
|
||||
self.name = name
|
||||
self.character = character
|
||||
self.position = position
|
||||
self.encoding = encoding
|
||||
self.reason = reason
|
||||
|
||||
def __str__(self):
|
||||
if isinstance(self.character, str):
|
||||
return "'%s' codec can't decode byte #x%02x: %s\n" \
|
||||
" in \"%s\", position %d" \
|
||||
% (self.encoding, ord(self.character), self.reason,
|
||||
self.name, self.position)
|
||||
else:
|
||||
return "unacceptable character #x%04x: %s\n" \
|
||||
" in \"%s\", position %d" \
|
||||
% (self.character, self.reason,
|
||||
self.name, self.position)
|
||||
|
||||
class Reader(object):
|
||||
# Reader:
|
||||
# - determines the data encoding and converts it to unicode,
|
||||
# - checks if characters are in allowed range,
|
||||
# - adds '\0' to the end.
|
||||
|
||||
# Reader accepts
|
||||
# - a `str` object,
|
||||
# - a `unicode` object,
|
||||
# - a file-like object with its `read` method returning `str`,
|
||||
# - a file-like object with its `read` method returning `unicode`.
|
||||
|
||||
# Yeah, it's ugly and slow.
|
||||
|
||||
def __init__(self, stream):
|
||||
self.name = None
|
||||
self.stream = None
|
||||
self.stream_pointer = 0
|
||||
self.eof = True
|
||||
self.buffer = u''
|
||||
self.pointer = 0
|
||||
self.raw_buffer = None
|
||||
self.raw_decode = None
|
||||
self.encoding = None
|
||||
self.index = 0
|
||||
self.line = 0
|
||||
self.column = 0
|
||||
if isinstance(stream, unicode):
|
||||
self.name = "<unicode string>"
|
||||
self.check_printable(stream)
|
||||
self.buffer = stream+u'\0'
|
||||
elif isinstance(stream, str):
|
||||
self.name = "<string>"
|
||||
self.raw_buffer = stream
|
||||
self.determine_encoding()
|
||||
else:
|
||||
self.stream = stream
|
||||
self.name = getattr(stream, 'name', "<file>")
|
||||
self.eof = False
|
||||
self.raw_buffer = ''
|
||||
self.determine_encoding()
|
||||
|
||||
def peek(self, index=0):
|
||||
try:
|
||||
return self.buffer[self.pointer+index]
|
||||
except IndexError:
|
||||
self.update(index+1)
|
||||
return self.buffer[self.pointer+index]
|
||||
|
||||
def prefix(self, length=1):
|
||||
if self.pointer+length >= len(self.buffer):
|
||||
self.update(length)
|
||||
return self.buffer[self.pointer:self.pointer+length]
|
||||
|
||||
def forward(self, length=1):
|
||||
if self.pointer+length+1 >= len(self.buffer):
|
||||
self.update(length+1)
|
||||
while length:
|
||||
ch = self.buffer[self.pointer]
|
||||
self.pointer += 1
|
||||
self.index += 1
|
||||
if ch in u'\n\x85\u2028\u2029' \
|
||||
or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
|
||||
self.line += 1
|
||||
self.column = 0
|
||||
elif ch != u'\uFEFF':
|
||||
self.column += 1
|
||||
length -= 1
|
||||
|
||||
def get_mark(self):
|
||||
if self.stream is None:
|
||||
return Mark(self.name, self.index, self.line, self.column,
|
||||
self.buffer, self.pointer)
|
||||
else:
|
||||
return Mark(self.name, self.index, self.line, self.column,
|
||||
None, None)
|
||||
|
||||
def determine_encoding(self):
|
||||
while not self.eof and len(self.raw_buffer) < 2:
|
||||
self.update_raw()
|
||||
if not isinstance(self.raw_buffer, unicode):
|
||||
if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
|
||||
self.raw_decode = codecs.utf_16_le_decode
|
||||
self.encoding = 'utf-16-le'
|
||||
elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
|
||||
self.raw_decode = codecs.utf_16_be_decode
|
||||
self.encoding = 'utf-16-be'
|
||||
else:
|
||||
self.raw_decode = codecs.utf_8_decode
|
||||
self.encoding = 'utf-8'
|
||||
self.update(1)
|
||||
|
||||
if has_ucs4:
|
||||
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010ffff]')
|
||||
else:
|
||||
NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
|
||||
def check_printable(self, data):
|
||||
match = self.NON_PRINTABLE.search(data)
|
||||
if match:
|
||||
character = match.group()
|
||||
position = self.index+(len(self.buffer)-self.pointer)+match.start()
|
||||
raise ReaderError(self.name, position, ord(character),
|
||||
'unicode', "special characters are not allowed")
|
||||
|
||||
def update(self, length):
|
||||
if self.raw_buffer is None:
|
||||
return
|
||||
self.buffer = self.buffer[self.pointer:]
|
||||
self.pointer = 0
|
||||
while len(self.buffer) < length:
|
||||
if not self.eof:
|
||||
self.update_raw()
|
||||
if self.raw_decode is not None:
|
||||
try:
|
||||
data, converted = self.raw_decode(self.raw_buffer,
|
||||
'strict', self.eof)
|
||||
except UnicodeDecodeError, exc:
|
||||
character = exc.object[exc.start]
|
||||
if self.stream is not None:
|
||||
position = self.stream_pointer-len(self.raw_buffer)+exc.start
|
||||
else:
|
||||
position = exc.start
|
||||
raise ReaderError(self.name, position, character,
|
||||
exc.encoding, exc.reason)
|
||||
else:
|
||||
data = self.raw_buffer
|
||||
converted = len(data)
|
||||
self.check_printable(data)
|
||||
self.buffer += data
|
||||
self.raw_buffer = self.raw_buffer[converted:]
|
||||
if self.eof:
|
||||
self.buffer += u'\0'
|
||||
self.raw_buffer = None
|
||||
break
|
||||
|
||||
def update_raw(self, size=1024):
|
||||
data = self.stream.read(size)
|
||||
if data:
|
||||
self.raw_buffer += data
|
||||
self.stream_pointer += len(data)
|
||||
else:
|
||||
self.eof = True
|
||||
@@ -1,488 +0,0 @@
|
||||
|
||||
__all__ = ['BaseRepresenter', 'SafeRepresenter', 'Representer',
|
||||
'RepresenterError']
|
||||
|
||||
from error import *
|
||||
from nodes import *
|
||||
|
||||
import datetime
|
||||
|
||||
import sys, copy_reg, types
|
||||
|
||||
class RepresenterError(YAMLError):
|
||||
pass
|
||||
|
||||
class BaseRepresenter(object):
|
||||
|
||||
yaml_representers = {}
|
||||
yaml_multi_representers = {}
|
||||
|
||||
def __init__(self, default_style=None, default_flow_style=False, sort_keys=True):
|
||||
self.default_style = default_style
|
||||
self.default_flow_style = default_flow_style
|
||||
self.sort_keys = sort_keys
|
||||
self.represented_objects = {}
|
||||
self.object_keeper = []
|
||||
self.alias_key = None
|
||||
|
||||
def represent(self, data):
|
||||
node = self.represent_data(data)
|
||||
self.serialize(node)
|
||||
self.represented_objects = {}
|
||||
self.object_keeper = []
|
||||
self.alias_key = None
|
||||
|
||||
def get_classobj_bases(self, cls):
|
||||
bases = [cls]
|
||||
for base in cls.__bases__:
|
||||
bases.extend(self.get_classobj_bases(base))
|
||||
return bases
|
||||
|
||||
def represent_data(self, data):
|
||||
if self.ignore_aliases(data):
|
||||
self.alias_key = None
|
||||
else:
|
||||
self.alias_key = id(data)
|
||||
if self.alias_key is not None:
|
||||
if self.alias_key in self.represented_objects:
|
||||
node = self.represented_objects[self.alias_key]
|
||||
#if node is None:
|
||||
# raise RepresenterError("recursive objects are not allowed: %r" % data)
|
||||
return node
|
||||
#self.represented_objects[alias_key] = None
|
||||
self.object_keeper.append(data)
|
||||
data_types = type(data).__mro__
|
||||
if type(data) is types.InstanceType:
|
||||
data_types = self.get_classobj_bases(data.__class__)+list(data_types)
|
||||
if data_types[0] in self.yaml_representers:
|
||||
node = self.yaml_representers[data_types[0]](self, data)
|
||||
else:
|
||||
for data_type in data_types:
|
||||
if data_type in self.yaml_multi_representers:
|
||||
node = self.yaml_multi_representers[data_type](self, data)
|
||||
break
|
||||
else:
|
||||
if None in self.yaml_multi_representers:
|
||||
node = self.yaml_multi_representers[None](self, data)
|
||||
elif None in self.yaml_representers:
|
||||
node = self.yaml_representers[None](self, data)
|
||||
else:
|
||||
node = ScalarNode(None, unicode(data))
|
||||
#if alias_key is not None:
|
||||
# self.represented_objects[alias_key] = node
|
||||
return node
|
||||
|
||||
def add_representer(cls, data_type, representer):
|
||||
if not 'yaml_representers' in cls.__dict__:
|
||||
cls.yaml_representers = cls.yaml_representers.copy()
|
||||
cls.yaml_representers[data_type] = representer
|
||||
add_representer = classmethod(add_representer)
|
||||
|
||||
def add_multi_representer(cls, data_type, representer):
|
||||
if not 'yaml_multi_representers' in cls.__dict__:
|
||||
cls.yaml_multi_representers = cls.yaml_multi_representers.copy()
|
||||
cls.yaml_multi_representers[data_type] = representer
|
||||
add_multi_representer = classmethod(add_multi_representer)
|
||||
|
||||
def represent_scalar(self, tag, value, style=None):
|
||||
if style is None:
|
||||
style = self.default_style
|
||||
node = ScalarNode(tag, value, style=style)
|
||||
if self.alias_key is not None:
|
||||
self.represented_objects[self.alias_key] = node
|
||||
return node
|
||||
|
||||
def represent_sequence(self, tag, sequence, flow_style=None):
|
||||
value = []
|
||||
node = SequenceNode(tag, value, flow_style=flow_style)
|
||||
if self.alias_key is not None:
|
||||
self.represented_objects[self.alias_key] = node
|
||||
best_style = True
|
||||
for item in sequence:
|
||||
node_item = self.represent_data(item)
|
||||
if not (isinstance(node_item, ScalarNode) and not node_item.style):
|
||||
best_style = False
|
||||
value.append(node_item)
|
||||
if flow_style is None:
|
||||
if self.default_flow_style is not None:
|
||||
node.flow_style = self.default_flow_style
|
||||
else:
|
||||
node.flow_style = best_style
|
||||
return node
|
||||
|
||||
def represent_mapping(self, tag, mapping, flow_style=None):
|
||||
value = []
|
||||
node = MappingNode(tag, value, flow_style=flow_style)
|
||||
if self.alias_key is not None:
|
||||
self.represented_objects[self.alias_key] = node
|
||||
best_style = True
|
||||
if hasattr(mapping, 'items'):
|
||||
mapping = mapping.items()
|
||||
if self.sort_keys:
|
||||
mapping.sort()
|
||||
for item_key, item_value in mapping:
|
||||
node_key = self.represent_data(item_key)
|
||||
node_value = self.represent_data(item_value)
|
||||
if not (isinstance(node_key, ScalarNode) and not node_key.style):
|
||||
best_style = False
|
||||
if not (isinstance(node_value, ScalarNode) and not node_value.style):
|
||||
best_style = False
|
||||
value.append((node_key, node_value))
|
||||
if flow_style is None:
|
||||
if self.default_flow_style is not None:
|
||||
node.flow_style = self.default_flow_style
|
||||
else:
|
||||
node.flow_style = best_style
|
||||
return node
|
||||
|
||||
def ignore_aliases(self, data):
|
||||
return False
|
||||
|
||||
class SafeRepresenter(BaseRepresenter):
|
||||
|
||||
def ignore_aliases(self, data):
|
||||
if data is None:
|
||||
return True
|
||||
if isinstance(data, tuple) and data == ():
|
||||
return True
|
||||
if isinstance(data, (str, unicode, bool, int, float)):
|
||||
return True
|
||||
|
||||
def represent_none(self, data):
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:null',
|
||||
u'null')
|
||||
|
||||
def represent_str(self, data):
|
||||
tag = None
|
||||
style = None
|
||||
try:
|
||||
data = unicode(data, 'ascii')
|
||||
tag = u'tag:yaml.org,2002:str'
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
data = unicode(data, 'utf-8')
|
||||
tag = u'tag:yaml.org,2002:str'
|
||||
except UnicodeDecodeError:
|
||||
data = data.encode('base64')
|
||||
tag = u'tag:yaml.org,2002:binary'
|
||||
style = '|'
|
||||
return self.represent_scalar(tag, data, style=style)
|
||||
|
||||
def represent_unicode(self, data):
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:str', data)
|
||||
|
||||
def represent_bool(self, data):
|
||||
if data:
|
||||
value = u'true'
|
||||
else:
|
||||
value = u'false'
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:bool', value)
|
||||
|
||||
def represent_int(self, data):
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:int', unicode(data))
|
||||
|
||||
def represent_long(self, data):
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:int', unicode(data))
|
||||
|
||||
inf_value = 1e300
|
||||
while repr(inf_value) != repr(inf_value*inf_value):
|
||||
inf_value *= inf_value
|
||||
|
||||
def represent_float(self, data):
|
||||
if data != data or (data == 0.0 and data == 1.0):
|
||||
value = u'.nan'
|
||||
elif data == self.inf_value:
|
||||
value = u'.inf'
|
||||
elif data == -self.inf_value:
|
||||
value = u'-.inf'
|
||||
else:
|
||||
value = unicode(repr(data)).lower()
|
||||
# Note that in some cases `repr(data)` represents a float number
|
||||
# without the decimal parts. For instance:
|
||||
# >>> repr(1e17)
|
||||
# '1e17'
|
||||
# Unfortunately, this is not a valid float representation according
|
||||
# to the definition of the `!!float` tag. We fix this by adding
|
||||
# '.0' before the 'e' symbol.
|
||||
if u'.' not in value and u'e' in value:
|
||||
value = value.replace(u'e', u'.0e', 1)
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:float', value)
|
||||
|
||||
def represent_list(self, data):
|
||||
#pairs = (len(data) > 0 and isinstance(data, list))
|
||||
#if pairs:
|
||||
# for item in data:
|
||||
# if not isinstance(item, tuple) or len(item) != 2:
|
||||
# pairs = False
|
||||
# break
|
||||
#if not pairs:
|
||||
return self.represent_sequence(u'tag:yaml.org,2002:seq', data)
|
||||
#value = []
|
||||
#for item_key, item_value in data:
|
||||
# value.append(self.represent_mapping(u'tag:yaml.org,2002:map',
|
||||
# [(item_key, item_value)]))
|
||||
#return SequenceNode(u'tag:yaml.org,2002:pairs', value)
|
||||
|
||||
def represent_dict(self, data):
|
||||
return self.represent_mapping(u'tag:yaml.org,2002:map', data)
|
||||
|
||||
def represent_set(self, data):
|
||||
value = {}
|
||||
for key in data:
|
||||
value[key] = None
|
||||
return self.represent_mapping(u'tag:yaml.org,2002:set', value)
|
||||
|
||||
def represent_date(self, data):
|
||||
value = unicode(data.isoformat())
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:timestamp', value)
|
||||
|
||||
def represent_datetime(self, data):
|
||||
value = unicode(data.isoformat(' '))
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:timestamp', value)
|
||||
|
||||
def represent_yaml_object(self, tag, data, cls, flow_style=None):
|
||||
if hasattr(data, '__getstate__'):
|
||||
state = data.__getstate__()
|
||||
else:
|
||||
state = data.__dict__.copy()
|
||||
return self.represent_mapping(tag, state, flow_style=flow_style)
|
||||
|
||||
def represent_undefined(self, data):
|
||||
raise RepresenterError("cannot represent an object", data)
|
||||
|
||||
SafeRepresenter.add_representer(type(None),
|
||||
SafeRepresenter.represent_none)
|
||||
|
||||
SafeRepresenter.add_representer(str,
|
||||
SafeRepresenter.represent_str)
|
||||
|
||||
SafeRepresenter.add_representer(unicode,
|
||||
SafeRepresenter.represent_unicode)
|
||||
|
||||
SafeRepresenter.add_representer(bool,
|
||||
SafeRepresenter.represent_bool)
|
||||
|
||||
SafeRepresenter.add_representer(int,
|
||||
SafeRepresenter.represent_int)
|
||||
|
||||
SafeRepresenter.add_representer(long,
|
||||
SafeRepresenter.represent_long)
|
||||
|
||||
SafeRepresenter.add_representer(float,
|
||||
SafeRepresenter.represent_float)
|
||||
|
||||
SafeRepresenter.add_representer(list,
|
||||
SafeRepresenter.represent_list)
|
||||
|
||||
SafeRepresenter.add_representer(tuple,
|
||||
SafeRepresenter.represent_list)
|
||||
|
||||
SafeRepresenter.add_representer(dict,
|
||||
SafeRepresenter.represent_dict)
|
||||
|
||||
SafeRepresenter.add_representer(set,
|
||||
SafeRepresenter.represent_set)
|
||||
|
||||
SafeRepresenter.add_representer(datetime.date,
|
||||
SafeRepresenter.represent_date)
|
||||
|
||||
SafeRepresenter.add_representer(datetime.datetime,
|
||||
SafeRepresenter.represent_datetime)
|
||||
|
||||
SafeRepresenter.add_representer(None,
|
||||
SafeRepresenter.represent_undefined)
|
||||
|
||||
class Representer(SafeRepresenter):
|
||||
|
||||
def represent_str(self, data):
|
||||
tag = None
|
||||
style = None
|
||||
try:
|
||||
data = unicode(data, 'ascii')
|
||||
tag = u'tag:yaml.org,2002:str'
|
||||
except UnicodeDecodeError:
|
||||
try:
|
||||
data = unicode(data, 'utf-8')
|
||||
tag = u'tag:yaml.org,2002:python/str'
|
||||
except UnicodeDecodeError:
|
||||
data = data.encode('base64')
|
||||
tag = u'tag:yaml.org,2002:binary'
|
||||
style = '|'
|
||||
return self.represent_scalar(tag, data, style=style)
|
||||
|
||||
def represent_unicode(self, data):
|
||||
tag = None
|
||||
try:
|
||||
data.encode('ascii')
|
||||
tag = u'tag:yaml.org,2002:python/unicode'
|
||||
except UnicodeEncodeError:
|
||||
tag = u'tag:yaml.org,2002:str'
|
||||
return self.represent_scalar(tag, data)
|
||||
|
||||
def represent_long(self, data):
|
||||
tag = u'tag:yaml.org,2002:int'
|
||||
if int(data) is not data:
|
||||
tag = u'tag:yaml.org,2002:python/long'
|
||||
return self.represent_scalar(tag, unicode(data))
|
||||
|
||||
def represent_complex(self, data):
|
||||
if data.imag == 0.0:
|
||||
data = u'%r' % data.real
|
||||
elif data.real == 0.0:
|
||||
data = u'%rj' % data.imag
|
||||
elif data.imag > 0:
|
||||
data = u'%r+%rj' % (data.real, data.imag)
|
||||
else:
|
||||
data = u'%r%rj' % (data.real, data.imag)
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:python/complex', data)
|
||||
|
||||
def represent_tuple(self, data):
|
||||
return self.represent_sequence(u'tag:yaml.org,2002:python/tuple', data)
|
||||
|
||||
def represent_name(self, data):
|
||||
name = u'%s.%s' % (data.__module__, data.__name__)
|
||||
return self.represent_scalar(u'tag:yaml.org,2002:python/name:'+name, u'')
|
||||
|
||||
def represent_module(self, data):
|
||||
return self.represent_scalar(
|
||||
u'tag:yaml.org,2002:python/module:'+data.__name__, u'')
|
||||
|
||||
def represent_instance(self, data):
|
||||
# For instances of classic classes, we use __getinitargs__ and
|
||||
# __getstate__ to serialize the data.
|
||||
|
||||
# If data.__getinitargs__ exists, the object must be reconstructed by
|
||||
# calling cls(**args), where args is a tuple returned by
|
||||
# __getinitargs__. Otherwise, the cls.__init__ method should never be
|
||||
# called and the class instance is created by instantiating a trivial
|
||||
# class and assigning to the instance's __class__ variable.
|
||||
|
||||
# If data.__getstate__ exists, it returns the state of the object.
|
||||
# Otherwise, the state of the object is data.__dict__.
|
||||
|
||||
# We produce either a !!python/object or !!python/object/new node.
|
||||
# If data.__getinitargs__ does not exist and state is a dictionary, we
|
||||
# produce a !!python/object node . Otherwise we produce a
|
||||
# !!python/object/new node.
|
||||
|
||||
cls = data.__class__
|
||||
class_name = u'%s.%s' % (cls.__module__, cls.__name__)
|
||||
args = None
|
||||
state = None
|
||||
if hasattr(data, '__getinitargs__'):
|
||||
args = list(data.__getinitargs__())
|
||||
if hasattr(data, '__getstate__'):
|
||||
state = data.__getstate__()
|
||||
else:
|
||||
state = data.__dict__
|
||||
if args is None and isinstance(state, dict):
|
||||
return self.represent_mapping(
|
||||
u'tag:yaml.org,2002:python/object:'+class_name, state)
|
||||
if isinstance(state, dict) and not state:
|
||||
return self.represent_sequence(
|
||||
u'tag:yaml.org,2002:python/object/new:'+class_name, args)
|
||||
value = {}
|
||||
if args:
|
||||
value['args'] = args
|
||||
value['state'] = state
|
||||
return self.represent_mapping(
|
||||
u'tag:yaml.org,2002:python/object/new:'+class_name, value)
|
||||
|
||||
def represent_object(self, data):
|
||||
# We use __reduce__ API to save the data. data.__reduce__ returns
|
||||
# a tuple of length 2-5:
|
||||
# (function, args, state, listitems, dictitems)
|
||||
|
||||
# For reconstructing, we calls function(*args), then set its state,
|
||||
# listitems, and dictitems if they are not None.
|
||||
|
||||
# A special case is when function.__name__ == '__newobj__'. In this
|
||||
# case we create the object with args[0].__new__(*args).
|
||||
|
||||
# Another special case is when __reduce__ returns a string - we don't
|
||||
# support it.
|
||||
|
||||
# We produce a !!python/object, !!python/object/new or
|
||||
# !!python/object/apply node.
|
||||
|
||||
cls = type(data)
|
||||
if cls in copy_reg.dispatch_table:
|
||||
reduce = copy_reg.dispatch_table[cls](data)
|
||||
elif hasattr(data, '__reduce_ex__'):
|
||||
reduce = data.__reduce_ex__(2)
|
||||
elif hasattr(data, '__reduce__'):
|
||||
reduce = data.__reduce__()
|
||||
else:
|
||||
raise RepresenterError("cannot represent an object", data)
|
||||
reduce = (list(reduce)+[None]*5)[:5]
|
||||
function, args, state, listitems, dictitems = reduce
|
||||
args = list(args)
|
||||
if state is None:
|
||||
state = {}
|
||||
if listitems is not None:
|
||||
listitems = list(listitems)
|
||||
if dictitems is not None:
|
||||
dictitems = dict(dictitems)
|
||||
if function.__name__ == '__newobj__':
|
||||
function = args[0]
|
||||
args = args[1:]
|
||||
tag = u'tag:yaml.org,2002:python/object/new:'
|
||||
newobj = True
|
||||
else:
|
||||
tag = u'tag:yaml.org,2002:python/object/apply:'
|
||||
newobj = False
|
||||
function_name = u'%s.%s' % (function.__module__, function.__name__)
|
||||
if not args and not listitems and not dictitems \
|
||||
and isinstance(state, dict) and newobj:
|
||||
return self.represent_mapping(
|
||||
u'tag:yaml.org,2002:python/object:'+function_name, state)
|
||||
if not listitems and not dictitems \
|
||||
and isinstance(state, dict) and not state:
|
||||
return self.represent_sequence(tag+function_name, args)
|
||||
value = {}
|
||||
if args:
|
||||
value['args'] = args
|
||||
if state or not isinstance(state, dict):
|
||||
value['state'] = state
|
||||
if listitems:
|
||||
value['listitems'] = listitems
|
||||
if dictitems:
|
||||
value['dictitems'] = dictitems
|
||||
return self.represent_mapping(tag+function_name, value)
|
||||
|
||||
Representer.add_representer(str,
|
||||
Representer.represent_str)
|
||||
|
||||
Representer.add_representer(unicode,
|
||||
Representer.represent_unicode)
|
||||
|
||||
Representer.add_representer(long,
|
||||
Representer.represent_long)
|
||||
|
||||
Representer.add_representer(complex,
|
||||
Representer.represent_complex)
|
||||
|
||||
Representer.add_representer(tuple,
|
||||
Representer.represent_tuple)
|
||||
|
||||
Representer.add_representer(type,
|
||||
Representer.represent_name)
|
||||
|
||||
Representer.add_representer(types.ClassType,
|
||||
Representer.represent_name)
|
||||
|
||||
Representer.add_representer(types.FunctionType,
|
||||
Representer.represent_name)
|
||||
|
||||
Representer.add_representer(types.BuiltinFunctionType,
|
||||
Representer.represent_name)
|
||||
|
||||
Representer.add_representer(types.ModuleType,
|
||||
Representer.represent_module)
|
||||
|
||||
Representer.add_multi_representer(types.InstanceType,
|
||||
Representer.represent_instance)
|
||||
|
||||
Representer.add_multi_representer(object,
|
||||
Representer.represent_object)
|
||||
|
||||
@@ -1,227 +0,0 @@
|
||||
|
||||
__all__ = ['BaseResolver', 'Resolver']
|
||||
|
||||
from error import *
|
||||
from nodes import *
|
||||
|
||||
import re
|
||||
|
||||
class ResolverError(YAMLError):
|
||||
pass
|
||||
|
||||
class BaseResolver(object):
|
||||
|
||||
DEFAULT_SCALAR_TAG = u'tag:yaml.org,2002:str'
|
||||
DEFAULT_SEQUENCE_TAG = u'tag:yaml.org,2002:seq'
|
||||
DEFAULT_MAPPING_TAG = u'tag:yaml.org,2002:map'
|
||||
|
||||
yaml_implicit_resolvers = {}
|
||||
yaml_path_resolvers = {}
|
||||
|
||||
def __init__(self):
|
||||
self.resolver_exact_paths = []
|
||||
self.resolver_prefix_paths = []
|
||||
|
||||
def add_implicit_resolver(cls, tag, regexp, first):
|
||||
if not 'yaml_implicit_resolvers' in cls.__dict__:
|
||||
implicit_resolvers = {}
|
||||
for key in cls.yaml_implicit_resolvers:
|
||||
implicit_resolvers[key] = cls.yaml_implicit_resolvers[key][:]
|
||||
cls.yaml_implicit_resolvers = implicit_resolvers
|
||||
if first is None:
|
||||
first = [None]
|
||||
for ch in first:
|
||||
cls.yaml_implicit_resolvers.setdefault(ch, []).append((tag, regexp))
|
||||
add_implicit_resolver = classmethod(add_implicit_resolver)
|
||||
|
||||
def add_path_resolver(cls, tag, path, kind=None):
|
||||
# Note: `add_path_resolver` is experimental. The API could be changed.
|
||||
# `new_path` is a pattern that is matched against the path from the
|
||||
# root to the node that is being considered. `node_path` elements are
|
||||
# tuples `(node_check, index_check)`. `node_check` is a node class:
|
||||
# `ScalarNode`, `SequenceNode`, `MappingNode` or `None`. `None`
|
||||
# matches any kind of a node. `index_check` could be `None`, a boolean
|
||||
# value, a string value, or a number. `None` and `False` match against
|
||||
# any _value_ of sequence and mapping nodes. `True` matches against
|
||||
# any _key_ of a mapping node. A string `index_check` matches against
|
||||
# a mapping value that corresponds to a scalar key which content is
|
||||
# equal to the `index_check` value. An integer `index_check` matches
|
||||
# against a sequence value with the index equal to `index_check`.
|
||||
if not 'yaml_path_resolvers' in cls.__dict__:
|
||||
cls.yaml_path_resolvers = cls.yaml_path_resolvers.copy()
|
||||
new_path = []
|
||||
for element in path:
|
||||
if isinstance(element, (list, tuple)):
|
||||
if len(element) == 2:
|
||||
node_check, index_check = element
|
||||
elif len(element) == 1:
|
||||
node_check = element[0]
|
||||
index_check = True
|
||||
else:
|
||||
raise ResolverError("Invalid path element: %s" % element)
|
||||
else:
|
||||
node_check = None
|
||||
index_check = element
|
||||
if node_check is str:
|
||||
node_check = ScalarNode
|
||||
elif node_check is list:
|
||||
node_check = SequenceNode
|
||||
elif node_check is dict:
|
||||
node_check = MappingNode
|
||||
elif node_check not in [ScalarNode, SequenceNode, MappingNode] \
|
||||
and not isinstance(node_check, basestring) \
|
||||
and node_check is not None:
|
||||
raise ResolverError("Invalid node checker: %s" % node_check)
|
||||
if not isinstance(index_check, (basestring, int)) \
|
||||
and index_check is not None:
|
||||
raise ResolverError("Invalid index checker: %s" % index_check)
|
||||
new_path.append((node_check, index_check))
|
||||
if kind is str:
|
||||
kind = ScalarNode
|
||||
elif kind is list:
|
||||
kind = SequenceNode
|
||||
elif kind is dict:
|
||||
kind = MappingNode
|
||||
elif kind not in [ScalarNode, SequenceNode, MappingNode] \
|
||||
and kind is not None:
|
||||
raise ResolverError("Invalid node kind: %s" % kind)
|
||||
cls.yaml_path_resolvers[tuple(new_path), kind] = tag
|
||||
add_path_resolver = classmethod(add_path_resolver)
|
||||
|
||||
def descend_resolver(self, current_node, current_index):
|
||||
if not self.yaml_path_resolvers:
|
||||
return
|
||||
exact_paths = {}
|
||||
prefix_paths = []
|
||||
if current_node:
|
||||
depth = len(self.resolver_prefix_paths)
|
||||
for path, kind in self.resolver_prefix_paths[-1]:
|
||||
if self.check_resolver_prefix(depth, path, kind,
|
||||
current_node, current_index):
|
||||
if len(path) > depth:
|
||||
prefix_paths.append((path, kind))
|
||||
else:
|
||||
exact_paths[kind] = self.yaml_path_resolvers[path, kind]
|
||||
else:
|
||||
for path, kind in self.yaml_path_resolvers:
|
||||
if not path:
|
||||
exact_paths[kind] = self.yaml_path_resolvers[path, kind]
|
||||
else:
|
||||
prefix_paths.append((path, kind))
|
||||
self.resolver_exact_paths.append(exact_paths)
|
||||
self.resolver_prefix_paths.append(prefix_paths)
|
||||
|
||||
def ascend_resolver(self):
|
||||
if not self.yaml_path_resolvers:
|
||||
return
|
||||
self.resolver_exact_paths.pop()
|
||||
self.resolver_prefix_paths.pop()
|
||||
|
||||
def check_resolver_prefix(self, depth, path, kind,
|
||||
current_node, current_index):
|
||||
node_check, index_check = path[depth-1]
|
||||
if isinstance(node_check, basestring):
|
||||
if current_node.tag != node_check:
|
||||
return
|
||||
elif node_check is not None:
|
||||
if not isinstance(current_node, node_check):
|
||||
return
|
||||
if index_check is True and current_index is not None:
|
||||
return
|
||||
if (index_check is False or index_check is None) \
|
||||
and current_index is None:
|
||||
return
|
||||
if isinstance(index_check, basestring):
|
||||
if not (isinstance(current_index, ScalarNode)
|
||||
and index_check == current_index.value):
|
||||
return
|
||||
elif isinstance(index_check, int) and not isinstance(index_check, bool):
|
||||
if index_check != current_index:
|
||||
return
|
||||
return True
|
||||
|
||||
def resolve(self, kind, value, implicit):
|
||||
if kind is ScalarNode and implicit[0]:
|
||||
if value == u'':
|
||||
resolvers = self.yaml_implicit_resolvers.get(u'', [])
|
||||
else:
|
||||
resolvers = self.yaml_implicit_resolvers.get(value[0], [])
|
||||
resolvers += self.yaml_implicit_resolvers.get(None, [])
|
||||
for tag, regexp in resolvers:
|
||||
if regexp.match(value):
|
||||
return tag
|
||||
implicit = implicit[1]
|
||||
if self.yaml_path_resolvers:
|
||||
exact_paths = self.resolver_exact_paths[-1]
|
||||
if kind in exact_paths:
|
||||
return exact_paths[kind]
|
||||
if None in exact_paths:
|
||||
return exact_paths[None]
|
||||
if kind is ScalarNode:
|
||||
return self.DEFAULT_SCALAR_TAG
|
||||
elif kind is SequenceNode:
|
||||
return self.DEFAULT_SEQUENCE_TAG
|
||||
elif kind is MappingNode:
|
||||
return self.DEFAULT_MAPPING_TAG
|
||||
|
||||
class Resolver(BaseResolver):
|
||||
pass
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:bool',
|
||||
re.compile(ur'''^(?:yes|Yes|YES|no|No|NO
|
||||
|true|True|TRUE|false|False|FALSE
|
||||
|on|On|ON|off|Off|OFF)$''', re.X),
|
||||
list(u'yYnNtTfFoO'))
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:float',
|
||||
re.compile(ur'''^(?:[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+][0-9]+)?
|
||||
|\.[0-9_]+(?:[eE][-+][0-9]+)?
|
||||
|[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\.[0-9_]*
|
||||
|[-+]?\.(?:inf|Inf|INF)
|
||||
|\.(?:nan|NaN|NAN))$''', re.X),
|
||||
list(u'-+0123456789.'))
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:int',
|
||||
re.compile(ur'''^(?:[-+]?0b[0-1_]+
|
||||
|[-+]?0[0-7_]+
|
||||
|[-+]?(?:0|[1-9][0-9_]*)
|
||||
|[-+]?0x[0-9a-fA-F_]+
|
||||
|[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X),
|
||||
list(u'-+0123456789'))
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:merge',
|
||||
re.compile(ur'^(?:<<)$'),
|
||||
[u'<'])
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:null',
|
||||
re.compile(ur'''^(?: ~
|
||||
|null|Null|NULL
|
||||
| )$''', re.X),
|
||||
[u'~', u'n', u'N', u''])
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:timestamp',
|
||||
re.compile(ur'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]
|
||||
|[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]?
|
||||
(?:[Tt]|[ \t]+)[0-9][0-9]?
|
||||
:[0-9][0-9] :[0-9][0-9] (?:\.[0-9]*)?
|
||||
(?:[ \t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X),
|
||||
list(u'0123456789'))
|
||||
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:value',
|
||||
re.compile(ur'^(?:=)$'),
|
||||
[u'='])
|
||||
|
||||
# The following resolver is only for documentation purposes. It cannot work
|
||||
# because plain scalars cannot start with '!', '&', or '*'.
|
||||
Resolver.add_implicit_resolver(
|
||||
u'tag:yaml.org,2002:yaml',
|
||||
re.compile(ur'^(?:!|&|\*)$'),
|
||||
list(u'!&*'))
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,111 +0,0 @@
|
||||
|
||||
__all__ = ['Serializer', 'SerializerError']
|
||||
|
||||
from error import YAMLError
|
||||
from events import *
|
||||
from nodes import *
|
||||
|
||||
class SerializerError(YAMLError):
|
||||
pass
|
||||
|
||||
class Serializer(object):
|
||||
|
||||
ANCHOR_TEMPLATE = u'id%03d'
|
||||
|
||||
def __init__(self, encoding=None,
|
||||
explicit_start=None, explicit_end=None, version=None, tags=None):
|
||||
self.use_encoding = encoding
|
||||
self.use_explicit_start = explicit_start
|
||||
self.use_explicit_end = explicit_end
|
||||
self.use_version = version
|
||||
self.use_tags = tags
|
||||
self.serialized_nodes = {}
|
||||
self.anchors = {}
|
||||
self.last_anchor_id = 0
|
||||
self.closed = None
|
||||
|
||||
def open(self):
|
||||
if self.closed is None:
|
||||
self.emit(StreamStartEvent(encoding=self.use_encoding))
|
||||
self.closed = False
|
||||
elif self.closed:
|
||||
raise SerializerError("serializer is closed")
|
||||
else:
|
||||
raise SerializerError("serializer is already opened")
|
||||
|
||||
def close(self):
|
||||
if self.closed is None:
|
||||
raise SerializerError("serializer is not opened")
|
||||
elif not self.closed:
|
||||
self.emit(StreamEndEvent())
|
||||
self.closed = True
|
||||
|
||||
#def __del__(self):
|
||||
# self.close()
|
||||
|
||||
def serialize(self, node):
|
||||
if self.closed is None:
|
||||
raise SerializerError("serializer is not opened")
|
||||
elif self.closed:
|
||||
raise SerializerError("serializer is closed")
|
||||
self.emit(DocumentStartEvent(explicit=self.use_explicit_start,
|
||||
version=self.use_version, tags=self.use_tags))
|
||||
self.anchor_node(node)
|
||||
self.serialize_node(node, None, None)
|
||||
self.emit(DocumentEndEvent(explicit=self.use_explicit_end))
|
||||
self.serialized_nodes = {}
|
||||
self.anchors = {}
|
||||
self.last_anchor_id = 0
|
||||
|
||||
def anchor_node(self, node):
|
||||
if node in self.anchors:
|
||||
if self.anchors[node] is None:
|
||||
self.anchors[node] = self.generate_anchor(node)
|
||||
else:
|
||||
self.anchors[node] = None
|
||||
if isinstance(node, SequenceNode):
|
||||
for item in node.value:
|
||||
self.anchor_node(item)
|
||||
elif isinstance(node, MappingNode):
|
||||
for key, value in node.value:
|
||||
self.anchor_node(key)
|
||||
self.anchor_node(value)
|
||||
|
||||
def generate_anchor(self, node):
|
||||
self.last_anchor_id += 1
|
||||
return self.ANCHOR_TEMPLATE % self.last_anchor_id
|
||||
|
||||
def serialize_node(self, node, parent, index):
|
||||
alias = self.anchors[node]
|
||||
if node in self.serialized_nodes:
|
||||
self.emit(AliasEvent(alias))
|
||||
else:
|
||||
self.serialized_nodes[node] = True
|
||||
self.descend_resolver(parent, index)
|
||||
if isinstance(node, ScalarNode):
|
||||
detected_tag = self.resolve(ScalarNode, node.value, (True, False))
|
||||
default_tag = self.resolve(ScalarNode, node.value, (False, True))
|
||||
implicit = (node.tag == detected_tag), (node.tag == default_tag)
|
||||
self.emit(ScalarEvent(alias, node.tag, implicit, node.value,
|
||||
style=node.style))
|
||||
elif isinstance(node, SequenceNode):
|
||||
implicit = (node.tag
|
||||
== self.resolve(SequenceNode, node.value, True))
|
||||
self.emit(SequenceStartEvent(alias, node.tag, implicit,
|
||||
flow_style=node.flow_style))
|
||||
index = 0
|
||||
for item in node.value:
|
||||
self.serialize_node(item, node, index)
|
||||
index += 1
|
||||
self.emit(SequenceEndEvent())
|
||||
elif isinstance(node, MappingNode):
|
||||
implicit = (node.tag
|
||||
== self.resolve(MappingNode, node.value, True))
|
||||
self.emit(MappingStartEvent(alias, node.tag, implicit,
|
||||
flow_style=node.flow_style))
|
||||
for key, value in node.value:
|
||||
self.serialize_node(key, node, None)
|
||||
self.serialize_node(value, node, key)
|
||||
self.emit(MappingEndEvent())
|
||||
self.ascend_resolver()
|
||||
|
||||
@@ -1,104 +0,0 @@
|
||||
|
||||
class Token(object):
|
||||
def __init__(self, start_mark, end_mark):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
def __repr__(self):
|
||||
attributes = [key for key in self.__dict__
|
||||
if not key.endswith('_mark')]
|
||||
attributes.sort()
|
||||
arguments = ', '.join(['%s=%r' % (key, getattr(self, key))
|
||||
for key in attributes])
|
||||
return '%s(%s)' % (self.__class__.__name__, arguments)
|
||||
|
||||
#class BOMToken(Token):
|
||||
# id = '<byte order mark>'
|
||||
|
||||
class DirectiveToken(Token):
|
||||
id = '<directive>'
|
||||
def __init__(self, name, value, start_mark, end_mark):
|
||||
self.name = name
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
|
||||
class DocumentStartToken(Token):
|
||||
id = '<document start>'
|
||||
|
||||
class DocumentEndToken(Token):
|
||||
id = '<document end>'
|
||||
|
||||
class StreamStartToken(Token):
|
||||
id = '<stream start>'
|
||||
def __init__(self, start_mark=None, end_mark=None,
|
||||
encoding=None):
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.encoding = encoding
|
||||
|
||||
class StreamEndToken(Token):
|
||||
id = '<stream end>'
|
||||
|
||||
class BlockSequenceStartToken(Token):
|
||||
id = '<block sequence start>'
|
||||
|
||||
class BlockMappingStartToken(Token):
|
||||
id = '<block mapping start>'
|
||||
|
||||
class BlockEndToken(Token):
|
||||
id = '<block end>'
|
||||
|
||||
class FlowSequenceStartToken(Token):
|
||||
id = '['
|
||||
|
||||
class FlowMappingStartToken(Token):
|
||||
id = '{'
|
||||
|
||||
class FlowSequenceEndToken(Token):
|
||||
id = ']'
|
||||
|
||||
class FlowMappingEndToken(Token):
|
||||
id = '}'
|
||||
|
||||
class KeyToken(Token):
|
||||
id = '?'
|
||||
|
||||
class ValueToken(Token):
|
||||
id = ':'
|
||||
|
||||
class BlockEntryToken(Token):
|
||||
id = '-'
|
||||
|
||||
class FlowEntryToken(Token):
|
||||
id = ','
|
||||
|
||||
class AliasToken(Token):
|
||||
id = '<alias>'
|
||||
def __init__(self, value, start_mark, end_mark):
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
|
||||
class AnchorToken(Token):
|
||||
id = '<anchor>'
|
||||
def __init__(self, value, start_mark, end_mark):
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
|
||||
class TagToken(Token):
|
||||
id = '<tag>'
|
||||
def __init__(self, value, start_mark, end_mark):
|
||||
self.value = value
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
|
||||
class ScalarToken(Token):
|
||||
id = '<scalar>'
|
||||
def __init__(self, value, plain, start_mark, end_mark, style=None):
|
||||
self.value = value
|
||||
self.plain = plain
|
||||
self.start_mark = start_mark
|
||||
self.end_mark = end_mark
|
||||
self.style = style
|
||||
|
||||
@@ -1,616 +0,0 @@
|
||||
"""Beautiful Soup
|
||||
Elixir and Tonic
|
||||
"The Screen-Scraper's Friend"
|
||||
http://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Beautiful Soup uses a pluggable XML or HTML parser to parse a
|
||||
(possibly invalid) document into a tree representation. Beautiful Soup
|
||||
provides methods and Pythonic idioms that make it easy to navigate,
|
||||
search, and modify the parse tree.
|
||||
|
||||
Beautiful Soup works with Python 2.7 and up. It works better if lxml
|
||||
and/or html5lib is installed.
|
||||
|
||||
For more than you ever wanted to know about Beautiful Soup, see the
|
||||
documentation:
|
||||
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
|
||||
"""
|
||||
|
||||
__author__ = "Leonard Richardson (leonardr@segfault.org)"
|
||||
__version__ = "4.8.0"
|
||||
__copyright__ = "Copyright (c) 2004-2019 Leonard Richardson"
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = ['BeautifulSoup']
|
||||
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import traceback
|
||||
import warnings
|
||||
|
||||
from .builder import builder_registry, ParserRejectedMarkup
|
||||
from .dammit import UnicodeDammit
|
||||
from .element import (
|
||||
CData,
|
||||
Comment,
|
||||
DEFAULT_OUTPUT_ENCODING,
|
||||
Declaration,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
PageElement,
|
||||
ProcessingInstruction,
|
||||
ResultSet,
|
||||
SoupStrainer,
|
||||
Tag,
|
||||
)
|
||||
|
||||
# The very first thing we do is give a useful error if someone is
|
||||
# running this code under Python 3 without converting it.
|
||||
'You are trying to run the Python 2 version of Beautiful Soup under Python 3. This will not work.'!='You need to convert the code, either by installing it (`python setup.py install`) or by running 2to3 (`2to3 -w bs4`).'
|
||||
|
||||
class BeautifulSoup(Tag):
|
||||
"""
|
||||
This class defines the basic interface called by the tree builders.
|
||||
|
||||
These methods will be called by the parser:
|
||||
reset()
|
||||
feed(markup)
|
||||
|
||||
The tree builder may call these methods from its feed() implementation:
|
||||
handle_starttag(name, attrs) # See note about return value
|
||||
handle_endtag(name)
|
||||
handle_data(data) # Appends to the current data node
|
||||
endData(containerClass=NavigableString) # Ends the current data node
|
||||
|
||||
No matter how complicated the underlying parser is, you should be
|
||||
able to build a tree using 'start tag' events, 'end tag' events,
|
||||
'data' events, and "done with data" events.
|
||||
|
||||
If you encounter an empty-element tag (aka a self-closing tag,
|
||||
like HTML's <br> tag), call handle_starttag and then
|
||||
handle_endtag.
|
||||
"""
|
||||
ROOT_TAG_NAME = '[document]'
|
||||
|
||||
# If the end-user gives no indication which tree builder they
|
||||
# want, look for one with these features.
|
||||
DEFAULT_BUILDER_FEATURES = ['html', 'fast']
|
||||
|
||||
ASCII_SPACES = '\x20\x0a\x09\x0c\x0d'
|
||||
|
||||
NO_PARSER_SPECIFIED_WARNING = "No parser was explicitly specified, so I'm using the best available %(markup_type)s parser for this system (\"%(parser)s\"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.\n\nThe code that caused this warning is on line %(line_number)s of the file %(filename)s. To get rid of this warning, pass the additional argument 'features=\"%(parser)s\"' to the BeautifulSoup constructor.\n"
|
||||
|
||||
def __init__(self, markup="", features=None, builder=None,
|
||||
parse_only=None, from_encoding=None, exclude_encodings=None,
|
||||
**kwargs):
|
||||
"""Constructor.
|
||||
|
||||
:param markup: A string or a file-like object representing
|
||||
markup to be parsed.
|
||||
|
||||
:param features: Desirable features of the parser to be used. This
|
||||
may be the name of a specific parser ("lxml", "lxml-xml",
|
||||
"html.parser", or "html5lib") or it may be the type of markup
|
||||
to be used ("html", "html5", "xml"). It's recommended that you
|
||||
name a specific parser, so that Beautiful Soup gives you the
|
||||
same results across platforms and virtual environments.
|
||||
|
||||
:param builder: A TreeBuilder subclass to instantiate (or
|
||||
instance to use) instead of looking one up based on
|
||||
`features`. You only need to use this if you've implemented a
|
||||
custom TreeBuilder.
|
||||
|
||||
:param parse_only: A SoupStrainer. Only parts of the document
|
||||
matching the SoupStrainer will be considered. This is useful
|
||||
when parsing part of a document that would otherwise be too
|
||||
large to fit into memory.
|
||||
|
||||
:param from_encoding: A string indicating the encoding of the
|
||||
document to be parsed. Pass this in if Beautiful Soup is
|
||||
guessing wrongly about the document's encoding.
|
||||
|
||||
:param exclude_encodings: A list of strings indicating
|
||||
encodings known to be wrong. Pass this in if you don't know
|
||||
the document's encoding but you know Beautiful Soup's guess is
|
||||
wrong.
|
||||
|
||||
:param kwargs: For backwards compatibility purposes, the
|
||||
constructor accepts certain keyword arguments used in
|
||||
Beautiful Soup 3. None of these arguments do anything in
|
||||
Beautiful Soup 4; they will result in a warning and then be ignored.
|
||||
|
||||
Apart from this, any keyword arguments passed into the BeautifulSoup
|
||||
constructor are propagated to the TreeBuilder constructor. This
|
||||
makes it possible to configure a TreeBuilder beyond saying
|
||||
which one to use.
|
||||
|
||||
"""
|
||||
|
||||
if 'convertEntities' in kwargs:
|
||||
del kwargs['convertEntities']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the convertEntities argument to the "
|
||||
"BeautifulSoup constructor. Entities are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'markupMassage' in kwargs:
|
||||
del kwargs['markupMassage']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the markupMassage argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for any necessary markup massage.")
|
||||
|
||||
if 'smartQuotesTo' in kwargs:
|
||||
del kwargs['smartQuotesTo']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the smartQuotesTo argument to the "
|
||||
"BeautifulSoup constructor. Smart quotes are always converted "
|
||||
"to Unicode characters.")
|
||||
|
||||
if 'selfClosingTags' in kwargs:
|
||||
del kwargs['selfClosingTags']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the selfClosingTags argument to the "
|
||||
"BeautifulSoup constructor. The tree builder is responsible "
|
||||
"for understanding self-closing tags.")
|
||||
|
||||
if 'isHTML' in kwargs:
|
||||
del kwargs['isHTML']
|
||||
warnings.warn(
|
||||
"BS4 does not respect the isHTML argument to the "
|
||||
"BeautifulSoup constructor. Suggest you use "
|
||||
"features='lxml' for HTML and features='lxml-xml' for "
|
||||
"XML.")
|
||||
|
||||
def deprecated_argument(old_name, new_name):
|
||||
if old_name in kwargs:
|
||||
warnings.warn(
|
||||
'The "%s" argument to the BeautifulSoup constructor '
|
||||
'has been renamed to "%s."' % (old_name, new_name))
|
||||
value = kwargs[old_name]
|
||||
del kwargs[old_name]
|
||||
return value
|
||||
return None
|
||||
|
||||
parse_only = parse_only or deprecated_argument(
|
||||
"parseOnlyThese", "parse_only")
|
||||
|
||||
from_encoding = from_encoding or deprecated_argument(
|
||||
"fromEncoding", "from_encoding")
|
||||
|
||||
if from_encoding and isinstance(markup, str):
|
||||
warnings.warn("You provided Unicode markup but also provided a value for from_encoding. Your from_encoding will be ignored.")
|
||||
from_encoding = None
|
||||
|
||||
# We need this information to track whether or not the builder
|
||||
# was specified well enough that we can omit the 'you need to
|
||||
# specify a parser' warning.
|
||||
original_builder = builder
|
||||
original_features = features
|
||||
|
||||
if isinstance(builder, type):
|
||||
# A builder class was passed in; it needs to be instantiated.
|
||||
builder_class = builder
|
||||
builder = None
|
||||
elif builder is None:
|
||||
if isinstance(features, str):
|
||||
features = [features]
|
||||
if features is None or len(features) == 0:
|
||||
features = self.DEFAULT_BUILDER_FEATURES
|
||||
builder_class = builder_registry.lookup(*features)
|
||||
if builder_class is None:
|
||||
raise FeatureNotFound(
|
||||
"Couldn't find a tree builder with the features you "
|
||||
"requested: %s. Do you need to install a parser library?"
|
||||
% ",".join(features))
|
||||
|
||||
# At this point either we have a TreeBuilder instance in
|
||||
# builder, or we have a builder_class that we can instantiate
|
||||
# with the remaining **kwargs.
|
||||
if builder is None:
|
||||
builder = builder_class(**kwargs)
|
||||
if not original_builder and not (
|
||||
original_features == builder.NAME or
|
||||
original_features in builder.ALTERNATE_NAMES
|
||||
):
|
||||
if builder.is_xml:
|
||||
markup_type = "XML"
|
||||
else:
|
||||
markup_type = "HTML"
|
||||
|
||||
# This code adapted from warnings.py so that we get the same line
|
||||
# of code as our warnings.warn() call gets, even if the answer is wrong
|
||||
# (as it may be in a multithreading situation).
|
||||
caller = None
|
||||
try:
|
||||
caller = sys._getframe(1)
|
||||
except ValueError:
|
||||
pass
|
||||
if caller:
|
||||
globals = caller.f_globals
|
||||
line_number = caller.f_lineno
|
||||
else:
|
||||
globals = sys.__dict__
|
||||
line_number= 1
|
||||
filename = globals.get('__file__')
|
||||
if filename:
|
||||
fnl = filename.lower()
|
||||
if fnl.endswith((".pyc", ".pyo")):
|
||||
filename = filename[:-1]
|
||||
if filename:
|
||||
# If there is no filename at all, the user is most likely in a REPL,
|
||||
# and the warning is not necessary.
|
||||
values = dict(
|
||||
filename=filename,
|
||||
line_number=line_number,
|
||||
parser=builder.NAME,
|
||||
markup_type=markup_type
|
||||
)
|
||||
warnings.warn(self.NO_PARSER_SPECIFIED_WARNING % values, stacklevel=2)
|
||||
else:
|
||||
if kwargs:
|
||||
warnings.warn("Keyword arguments to the BeautifulSoup constructor will be ignored. These would normally be passed into the TreeBuilder constructor, but a TreeBuilder instance was passed in as `builder`.")
|
||||
|
||||
self.builder = builder
|
||||
self.is_xml = builder.is_xml
|
||||
self.known_xml = self.is_xml
|
||||
self._namespaces = dict()
|
||||
self.parse_only = parse_only
|
||||
|
||||
self.builder.initialize_soup(self)
|
||||
|
||||
if hasattr(markup, 'read'): # It's a file-type object.
|
||||
markup = markup.read()
|
||||
elif len(markup) <= 256 and (
|
||||
(isinstance(markup, bytes) and not b'<' in markup)
|
||||
or (isinstance(markup, str) and not '<' in markup)
|
||||
):
|
||||
# Print out warnings for a couple beginner problems
|
||||
# involving passing non-markup to Beautiful Soup.
|
||||
# Beautiful Soup will still parse the input as markup,
|
||||
# just in case that's what the user really wants.
|
||||
if (isinstance(markup, str)
|
||||
and not os.path.supports_unicode_filenames):
|
||||
possible_filename = markup.encode("utf8")
|
||||
else:
|
||||
possible_filename = markup
|
||||
is_file = False
|
||||
try:
|
||||
is_file = os.path.exists(possible_filename)
|
||||
except Exception as e:
|
||||
# This is almost certainly a problem involving
|
||||
# characters not valid in filenames on this
|
||||
# system. Just let it go.
|
||||
pass
|
||||
if is_file:
|
||||
if isinstance(markup, str):
|
||||
markup = markup.encode("utf8")
|
||||
warnings.warn(
|
||||
'"%s" looks like a filename, not markup. You should'
|
||||
' probably open this file and pass the filehandle into'
|
||||
' Beautiful Soup.' % markup)
|
||||
self._check_markup_is_url(markup)
|
||||
|
||||
for (self.markup, self.original_encoding, self.declared_html_encoding,
|
||||
self.contains_replacement_characters) in (
|
||||
self.builder.prepare_markup(
|
||||
markup, from_encoding, exclude_encodings=exclude_encodings)):
|
||||
self.reset()
|
||||
try:
|
||||
self._feed()
|
||||
break
|
||||
except ParserRejectedMarkup:
|
||||
pass
|
||||
|
||||
# Clear out the markup and remove the builder's circular
|
||||
# reference to this object.
|
||||
self.markup = None
|
||||
self.builder.soup = None
|
||||
|
||||
def __copy__(self):
|
||||
copy = type(self)(
|
||||
self.encode('utf-8'), builder=self.builder, from_encoding='utf-8'
|
||||
)
|
||||
|
||||
# Although we encoded the tree to UTF-8, that may not have
|
||||
# been the encoding of the original markup. Set the copy's
|
||||
# .original_encoding to reflect the original object's
|
||||
# .original_encoding.
|
||||
copy.original_encoding = self.original_encoding
|
||||
return copy
|
||||
|
||||
def __getstate__(self):
|
||||
# Frequently a tree builder can't be pickled.
|
||||
d = dict(self.__dict__)
|
||||
if 'builder' in d and not self.builder.picklable:
|
||||
d['builder'] = None
|
||||
return d
|
||||
|
||||
@staticmethod
|
||||
def _check_markup_is_url(markup):
|
||||
"""
|
||||
Check if markup looks like it's actually a url and raise a warning
|
||||
if so. Markup can be unicode or str (py2) / bytes (py3).
|
||||
"""
|
||||
if isinstance(markup, bytes):
|
||||
space = b' '
|
||||
cant_start_with = (b"http:", b"https:")
|
||||
elif isinstance(markup, str):
|
||||
space = ' '
|
||||
cant_start_with = ("http:", "https:")
|
||||
else:
|
||||
return
|
||||
|
||||
if any(markup.startswith(prefix) for prefix in cant_start_with):
|
||||
if not space in markup:
|
||||
if isinstance(markup, bytes):
|
||||
decoded_markup = markup.decode('utf-8', 'replace')
|
||||
else:
|
||||
decoded_markup = markup
|
||||
warnings.warn(
|
||||
'"%s" looks like a URL. Beautiful Soup is not an'
|
||||
' HTTP client. You should probably use an HTTP client like'
|
||||
' requests to get the document behind the URL, and feed'
|
||||
' that document to Beautiful Soup.' % decoded_markup
|
||||
)
|
||||
|
||||
def _feed(self):
|
||||
# Convert the document to Unicode.
|
||||
self.builder.reset()
|
||||
|
||||
self.builder.feed(self.markup)
|
||||
# Close out any unfinished strings and close all the open tags.
|
||||
self.endData()
|
||||
while self.currentTag.name != self.ROOT_TAG_NAME:
|
||||
self.popTag()
|
||||
|
||||
def reset(self):
|
||||
Tag.__init__(self, self, self.builder, self.ROOT_TAG_NAME)
|
||||
self.hidden = 1
|
||||
self.builder.reset()
|
||||
self.current_data = []
|
||||
self.currentTag = None
|
||||
self.tagStack = []
|
||||
self.preserve_whitespace_tag_stack = []
|
||||
self.pushTag(self)
|
||||
|
||||
def new_tag(self, name, namespace=None, nsprefix=None, attrs={}, **kwattrs):
|
||||
"""Create a new tag associated with this soup."""
|
||||
kwattrs.update(attrs)
|
||||
return Tag(None, self.builder, name, namespace, nsprefix, kwattrs)
|
||||
|
||||
def new_string(self, s, subclass=NavigableString):
|
||||
"""Create a new NavigableString associated with this soup."""
|
||||
return subclass(s)
|
||||
|
||||
def insert_before(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_before().")
|
||||
|
||||
def insert_after(self, successor):
|
||||
raise NotImplementedError("BeautifulSoup objects don't support insert_after().")
|
||||
|
||||
def popTag(self):
|
||||
tag = self.tagStack.pop()
|
||||
if self.preserve_whitespace_tag_stack and tag == self.preserve_whitespace_tag_stack[-1]:
|
||||
self.preserve_whitespace_tag_stack.pop()
|
||||
#print "Pop", tag.name
|
||||
if self.tagStack:
|
||||
self.currentTag = self.tagStack[-1]
|
||||
return self.currentTag
|
||||
|
||||
def pushTag(self, tag):
|
||||
#print "Push", tag.name
|
||||
if self.currentTag is not None:
|
||||
self.currentTag.contents.append(tag)
|
||||
self.tagStack.append(tag)
|
||||
self.currentTag = self.tagStack[-1]
|
||||
if tag.name in self.builder.preserve_whitespace_tags:
|
||||
self.preserve_whitespace_tag_stack.append(tag)
|
||||
|
||||
def endData(self, containerClass=NavigableString):
|
||||
if self.current_data:
|
||||
current_data = ''.join(self.current_data)
|
||||
# If whitespace is not preserved, and this string contains
|
||||
# nothing but ASCII spaces, replace it with a single space
|
||||
# or newline.
|
||||
if not self.preserve_whitespace_tag_stack:
|
||||
strippable = True
|
||||
for i in current_data:
|
||||
if i not in self.ASCII_SPACES:
|
||||
strippable = False
|
||||
break
|
||||
if strippable:
|
||||
if '\n' in current_data:
|
||||
current_data = '\n'
|
||||
else:
|
||||
current_data = ' '
|
||||
|
||||
# Reset the data collector.
|
||||
self.current_data = []
|
||||
|
||||
# Should we add this string to the tree at all?
|
||||
if self.parse_only and len(self.tagStack) <= 1 and \
|
||||
(not self.parse_only.text or \
|
||||
not self.parse_only.search(current_data)):
|
||||
return
|
||||
|
||||
o = containerClass(current_data)
|
||||
self.object_was_parsed(o)
|
||||
|
||||
def object_was_parsed(self, o, parent=None, most_recent_element=None):
|
||||
"""Add an object to the parse tree."""
|
||||
if parent is None:
|
||||
parent = self.currentTag
|
||||
if most_recent_element is not None:
|
||||
previous_element = most_recent_element
|
||||
else:
|
||||
previous_element = self._most_recent_element
|
||||
|
||||
next_element = previous_sibling = next_sibling = None
|
||||
if isinstance(o, Tag):
|
||||
next_element = o.next_element
|
||||
next_sibling = o.next_sibling
|
||||
previous_sibling = o.previous_sibling
|
||||
if previous_element is None:
|
||||
previous_element = o.previous_element
|
||||
|
||||
fix = parent.next_element is not None
|
||||
|
||||
o.setup(parent, previous_element, next_element, previous_sibling, next_sibling)
|
||||
|
||||
self._most_recent_element = o
|
||||
parent.contents.append(o)
|
||||
|
||||
# Check if we are inserting into an already parsed node.
|
||||
if fix:
|
||||
self._linkage_fixer(parent)
|
||||
|
||||
def _linkage_fixer(self, el):
|
||||
"""Make sure linkage of this fragment is sound."""
|
||||
|
||||
first = el.contents[0]
|
||||
child = el.contents[-1]
|
||||
descendant = child
|
||||
|
||||
if child is first and el.parent is not None:
|
||||
# Parent should be linked to first child
|
||||
el.next_element = child
|
||||
# We are no longer linked to whatever this element is
|
||||
prev_el = child.previous_element
|
||||
if prev_el is not None and prev_el is not el:
|
||||
prev_el.next_element = None
|
||||
# First child should be linked to the parent, and no previous siblings.
|
||||
child.previous_element = el
|
||||
child.previous_sibling = None
|
||||
|
||||
# We have no sibling as we've been appended as the last.
|
||||
child.next_sibling = None
|
||||
|
||||
# This index is a tag, dig deeper for a "last descendant"
|
||||
if isinstance(child, Tag) and child.contents:
|
||||
descendant = child._last_descendant(False)
|
||||
|
||||
# As the final step, link last descendant. It should be linked
|
||||
# to the parent's next sibling (if found), else walk up the chain
|
||||
# and find a parent with a sibling. It should have no next sibling.
|
||||
descendant.next_element = None
|
||||
descendant.next_sibling = None
|
||||
target = el
|
||||
while True:
|
||||
if target is None:
|
||||
break
|
||||
elif target.next_sibling is not None:
|
||||
descendant.next_element = target.next_sibling
|
||||
target.next_sibling.previous_element = child
|
||||
break
|
||||
target = target.parent
|
||||
|
||||
def _popToTag(self, name, nsprefix=None, inclusivePop=True):
|
||||
"""Pops the tag stack up to and including the most recent
|
||||
instance of the given tag. If inclusivePop is false, pops the tag
|
||||
stack up to but *not* including the most recent instqance of
|
||||
the given tag."""
|
||||
#print "Popping to %s" % name
|
||||
if name == self.ROOT_TAG_NAME:
|
||||
# The BeautifulSoup object itself can never be popped.
|
||||
return
|
||||
|
||||
most_recently_popped = None
|
||||
|
||||
stack_size = len(self.tagStack)
|
||||
for i in range(stack_size - 1, 0, -1):
|
||||
t = self.tagStack[i]
|
||||
if (name == t.name and nsprefix == t.prefix):
|
||||
if inclusivePop:
|
||||
most_recently_popped = self.popTag()
|
||||
break
|
||||
most_recently_popped = self.popTag()
|
||||
|
||||
return most_recently_popped
|
||||
|
||||
def handle_starttag(self, name, namespace, nsprefix, attrs):
|
||||
"""Push a start tag on to the stack.
|
||||
|
||||
If this method returns None, the tag was rejected by the
|
||||
SoupStrainer. You should proceed as if the tag had not occurred
|
||||
in the document. For instance, if this was a self-closing tag,
|
||||
don't call handle_endtag.
|
||||
"""
|
||||
|
||||
# print "Start tag %s: %s" % (name, attrs)
|
||||
self.endData()
|
||||
|
||||
if (self.parse_only and len(self.tagStack) <= 1
|
||||
and (self.parse_only.text
|
||||
or not self.parse_only.search_tag(name, attrs))):
|
||||
return None
|
||||
|
||||
tag = Tag(self, self.builder, name, namespace, nsprefix, attrs,
|
||||
self.currentTag, self._most_recent_element)
|
||||
if tag is None:
|
||||
return tag
|
||||
if self._most_recent_element is not None:
|
||||
self._most_recent_element.next_element = tag
|
||||
self._most_recent_element = tag
|
||||
self.pushTag(tag)
|
||||
return tag
|
||||
|
||||
def handle_endtag(self, name, nsprefix=None):
|
||||
#print "End tag: " + name
|
||||
self.endData()
|
||||
self._popToTag(name, nsprefix)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.current_data.append(data)
|
||||
|
||||
def decode(self, pretty_print=False,
|
||||
eventual_encoding=DEFAULT_OUTPUT_ENCODING,
|
||||
formatter="minimal"):
|
||||
"""Returns a string or Unicode representation of this document.
|
||||
To get Unicode, pass None for encoding."""
|
||||
|
||||
if self.is_xml:
|
||||
# Print the XML declaration
|
||||
encoding_part = ''
|
||||
if eventual_encoding != None:
|
||||
encoding_part = ' encoding="%s"' % eventual_encoding
|
||||
prefix = '<?xml version="1.0"%s?>\n' % encoding_part
|
||||
else:
|
||||
prefix = ''
|
||||
if not pretty_print:
|
||||
indent_level = None
|
||||
else:
|
||||
indent_level = 0
|
||||
return prefix + super(BeautifulSoup, self).decode(
|
||||
indent_level, eventual_encoding, formatter)
|
||||
|
||||
# Alias to make it easier to type import: 'from bs4 import _soup'
|
||||
_s = BeautifulSoup
|
||||
_soup = BeautifulSoup
|
||||
|
||||
class BeautifulStoneSoup(BeautifulSoup):
|
||||
"""Deprecated interface to an XML parser."""
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
kwargs['features'] = 'xml'
|
||||
warnings.warn(
|
||||
'The BeautifulStoneSoup class is deprecated. Instead of using '
|
||||
'it, pass features="xml" into the BeautifulSoup constructor.')
|
||||
super(BeautifulStoneSoup, self).__init__(*args, **kwargs)
|
||||
|
||||
|
||||
class StopParsing(Exception):
|
||||
pass
|
||||
|
||||
class FeatureNotFound(ValueError):
|
||||
pass
|
||||
|
||||
|
||||
#By default, act as an HTML pretty-printer.
|
||||
if __name__ == '__main__':
|
||||
import sys
|
||||
soup = BeautifulSoup(sys.stdin)
|
||||
print(soup.prettify())
|
||||
@@ -1,367 +0,0 @@
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
from collections import defaultdict
|
||||
import itertools
|
||||
import sys
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
nonwhitespace_re
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
'HTMLTreeBuilder',
|
||||
'SAXTreeBuilder',
|
||||
'TreeBuilder',
|
||||
'TreeBuilderRegistry',
|
||||
]
|
||||
|
||||
# Some useful features for a TreeBuilder to have.
|
||||
FAST = 'fast'
|
||||
PERMISSIVE = 'permissive'
|
||||
STRICT = 'strict'
|
||||
XML = 'xml'
|
||||
HTML = 'html'
|
||||
HTML_5 = 'html5'
|
||||
|
||||
|
||||
class TreeBuilderRegistry(object):
|
||||
|
||||
def __init__(self):
|
||||
self.builders_for_feature = defaultdict(list)
|
||||
self.builders = []
|
||||
|
||||
def register(self, treebuilder_class):
|
||||
"""Register a treebuilder based on its advertised features."""
|
||||
for feature in treebuilder_class.features:
|
||||
self.builders_for_feature[feature].insert(0, treebuilder_class)
|
||||
self.builders.insert(0, treebuilder_class)
|
||||
|
||||
def lookup(self, *features):
|
||||
if len(self.builders) == 0:
|
||||
# There are no builders at all.
|
||||
return None
|
||||
|
||||
if len(features) == 0:
|
||||
# They didn't ask for any features. Give them the most
|
||||
# recently registered builder.
|
||||
return self.builders[0]
|
||||
|
||||
# Go down the list of features in order, and eliminate any builders
|
||||
# that don't match every feature.
|
||||
features = list(features)
|
||||
features.reverse()
|
||||
candidates = None
|
||||
candidate_set = None
|
||||
while len(features) > 0:
|
||||
feature = features.pop()
|
||||
we_have_the_feature = self.builders_for_feature.get(feature, [])
|
||||
if len(we_have_the_feature) > 0:
|
||||
if candidates is None:
|
||||
candidates = we_have_the_feature
|
||||
candidate_set = set(candidates)
|
||||
else:
|
||||
# Eliminate any candidates that don't have this feature.
|
||||
candidate_set = candidate_set.intersection(
|
||||
set(we_have_the_feature))
|
||||
|
||||
# The only valid candidates are the ones in candidate_set.
|
||||
# Go through the original list of candidates and pick the first one
|
||||
# that's in candidate_set.
|
||||
if candidate_set is None:
|
||||
return None
|
||||
for candidate in candidates:
|
||||
if candidate in candidate_set:
|
||||
return candidate
|
||||
return None
|
||||
|
||||
# The BeautifulSoup class will take feature lists from developers and use them
|
||||
# to look up builders in this registry.
|
||||
builder_registry = TreeBuilderRegistry()
|
||||
|
||||
class TreeBuilder(object):
|
||||
"""Turn a document into a Beautiful Soup object tree."""
|
||||
|
||||
NAME = "[Unknown tree builder]"
|
||||
ALTERNATE_NAMES = []
|
||||
features = []
|
||||
|
||||
is_xml = False
|
||||
picklable = False
|
||||
empty_element_tags = None # A tag will be considered an empty-element
|
||||
# tag when and only when it has no contents.
|
||||
|
||||
# A value for these tag/attribute combinations is a space- or
|
||||
# comma-separated list of CDATA, rather than a single CDATA.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = {}
|
||||
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set()
|
||||
|
||||
USE_DEFAULT = object()
|
||||
|
||||
def __init__(self, multi_valued_attributes=USE_DEFAULT, preserve_whitespace_tags=USE_DEFAULT):
|
||||
"""Constructor.
|
||||
|
||||
:param multi_valued_attributes: If this is set to None, the
|
||||
TreeBuilder will not turn any values for attributes like
|
||||
'class' into lists. Setting this do a dictionary will
|
||||
customize this behavior; look at DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||
for an example.
|
||||
|
||||
Internally, these are called "CDATA list attributes", but that
|
||||
probably doesn't make sense to an end-user, so the argument name
|
||||
is `multi_valued_attributes`.
|
||||
|
||||
:param preserve_whitespace_tags:
|
||||
"""
|
||||
self.soup = None
|
||||
if multi_valued_attributes is self.USE_DEFAULT:
|
||||
multi_valued_attributes = self.DEFAULT_CDATA_LIST_ATTRIBUTES
|
||||
self.cdata_list_attributes = multi_valued_attributes
|
||||
if preserve_whitespace_tags is self.USE_DEFAULT:
|
||||
preserve_whitespace_tags = self.DEFAULT_PRESERVE_WHITESPACE_TAGS
|
||||
self.preserve_whitespace_tags = preserve_whitespace_tags
|
||||
|
||||
def initialize_soup(self, soup):
|
||||
"""The BeautifulSoup object has been initialized and is now
|
||||
being associated with the TreeBuilder.
|
||||
"""
|
||||
self.soup = soup
|
||||
|
||||
def reset(self):
|
||||
pass
|
||||
|
||||
def can_be_empty_element(self, tag_name):
|
||||
"""Might a tag with this name be an empty-element tag?
|
||||
|
||||
The final markup may or may not actually present this tag as
|
||||
self-closing.
|
||||
|
||||
For instance: an HTMLBuilder does not consider a <p> tag to be
|
||||
an empty-element tag (it's not in
|
||||
HTMLBuilder.empty_element_tags). This means an empty <p> tag
|
||||
will be presented as "<p></p>", not "<p />".
|
||||
|
||||
The default implementation has no opinion about which tags are
|
||||
empty-element tags, so a tag will be presented as an
|
||||
empty-element tag if and only if it has no contents.
|
||||
"<foo></foo>" will become "<foo />", and "<foo>bar</foo>" will
|
||||
be left alone.
|
||||
"""
|
||||
if self.empty_element_tags is None:
|
||||
return True
|
||||
return tag_name in self.empty_element_tags
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None):
|
||||
return markup, None, None, False
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""Wrap an HTML fragment to make it look like a document.
|
||||
|
||||
Different parsers do this differently. For instance, lxml
|
||||
introduces an empty <head> tag, and html5lib
|
||||
doesn't. Abstracting this away lets us write simple tests
|
||||
which run HTML fragments through the parser and compare the
|
||||
results against other HTML fragments.
|
||||
|
||||
This method should not be used outside of tests.
|
||||
"""
|
||||
return fragment
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
return False
|
||||
|
||||
def _replace_cdata_list_attribute_values(self, tag_name, attrs):
|
||||
"""Replaces class="foo bar" with class=["foo", "bar"]
|
||||
|
||||
Modifies its input in place.
|
||||
"""
|
||||
if not attrs:
|
||||
return attrs
|
||||
if self.cdata_list_attributes:
|
||||
universal = self.cdata_list_attributes.get('*', [])
|
||||
tag_specific = self.cdata_list_attributes.get(
|
||||
tag_name.lower(), None)
|
||||
for attr in list(attrs.keys()):
|
||||
if attr in universal or (tag_specific and attr in tag_specific):
|
||||
# We have a "class"-type attribute whose string
|
||||
# value is a whitespace-separated list of
|
||||
# values. Split it into a list.
|
||||
value = attrs[attr]
|
||||
if isinstance(value, str):
|
||||
values = nonwhitespace_re.findall(value)
|
||||
else:
|
||||
# html5lib sometimes calls setAttributes twice
|
||||
# for the same tag when rearranging the parse
|
||||
# tree. On the second call the attribute value
|
||||
# here is already a list. If this happens,
|
||||
# leave the value alone rather than trying to
|
||||
# split it again.
|
||||
values = value
|
||||
attrs[attr] = values
|
||||
return attrs
|
||||
|
||||
class SAXTreeBuilder(TreeBuilder):
|
||||
"""A Beautiful Soup treebuilder that listens for SAX events."""
|
||||
|
||||
def feed(self, markup):
|
||||
raise NotImplementedError()
|
||||
|
||||
def close(self):
|
||||
pass
|
||||
|
||||
def startElement(self, name, attrs):
|
||||
attrs = dict((key[1], value) for key, value in list(attrs.items()))
|
||||
#print "Start %s, %r" % (name, attrs)
|
||||
self.soup.handle_starttag(name, attrs)
|
||||
|
||||
def endElement(self, name):
|
||||
#print "End %s" % name
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def startElementNS(self, nsTuple, nodeName, attrs):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.startElement(nodeName, attrs)
|
||||
|
||||
def endElementNS(self, nsTuple, nodeName):
|
||||
# Throw away (ns, nodeName) for now.
|
||||
self.endElement(nodeName)
|
||||
#handler.endElementNS((ns, node.nodeName), node.nodeName)
|
||||
|
||||
def startPrefixMapping(self, prefix, nodeValue):
|
||||
# Ignore the prefix for now.
|
||||
pass
|
||||
|
||||
def endPrefixMapping(self, prefix):
|
||||
# Ignore the prefix for now.
|
||||
# handler.endPrefixMapping(prefix)
|
||||
pass
|
||||
|
||||
def characters(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def startDocument(self):
|
||||
pass
|
||||
|
||||
def endDocument(self):
|
||||
pass
|
||||
|
||||
|
||||
class HTMLTreeBuilder(TreeBuilder):
|
||||
"""This TreeBuilder knows facts about HTML.
|
||||
|
||||
Such as which tags are empty-element tags.
|
||||
"""
|
||||
|
||||
empty_element_tags = set([
|
||||
# These are from HTML5.
|
||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||
|
||||
# These are from earlier versions of HTML and are removed in HTML5.
|
||||
'basefont', 'bgsound', 'command', 'frame', 'image', 'isindex', 'nextid', 'spacer'
|
||||
])
|
||||
|
||||
# The HTML standard defines these as block-level elements. Beautiful
|
||||
# Soup does not treat these elements differently from other elements,
|
||||
# but it may do so eventually, and this information is available if
|
||||
# you need to use it.
|
||||
block_elements = set(["address", "article", "aside", "blockquote", "canvas", "dd", "div", "dl", "dt", "fieldset", "figcaption", "figure", "footer", "form", "h1", "h2", "h3", "h4", "h5", "h6", "header", "hr", "li", "main", "nav", "noscript", "ol", "output", "p", "pre", "section", "table", "tfoot", "ul", "video"])
|
||||
|
||||
# The HTML standard defines these attributes as containing a
|
||||
# space-separated list of values, not a single value. That is,
|
||||
# class="foo bar" means that the 'class' attribute has two values,
|
||||
# 'foo' and 'bar', not the single value 'foo bar'. When we
|
||||
# encounter one of these attributes, we will parse its value into
|
||||
# a list of values if possible. Upon output, the list will be
|
||||
# converted back into a string.
|
||||
DEFAULT_CDATA_LIST_ATTRIBUTES = {
|
||||
"*" : ['class', 'accesskey', 'dropzone'],
|
||||
"a" : ['rel', 'rev'],
|
||||
"link" : ['rel', 'rev'],
|
||||
"td" : ["headers"],
|
||||
"th" : ["headers"],
|
||||
"td" : ["headers"],
|
||||
"form" : ["accept-charset"],
|
||||
"object" : ["archive"],
|
||||
|
||||
# These are HTML5 specific, as are *.accesskey and *.dropzone above.
|
||||
"area" : ["rel"],
|
||||
"icon" : ["sizes"],
|
||||
"iframe" : ["sandbox"],
|
||||
"output" : ["for"],
|
||||
}
|
||||
|
||||
DEFAULT_PRESERVE_WHITESPACE_TAGS = set(['pre', 'textarea'])
|
||||
|
||||
def set_up_substitutions(self, tag):
|
||||
# We are only interested in <meta> tags
|
||||
if tag.name != 'meta':
|
||||
return False
|
||||
|
||||
http_equiv = tag.get('http-equiv')
|
||||
content = tag.get('content')
|
||||
charset = tag.get('charset')
|
||||
|
||||
# We are interested in <meta> tags that say what encoding the
|
||||
# document was originally in. This means HTML 5-style <meta>
|
||||
# tags that provide the "charset" attribute. It also means
|
||||
# HTML 4-style <meta> tags that provide the "content"
|
||||
# attribute and have "http-equiv" set to "content-type".
|
||||
#
|
||||
# In both cases we will replace the value of the appropriate
|
||||
# attribute with a standin object that can take on any
|
||||
# encoding.
|
||||
meta_encoding = None
|
||||
if charset is not None:
|
||||
# HTML 5 style:
|
||||
# <meta charset="utf8">
|
||||
meta_encoding = charset
|
||||
tag['charset'] = CharsetMetaAttributeValue(charset)
|
||||
|
||||
elif (content is not None and http_equiv is not None
|
||||
and http_equiv.lower() == 'content-type'):
|
||||
# HTML 4 style:
|
||||
# <meta http-equiv="content-type" content="text/html; charset=utf8">
|
||||
tag['content'] = ContentMetaAttributeValue(content)
|
||||
|
||||
return (meta_encoding is not None)
|
||||
|
||||
def register_treebuilders_from(module):
|
||||
"""Copy TreeBuilders from the given module into this module."""
|
||||
# I'm fairly sure this is not the best way to do this.
|
||||
this_module = sys.modules['bs4.builder']
|
||||
for name in module.__all__:
|
||||
obj = getattr(module, name)
|
||||
|
||||
if issubclass(obj, TreeBuilder):
|
||||
setattr(this_module, name, obj)
|
||||
this_module.__all__.append(name)
|
||||
# Register the builder while we're at it.
|
||||
this_module.builder_registry.register(obj)
|
||||
|
||||
class ParserRejectedMarkup(Exception):
|
||||
pass
|
||||
|
||||
# Builders are registered in reverse order of priority, so that custom
|
||||
# builder registrations will take precedence. In general, we want lxml
|
||||
# to take precedence over html5lib, because it's faster. And we only
|
||||
# want to use HTMLParser as a last result.
|
||||
from . import _htmlparser
|
||||
register_treebuilders_from(_htmlparser)
|
||||
try:
|
||||
from . import _html5lib
|
||||
register_treebuilders_from(_html5lib)
|
||||
except ImportError:
|
||||
# They don't have html5lib installed.
|
||||
pass
|
||||
try:
|
||||
from . import _lxml
|
||||
register_treebuilders_from(_lxml)
|
||||
except ImportError:
|
||||
# They don't have lxml installed.
|
||||
pass
|
||||
@@ -1,426 +0,0 @@
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'HTML5TreeBuilder',
|
||||
]
|
||||
|
||||
import warnings
|
||||
import re
|
||||
from bs4.builder import (
|
||||
PERMISSIVE,
|
||||
HTML,
|
||||
HTML_5,
|
||||
HTMLTreeBuilder,
|
||||
)
|
||||
from bs4.element import (
|
||||
NamespacedAttribute,
|
||||
nonwhitespace_re,
|
||||
)
|
||||
import html5lib
|
||||
from html5lib.constants import (
|
||||
namespaces,
|
||||
prefixes,
|
||||
)
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NavigableString,
|
||||
Tag,
|
||||
)
|
||||
|
||||
try:
|
||||
# Pre-0.99999999
|
||||
from html5lib.treebuilders import _base as treebuilder_base
|
||||
new_html5lib = False
|
||||
except ImportError as e:
|
||||
# 0.99999999 and up
|
||||
from html5lib.treebuilders import base as treebuilder_base
|
||||
new_html5lib = True
|
||||
|
||||
class HTML5TreeBuilder(HTMLTreeBuilder):
|
||||
"""Use html5lib to build a tree."""
|
||||
|
||||
NAME = "html5lib"
|
||||
|
||||
features = [NAME, PERMISSIVE, HTML_5, HTML]
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
# Store the user-specified encoding for use later on.
|
||||
self.user_specified_encoding = user_specified_encoding
|
||||
|
||||
# document_declared_encoding and exclude_encodings aren't used
|
||||
# ATM because the html5lib TreeBuilder doesn't use
|
||||
# UnicodeDammit.
|
||||
if exclude_encodings:
|
||||
warnings.warn("You provided a value for exclude_encoding, but the html5lib tree builder doesn't support exclude_encoding.")
|
||||
yield (markup, None, None, False)
|
||||
|
||||
# These methods are defined by Beautiful Soup.
|
||||
def feed(self, markup):
|
||||
if self.soup.parse_only is not None:
|
||||
warnings.warn("You provided a value for parse_only, but the html5lib tree builder doesn't support parse_only. The entire document will be parsed.")
|
||||
parser = html5lib.HTMLParser(tree=self.create_treebuilder)
|
||||
|
||||
extra_kwargs = dict()
|
||||
if not isinstance(markup, str):
|
||||
if new_html5lib:
|
||||
extra_kwargs['override_encoding'] = self.user_specified_encoding
|
||||
else:
|
||||
extra_kwargs['encoding'] = self.user_specified_encoding
|
||||
doc = parser.parse(markup, **extra_kwargs)
|
||||
|
||||
# Set the character encoding detected by the tokenizer.
|
||||
if isinstance(markup, str):
|
||||
# We need to special-case this because html5lib sets
|
||||
# charEncoding to UTF-8 if it gets Unicode input.
|
||||
doc.original_encoding = None
|
||||
else:
|
||||
original_encoding = parser.tokenizer.stream.charEncoding[0]
|
||||
if not isinstance(original_encoding, str):
|
||||
# In 0.99999999 and up, the encoding is an html5lib
|
||||
# Encoding object. We want to use a string for compatibility
|
||||
# with other tree builders.
|
||||
original_encoding = original_encoding.name
|
||||
doc.original_encoding = original_encoding
|
||||
|
||||
def create_treebuilder(self, namespaceHTMLElements):
|
||||
self.underlying_builder = TreeBuilderForHtml5lib(
|
||||
namespaceHTMLElements, self.soup)
|
||||
return self.underlying_builder
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return '<html><head></head><body>%s</body></html>' % fragment
|
||||
|
||||
|
||||
class TreeBuilderForHtml5lib(treebuilder_base.TreeBuilder):
|
||||
|
||||
def __init__(self, namespaceHTMLElements, soup=None):
|
||||
if soup:
|
||||
self.soup = soup
|
||||
else:
|
||||
from bs4 import BeautifulSoup
|
||||
self.soup = BeautifulSoup("", "html.parser")
|
||||
super(TreeBuilderForHtml5lib, self).__init__(namespaceHTMLElements)
|
||||
|
||||
def documentClass(self):
|
||||
self.soup.reset()
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def insertDoctype(self, token):
|
||||
name = token["name"]
|
||||
publicId = token["publicId"]
|
||||
systemId = token["systemId"]
|
||||
|
||||
doctype = Doctype.for_name_and_ids(name, publicId, systemId)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def elementClass(self, name, namespace):
|
||||
tag = self.soup.new_tag(name, namespace)
|
||||
return Element(tag, self.soup, namespace)
|
||||
|
||||
def commentClass(self, data):
|
||||
return TextNode(Comment(data), self.soup)
|
||||
|
||||
def fragmentClass(self):
|
||||
from bs4 import BeautifulSoup
|
||||
self.soup = BeautifulSoup("", "html.parser")
|
||||
self.soup.name = "[document_fragment]"
|
||||
return Element(self.soup, self.soup, None)
|
||||
|
||||
def appendChild(self, node):
|
||||
# XXX This code is not covered by the BS4 tests.
|
||||
self.soup.append(node.element)
|
||||
|
||||
def getDocument(self):
|
||||
return self.soup
|
||||
|
||||
def getFragment(self):
|
||||
return treebuilder_base.TreeBuilder.getFragment(self).element
|
||||
|
||||
def testSerializer(self, element):
|
||||
from bs4 import BeautifulSoup
|
||||
rv = []
|
||||
doctype_re = re.compile(r'^(.*?)(?: PUBLIC "(.*?)"(?: "(.*?)")?| SYSTEM "(.*?)")?$')
|
||||
|
||||
def serializeElement(element, indent=0):
|
||||
if isinstance(element, BeautifulSoup):
|
||||
pass
|
||||
if isinstance(element, Doctype):
|
||||
m = doctype_re.match(element)
|
||||
if m:
|
||||
name = m.group(1)
|
||||
if m.lastindex > 1:
|
||||
publicId = m.group(2) or ""
|
||||
systemId = m.group(3) or m.group(4) or ""
|
||||
rv.append("""|%s<!DOCTYPE %s "%s" "%s">""" %
|
||||
(' ' * indent, name, publicId, systemId))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE %s>" % (' ' * indent, name))
|
||||
else:
|
||||
rv.append("|%s<!DOCTYPE >" % (' ' * indent,))
|
||||
elif isinstance(element, Comment):
|
||||
rv.append("|%s<!-- %s -->" % (' ' * indent, element))
|
||||
elif isinstance(element, NavigableString):
|
||||
rv.append("|%s\"%s\"" % (' ' * indent, element))
|
||||
else:
|
||||
if element.namespace:
|
||||
name = "%s %s" % (prefixes[element.namespace],
|
||||
element.name)
|
||||
else:
|
||||
name = element.name
|
||||
rv.append("|%s<%s>" % (' ' * indent, name))
|
||||
if element.attrs:
|
||||
attributes = []
|
||||
for name, value in list(element.attrs.items()):
|
||||
if isinstance(name, NamespacedAttribute):
|
||||
name = "%s %s" % (prefixes[name.namespace], name.name)
|
||||
if isinstance(value, list):
|
||||
value = " ".join(value)
|
||||
attributes.append((name, value))
|
||||
|
||||
for name, value in sorted(attributes):
|
||||
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
|
||||
indent += 2
|
||||
for child in element.children:
|
||||
serializeElement(child, indent)
|
||||
serializeElement(element, 0)
|
||||
|
||||
return "\n".join(rv)
|
||||
|
||||
class AttrList(object):
|
||||
def __init__(self, element):
|
||||
self.element = element
|
||||
self.attrs = dict(self.element.attrs)
|
||||
def __iter__(self):
|
||||
return list(self.attrs.items()).__iter__()
|
||||
def __setitem__(self, name, value):
|
||||
# If this attribute is a multi-valued attribute for this element,
|
||||
# turn its value into a list.
|
||||
list_attr = self.element.cdata_list_attributes
|
||||
if (name in list_attr['*']
|
||||
or (self.element.name in list_attr
|
||||
and name in list_attr[self.element.name])):
|
||||
# A node that is being cloned may have already undergone
|
||||
# this procedure.
|
||||
if not isinstance(value, list):
|
||||
value = nonwhitespace_re.findall(value)
|
||||
self.element[name] = value
|
||||
def items(self):
|
||||
return list(self.attrs.items())
|
||||
def keys(self):
|
||||
return list(self.attrs.keys())
|
||||
def __len__(self):
|
||||
return len(self.attrs)
|
||||
def __getitem__(self, name):
|
||||
return self.attrs[name]
|
||||
def __contains__(self, name):
|
||||
return name in list(self.attrs.keys())
|
||||
|
||||
|
||||
class Element(treebuilder_base.Node):
|
||||
def __init__(self, element, soup, namespace):
|
||||
treebuilder_base.Node.__init__(self, element.name)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
self.namespace = namespace
|
||||
|
||||
def appendChild(self, node):
|
||||
string_child = child = None
|
||||
if isinstance(node, str):
|
||||
# Some other piece of code decided to pass in a string
|
||||
# instead of creating a TextElement object to contain the
|
||||
# string.
|
||||
string_child = child = node
|
||||
elif isinstance(node, Tag):
|
||||
# Some other piece of code decided to pass in a Tag
|
||||
# instead of creating an Element object to contain the
|
||||
# Tag.
|
||||
child = node
|
||||
elif node.element.__class__ == NavigableString:
|
||||
string_child = child = node.element
|
||||
node.parent = self
|
||||
else:
|
||||
child = node.element
|
||||
node.parent = self
|
||||
|
||||
if not isinstance(child, str) and child.parent is not None:
|
||||
node.element.extract()
|
||||
|
||||
if (string_child is not None and self.element.contents
|
||||
and self.element.contents[-1].__class__ == NavigableString):
|
||||
# We are appending a string onto another string.
|
||||
# TODO This has O(n^2) performance, for input like
|
||||
# "a</a>a</a>a</a>..."
|
||||
old_element = self.element.contents[-1]
|
||||
new_element = self.soup.new_string(old_element + string_child)
|
||||
old_element.replace_with(new_element)
|
||||
self.soup._most_recent_element = new_element
|
||||
else:
|
||||
if isinstance(node, str):
|
||||
# Create a brand new NavigableString from this string.
|
||||
child = self.soup.new_string(node)
|
||||
|
||||
# Tell Beautiful Soup to act as if it parsed this element
|
||||
# immediately after the parent's last descendant. (Or
|
||||
# immediately after the parent, if it has no children.)
|
||||
if self.element.contents:
|
||||
most_recent_element = self.element._last_descendant(False)
|
||||
elif self.element.next_element is not None:
|
||||
# Something from further ahead in the parse tree is
|
||||
# being inserted into this earlier element. This is
|
||||
# very annoying because it means an expensive search
|
||||
# for the last element in the tree.
|
||||
most_recent_element = self.soup._last_descendant()
|
||||
else:
|
||||
most_recent_element = self.element
|
||||
|
||||
self.soup.object_was_parsed(
|
||||
child, parent=self.element,
|
||||
most_recent_element=most_recent_element)
|
||||
|
||||
def getAttributes(self):
|
||||
if isinstance(self.element, Comment):
|
||||
return {}
|
||||
return AttrList(self.element)
|
||||
|
||||
def setAttributes(self, attributes):
|
||||
|
||||
if attributes is not None and len(attributes) > 0:
|
||||
|
||||
converted_attributes = []
|
||||
for name, value in list(attributes.items()):
|
||||
if isinstance(name, tuple):
|
||||
new_name = NamespacedAttribute(*name)
|
||||
del attributes[name]
|
||||
attributes[new_name] = value
|
||||
|
||||
self.soup.builder._replace_cdata_list_attribute_values(
|
||||
self.name, attributes)
|
||||
for name, value in list(attributes.items()):
|
||||
self.element[name] = value
|
||||
|
||||
# The attributes may contain variables that need substitution.
|
||||
# Call set_up_substitutions manually.
|
||||
#
|
||||
# The Tag constructor called this method when the Tag was created,
|
||||
# but we just set/changed the attributes, so call it again.
|
||||
self.soup.builder.set_up_substitutions(self.element)
|
||||
attributes = property(getAttributes, setAttributes)
|
||||
|
||||
def insertText(self, data, insertBefore=None):
|
||||
text = TextNode(self.soup.new_string(data), self.soup)
|
||||
if insertBefore:
|
||||
self.insertBefore(text, insertBefore)
|
||||
else:
|
||||
self.appendChild(text)
|
||||
|
||||
def insertBefore(self, node, refNode):
|
||||
index = self.element.index(refNode.element)
|
||||
if (node.element.__class__ == NavigableString and self.element.contents
|
||||
and self.element.contents[index-1].__class__ == NavigableString):
|
||||
# (See comments in appendChild)
|
||||
old_node = self.element.contents[index-1]
|
||||
new_str = self.soup.new_string(old_node + node.element)
|
||||
old_node.replace_with(new_str)
|
||||
else:
|
||||
self.element.insert(index, node.element)
|
||||
node.parent = self
|
||||
|
||||
def removeChild(self, node):
|
||||
node.element.extract()
|
||||
|
||||
def reparentChildren(self, new_parent):
|
||||
"""Move all of this tag's children into another tag."""
|
||||
# print "MOVE", self.element.contents
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent.element
|
||||
|
||||
element = self.element
|
||||
new_parent_element = new_parent.element
|
||||
# Determine what this tag's next_element will be once all the children
|
||||
# are removed.
|
||||
final_next_element = element.next_sibling
|
||||
|
||||
new_parents_last_descendant = new_parent_element._last_descendant(False, False)
|
||||
if len(new_parent_element.contents) > 0:
|
||||
# The new parent already contains children. We will be
|
||||
# appending this tag's children to the end.
|
||||
new_parents_last_child = new_parent_element.contents[-1]
|
||||
new_parents_last_descendant_next_element = new_parents_last_descendant.next_element
|
||||
else:
|
||||
# The new parent contains no children.
|
||||
new_parents_last_child = None
|
||||
new_parents_last_descendant_next_element = new_parent_element.next_element
|
||||
|
||||
to_append = element.contents
|
||||
if len(to_append) > 0:
|
||||
# Set the first child's previous_element and previous_sibling
|
||||
# to elements within the new parent
|
||||
first_child = to_append[0]
|
||||
if new_parents_last_descendant is not None:
|
||||
first_child.previous_element = new_parents_last_descendant
|
||||
else:
|
||||
first_child.previous_element = new_parent_element
|
||||
first_child.previous_sibling = new_parents_last_child
|
||||
if new_parents_last_descendant is not None:
|
||||
new_parents_last_descendant.next_element = first_child
|
||||
else:
|
||||
new_parent_element.next_element = first_child
|
||||
if new_parents_last_child is not None:
|
||||
new_parents_last_child.next_sibling = first_child
|
||||
|
||||
# Find the very last element being moved. It is now the
|
||||
# parent's last descendant. It has no .next_sibling and
|
||||
# its .next_element is whatever the previous last
|
||||
# descendant had.
|
||||
last_childs_last_descendant = to_append[-1]._last_descendant(False, True)
|
||||
|
||||
last_childs_last_descendant.next_element = new_parents_last_descendant_next_element
|
||||
if new_parents_last_descendant_next_element is not None:
|
||||
# TODO: This code has no test coverage and I'm not sure
|
||||
# how to get html5lib to go through this path, but it's
|
||||
# just the other side of the previous line.
|
||||
new_parents_last_descendant_next_element.previous_element = last_childs_last_descendant
|
||||
last_childs_last_descendant.next_sibling = None
|
||||
|
||||
for child in to_append:
|
||||
child.parent = new_parent_element
|
||||
new_parent_element.contents.append(child)
|
||||
|
||||
# Now that this element has no children, change its .next_element.
|
||||
element.contents = []
|
||||
element.next_element = final_next_element
|
||||
|
||||
# print "DONE WITH MOVE"
|
||||
# print "FROM", self.element
|
||||
# print "TO", new_parent_element
|
||||
|
||||
def cloneNode(self):
|
||||
tag = self.soup.new_tag(self.element.name, self.namespace)
|
||||
node = Element(tag, self.soup, self.namespace)
|
||||
for key,value in self.attributes:
|
||||
node.attributes[key] = value
|
||||
return node
|
||||
|
||||
def hasContent(self):
|
||||
return self.element.contents
|
||||
|
||||
def getNameTuple(self):
|
||||
if self.namespace == None:
|
||||
return namespaces["html"], self.name
|
||||
else:
|
||||
return self.namespace, self.name
|
||||
|
||||
nameTuple = property(getNameTuple)
|
||||
|
||||
class TextNode(Element):
|
||||
def __init__(self, element, soup):
|
||||
treebuilder_base.Node.__init__(self, None)
|
||||
self.element = element
|
||||
self.soup = soup
|
||||
|
||||
def cloneNode(self):
|
||||
raise NotImplementedError
|
||||
@@ -1,350 +0,0 @@
|
||||
# encoding: utf-8
|
||||
"""Use the HTMLParser library to parse HTML files that aren't too bad."""
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'HTMLParserTreeBuilder',
|
||||
]
|
||||
|
||||
from html.parser import HTMLParser
|
||||
|
||||
try:
|
||||
from html.parser import HTMLParseError
|
||||
except ImportError as e:
|
||||
# HTMLParseError is removed in Python 3.5. Since it can never be
|
||||
# thrown in 3.5, we can just define our own class as a placeholder.
|
||||
class HTMLParseError(Exception):
|
||||
pass
|
||||
|
||||
import sys
|
||||
import warnings
|
||||
|
||||
# Starting in Python 3.2, the HTMLParser constructor takes a 'strict'
|
||||
# argument, which we'd like to set to False. Unfortunately,
|
||||
# http://bugs.python.org/issue13273 makes strict=True a better bet
|
||||
# before Python 3.2.3.
|
||||
#
|
||||
# At the end of this file, we monkeypatch HTMLParser so that
|
||||
# strict=True works well on Python 3.2.2.
|
||||
major, minor, release = sys.version_info[:3]
|
||||
CONSTRUCTOR_TAKES_STRICT = major == 3 and minor == 2 and release >= 3
|
||||
CONSTRUCTOR_STRICT_IS_DEPRECATED = major == 3 and minor == 3
|
||||
CONSTRUCTOR_TAKES_CONVERT_CHARREFS = major == 3 and minor >= 4
|
||||
|
||||
|
||||
from bs4.element import (
|
||||
CData,
|
||||
Comment,
|
||||
Declaration,
|
||||
Doctype,
|
||||
ProcessingInstruction,
|
||||
)
|
||||
from bs4.dammit import EntitySubstitution, UnicodeDammit
|
||||
|
||||
from bs4.builder import (
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
STRICT,
|
||||
)
|
||||
|
||||
|
||||
HTMLPARSER = 'html.parser'
|
||||
|
||||
class BeautifulSoupHTMLParser(HTMLParser):
|
||||
|
||||
def __init__(self, *args, **kwargs):
|
||||
HTMLParser.__init__(self, *args, **kwargs)
|
||||
|
||||
# Keep a list of empty-element tags that were encountered
|
||||
# without an explicit closing tag. If we encounter a closing tag
|
||||
# of this type, we'll associate it with one of those entries.
|
||||
#
|
||||
# This isn't a stack because we don't care about the
|
||||
# order. It's a list of closing tags we've already handled and
|
||||
# will ignore, assuming they ever show up.
|
||||
self.already_closed_empty_element = []
|
||||
|
||||
def error(self, msg):
|
||||
"""In Python 3, HTMLParser subclasses must implement error(), although this
|
||||
requirement doesn't appear to be documented.
|
||||
|
||||
In Python 2, HTMLParser implements error() as raising an exception.
|
||||
|
||||
In any event, this method is called only on very strange markup and our best strategy
|
||||
is to pretend it didn't happen and keep going.
|
||||
"""
|
||||
warnings.warn(msg)
|
||||
|
||||
def handle_startendtag(self, name, attrs):
|
||||
# This is only called when the markup looks like
|
||||
# <tag/>.
|
||||
|
||||
# is_startend() tells handle_starttag not to close the tag
|
||||
# just because its name matches a known empty-element tag. We
|
||||
# know that this is an empty-element tag and we want to call
|
||||
# handle_endtag ourselves.
|
||||
tag = self.handle_starttag(name, attrs, handle_empty_element=False)
|
||||
self.handle_endtag(name)
|
||||
|
||||
def handle_starttag(self, name, attrs, handle_empty_element=True):
|
||||
# XXX namespace
|
||||
attr_dict = {}
|
||||
for key, value in attrs:
|
||||
# Change None attribute values to the empty string
|
||||
# for consistency with the other tree builders.
|
||||
if value is None:
|
||||
value = ''
|
||||
attr_dict[key] = value
|
||||
attrvalue = '""'
|
||||
#print "START", name
|
||||
tag = self.soup.handle_starttag(name, None, None, attr_dict)
|
||||
if tag and tag.is_empty_element and handle_empty_element:
|
||||
# Unlike other parsers, html.parser doesn't send separate end tag
|
||||
# events for empty-element tags. (It's handled in
|
||||
# handle_startendtag, but only if the original markup looked like
|
||||
# <tag/>.)
|
||||
#
|
||||
# So we need to call handle_endtag() ourselves. Since we
|
||||
# know the start event is identical to the end event, we
|
||||
# don't want handle_endtag() to cross off any previous end
|
||||
# events for tags of this name.
|
||||
self.handle_endtag(name, check_already_closed=False)
|
||||
|
||||
# But we might encounter an explicit closing tag for this tag
|
||||
# later on. If so, we want to ignore it.
|
||||
self.already_closed_empty_element.append(name)
|
||||
|
||||
def handle_endtag(self, name, check_already_closed=True):
|
||||
#print "END", name
|
||||
if check_already_closed and name in self.already_closed_empty_element:
|
||||
# This is a redundant end tag for an empty-element tag.
|
||||
# We've already called handle_endtag() for it, so just
|
||||
# check it off the list.
|
||||
# print "ALREADY CLOSED", name
|
||||
self.already_closed_empty_element.remove(name)
|
||||
else:
|
||||
self.soup.handle_endtag(name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self.soup.handle_data(data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
# XXX workaround for a bug in HTMLParser. Remove this once
|
||||
# it's fixed in all supported versions.
|
||||
# http://bugs.python.org/issue13633
|
||||
if name.startswith('x'):
|
||||
real_name = int(name.lstrip('x'), 16)
|
||||
elif name.startswith('X'):
|
||||
real_name = int(name.lstrip('X'), 16)
|
||||
else:
|
||||
real_name = int(name)
|
||||
|
||||
data = None
|
||||
if real_name < 256:
|
||||
# HTML numeric entities are supposed to reference Unicode
|
||||
# code points, but sometimes they reference code points in
|
||||
# some other encoding (ahem, Windows-1252). E.g. “
|
||||
# instead of É for LEFT DOUBLE QUOTATION MARK. This
|
||||
# code tries to detect this situation and compensate.
|
||||
for encoding in (self.soup.original_encoding, 'windows-1252'):
|
||||
if not encoding:
|
||||
continue
|
||||
try:
|
||||
data = bytearray([real_name]).decode(encoding)
|
||||
except UnicodeDecodeError as e:
|
||||
pass
|
||||
if not data:
|
||||
try:
|
||||
data = chr(real_name)
|
||||
except (ValueError, OverflowError) as e:
|
||||
pass
|
||||
data = data or "\N{REPLACEMENT CHARACTER}"
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
character = EntitySubstitution.HTML_ENTITY_TO_CHARACTER.get(name)
|
||||
if character is not None:
|
||||
data = character
|
||||
else:
|
||||
# If this were XML, it would be ambiguous whether "&foo"
|
||||
# was an character entity reference with a missing
|
||||
# semicolon or the literal string "&foo". Since this is
|
||||
# HTML, we have a complete list of all character entity references,
|
||||
# and this one wasn't found, so assume it's the literal string "&foo".
|
||||
data = "&%s" % name
|
||||
self.handle_data(data)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self.soup.endData()
|
||||
if data.startswith("DOCTYPE "):
|
||||
data = data[len("DOCTYPE "):]
|
||||
elif data == 'DOCTYPE':
|
||||
# i.e. "<!DOCTYPE>"
|
||||
data = ''
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(Doctype)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
if data.upper().startswith('CDATA['):
|
||||
cls = CData
|
||||
data = data[len('CDATA['):]
|
||||
else:
|
||||
cls = Declaration
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(cls)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(data)
|
||||
self.soup.endData(ProcessingInstruction)
|
||||
|
||||
|
||||
class HTMLParserTreeBuilder(HTMLTreeBuilder):
|
||||
|
||||
is_xml = False
|
||||
picklable = True
|
||||
NAME = HTMLPARSER
|
||||
features = [NAME, HTML, STRICT]
|
||||
|
||||
def __init__(self, parser_args=None, parser_kwargs=None, **kwargs):
|
||||
super(HTMLParserTreeBuilder, self).__init__(**kwargs)
|
||||
parser_args = parser_args or []
|
||||
parser_kwargs = parser_kwargs or {}
|
||||
if CONSTRUCTOR_TAKES_STRICT and not CONSTRUCTOR_STRICT_IS_DEPRECATED:
|
||||
parser_kwargs['strict'] = False
|
||||
if CONSTRUCTOR_TAKES_CONVERT_CHARREFS:
|
||||
parser_kwargs['convert_charrefs'] = False
|
||||
self.parser_args = (parser_args, parser_kwargs)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
document_declared_encoding=None, exclude_encodings=None):
|
||||
"""
|
||||
:return: A 4-tuple (markup, original encoding, encoding
|
||||
declared within markup, whether any characters had to be
|
||||
replaced with REPLACEMENT CHARACTER).
|
||||
"""
|
||||
if isinstance(markup, str):
|
||||
yield (markup, None, None, False)
|
||||
return
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
dammit = UnicodeDammit(markup, try_encodings, is_html=True,
|
||||
exclude_encodings=exclude_encodings)
|
||||
yield (dammit.markup, dammit.original_encoding,
|
||||
dammit.declared_html_encoding,
|
||||
dammit.contains_replacement_characters)
|
||||
|
||||
def feed(self, markup):
|
||||
args, kwargs = self.parser_args
|
||||
parser = BeautifulSoupHTMLParser(*args, **kwargs)
|
||||
parser.soup = self.soup
|
||||
try:
|
||||
parser.feed(markup)
|
||||
parser.close()
|
||||
except HTMLParseError as e:
|
||||
warnings.warn(RuntimeWarning(
|
||||
"Python's built-in HTMLParser cannot parse the given document. This is not a bug in Beautiful Soup. The best solution is to install an external parser (lxml or html5lib), and use Beautiful Soup with that parser. See http://www.crummy.com/software/BeautifulSoup/bs4/doc/#installing-a-parser for help."))
|
||||
raise e
|
||||
parser.already_closed_empty_element = []
|
||||
|
||||
# Patch 3.2 versions of HTMLParser earlier than 3.2.3 to use some
|
||||
# 3.2.3 code. This ensures they don't treat markup like <p></p> as a
|
||||
# string.
|
||||
#
|
||||
# XXX This code can be removed once most Python 3 users are on 3.2.3.
|
||||
if major == 3 and minor == 2 and not CONSTRUCTOR_TAKES_STRICT:
|
||||
import re
|
||||
attrfind_tolerant = re.compile(
|
||||
r'\s*((?<=[\'"\s])[^\s/>][^\s/=>]*)(\s*=+\s*'
|
||||
r'(\'[^\']*\'|"[^"]*"|(?![\'"])[^>\s]*))?')
|
||||
HTMLParserTreeBuilder.attrfind_tolerant = attrfind_tolerant
|
||||
|
||||
locatestarttagend = re.compile(r"""
|
||||
<[a-zA-Z][-.a-zA-Z0-9:_]* # tag name
|
||||
(?:\s+ # whitespace before attribute name
|
||||
(?:[a-zA-Z_][-.:a-zA-Z0-9_]* # attribute name
|
||||
(?:\s*=\s* # value indicator
|
||||
(?:'[^']*' # LITA-enclosed value
|
||||
|\"[^\"]*\" # LIT-enclosed value
|
||||
|[^'\">\s]+ # bare value
|
||||
)
|
||||
)?
|
||||
)
|
||||
)*
|
||||
\s* # trailing whitespace
|
||||
""", re.VERBOSE)
|
||||
BeautifulSoupHTMLParser.locatestarttagend = locatestarttagend
|
||||
|
||||
from html.parser import tagfind, attrfind
|
||||
|
||||
def parse_starttag(self, i):
|
||||
self.__starttag_text = None
|
||||
endpos = self.check_for_whole_start_tag(i)
|
||||
if endpos < 0:
|
||||
return endpos
|
||||
rawdata = self.rawdata
|
||||
self.__starttag_text = rawdata[i:endpos]
|
||||
|
||||
# Now parse the data between i+1 and j into a tag and attrs
|
||||
attrs = []
|
||||
match = tagfind.match(rawdata, i+1)
|
||||
assert match, 'unexpected call to parse_starttag()'
|
||||
k = match.end()
|
||||
self.lasttag = tag = rawdata[i+1:k].lower()
|
||||
while k < endpos:
|
||||
if self.strict:
|
||||
m = attrfind.match(rawdata, k)
|
||||
else:
|
||||
m = attrfind_tolerant.match(rawdata, k)
|
||||
if not m:
|
||||
break
|
||||
attrname, rest, attrvalue = m.group(1, 2, 3)
|
||||
if not rest:
|
||||
attrvalue = None
|
||||
elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
|
||||
attrvalue[:1] == '"' == attrvalue[-1:]:
|
||||
attrvalue = attrvalue[1:-1]
|
||||
if attrvalue:
|
||||
attrvalue = self.unescape(attrvalue)
|
||||
attrs.append((attrname.lower(), attrvalue))
|
||||
k = m.end()
|
||||
|
||||
end = rawdata[k:endpos].strip()
|
||||
if end not in (">", "/>"):
|
||||
lineno, offset = self.getpos()
|
||||
if "\n" in self.__starttag_text:
|
||||
lineno = lineno + self.__starttag_text.count("\n")
|
||||
offset = len(self.__starttag_text) \
|
||||
- self.__starttag_text.rfind("\n")
|
||||
else:
|
||||
offset = offset + len(self.__starttag_text)
|
||||
if self.strict:
|
||||
self.error("junk characters in start tag: %r"
|
||||
% (rawdata[k:endpos][:20],))
|
||||
self.handle_data(rawdata[i:endpos])
|
||||
return endpos
|
||||
if end.endswith('/>'):
|
||||
# XHTML-style empty tag: <span attr="value" />
|
||||
self.handle_startendtag(tag, attrs)
|
||||
else:
|
||||
self.handle_starttag(tag, attrs)
|
||||
if tag in self.CDATA_CONTENT_ELEMENTS:
|
||||
self.set_cdata_mode(tag)
|
||||
return endpos
|
||||
|
||||
def set_cdata_mode(self, elem):
|
||||
self.cdata_elem = elem.lower()
|
||||
self.interesting = re.compile(r'</\s*%s\s*>' % self.cdata_elem, re.I)
|
||||
|
||||
BeautifulSoupHTMLParser.parse_starttag = parse_starttag
|
||||
BeautifulSoupHTMLParser.set_cdata_mode = set_cdata_mode
|
||||
|
||||
CONSTRUCTOR_TAKES_STRICT = True
|
||||
@@ -1,296 +0,0 @@
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
__all__ = [
|
||||
'LXMLTreeBuilderForXML',
|
||||
'LXMLTreeBuilder',
|
||||
]
|
||||
|
||||
try:
|
||||
from collections.abc import Callable # Python 3.6
|
||||
except ImportError as e:
|
||||
from collections import Callable
|
||||
|
||||
from io import BytesIO
|
||||
from io import StringIO
|
||||
from lxml import etree
|
||||
from bs4.element import (
|
||||
Comment,
|
||||
Doctype,
|
||||
NamespacedAttribute,
|
||||
ProcessingInstruction,
|
||||
XMLProcessingInstruction,
|
||||
)
|
||||
from bs4.builder import (
|
||||
FAST,
|
||||
HTML,
|
||||
HTMLTreeBuilder,
|
||||
PERMISSIVE,
|
||||
ParserRejectedMarkup,
|
||||
TreeBuilder,
|
||||
XML)
|
||||
from bs4.dammit import EncodingDetector
|
||||
|
||||
LXML = 'lxml'
|
||||
|
||||
def _invert(d):
|
||||
"Invert a dictionary."
|
||||
return dict((v,k) for k, v in list(d.items()))
|
||||
|
||||
class LXMLTreeBuilderForXML(TreeBuilder):
|
||||
DEFAULT_PARSER_CLASS = etree.XMLParser
|
||||
|
||||
is_xml = True
|
||||
processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
NAME = "lxml-xml"
|
||||
ALTERNATE_NAMES = ["xml"]
|
||||
|
||||
# Well, it's permissive by XML parser standards.
|
||||
features = [NAME, LXML, XML, FAST, PERMISSIVE]
|
||||
|
||||
CHUNK_SIZE = 512
|
||||
|
||||
# This namespace mapping is specified in the XML Namespace
|
||||
# standard.
|
||||
DEFAULT_NSMAPS = dict(xml='http://www.w3.org/XML/1998/namespace')
|
||||
|
||||
DEFAULT_NSMAPS_INVERTED = _invert(DEFAULT_NSMAPS)
|
||||
|
||||
def initialize_soup(self, soup):
|
||||
"""Let the BeautifulSoup object know about the standard namespace
|
||||
mapping.
|
||||
"""
|
||||
super(LXMLTreeBuilderForXML, self).initialize_soup(soup)
|
||||
self._register_namespaces(self.DEFAULT_NSMAPS)
|
||||
|
||||
def _register_namespaces(self, mapping):
|
||||
"""Let the BeautifulSoup object know about namespaces encountered
|
||||
while parsing the document.
|
||||
|
||||
This might be useful later on when creating CSS selectors.
|
||||
"""
|
||||
for key, value in list(mapping.items()):
|
||||
if key and key not in self.soup._namespaces:
|
||||
# Let the BeautifulSoup object know about a new namespace.
|
||||
# If there are multiple namespaces defined with the same
|
||||
# prefix, the first one in the document takes precedence.
|
||||
self.soup._namespaces[key] = value
|
||||
|
||||
def default_parser(self, encoding):
|
||||
# This can either return a parser object or a class, which
|
||||
# will be instantiated with default arguments.
|
||||
if self._default_parser is not None:
|
||||
return self._default_parser
|
||||
return etree.XMLParser(
|
||||
target=self, strip_cdata=False, recover=True, encoding=encoding)
|
||||
|
||||
def parser_for(self, encoding):
|
||||
# Use the default parser.
|
||||
parser = self.default_parser(encoding)
|
||||
|
||||
if isinstance(parser, Callable):
|
||||
# Instantiate the parser with default arguments
|
||||
parser = parser(target=self, strip_cdata=False, encoding=encoding)
|
||||
return parser
|
||||
|
||||
def __init__(self, parser=None, empty_element_tags=None, **kwargs):
|
||||
# TODO: Issue a warning if parser is present but not a
|
||||
# callable, since that means there's no way to create new
|
||||
# parsers for different encodings.
|
||||
self._default_parser = parser
|
||||
if empty_element_tags is not None:
|
||||
self.empty_element_tags = set(empty_element_tags)
|
||||
self.soup = None
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
super(LXMLTreeBuilderForXML, self).__init__(**kwargs)
|
||||
|
||||
def _getNsTag(self, tag):
|
||||
# Split the namespace URL out of a fully-qualified lxml tag
|
||||
# name. Copied from lxml's src/lxml/sax.py.
|
||||
if tag[0] == '{':
|
||||
return tuple(tag[1:].split('}', 1))
|
||||
else:
|
||||
return (None, tag)
|
||||
|
||||
def prepare_markup(self, markup, user_specified_encoding=None,
|
||||
exclude_encodings=None,
|
||||
document_declared_encoding=None):
|
||||
"""
|
||||
:yield: A series of 4-tuples.
|
||||
(markup, encoding, declared encoding,
|
||||
has undergone character replacement)
|
||||
|
||||
Each 4-tuple represents a strategy for parsing the document.
|
||||
"""
|
||||
# Instead of using UnicodeDammit to convert the bytestring to
|
||||
# Unicode using different encodings, use EncodingDetector to
|
||||
# iterate over the encodings, and tell lxml to try to parse
|
||||
# the document as each one in turn.
|
||||
is_html = not self.is_xml
|
||||
if is_html:
|
||||
self.processing_instruction_class = ProcessingInstruction
|
||||
else:
|
||||
self.processing_instruction_class = XMLProcessingInstruction
|
||||
|
||||
if isinstance(markup, str):
|
||||
# We were given Unicode. Maybe lxml can parse Unicode on
|
||||
# this system?
|
||||
yield markup, None, document_declared_encoding, False
|
||||
|
||||
if isinstance(markup, str):
|
||||
# No, apparently not. Convert the Unicode to UTF-8 and
|
||||
# tell lxml to parse it as UTF-8.
|
||||
yield (markup.encode("utf8"), "utf8",
|
||||
document_declared_encoding, False)
|
||||
|
||||
try_encodings = [user_specified_encoding, document_declared_encoding]
|
||||
detector = EncodingDetector(
|
||||
markup, try_encodings, is_html, exclude_encodings)
|
||||
for encoding in detector.encodings:
|
||||
yield (detector.markup, encoding, document_declared_encoding, False)
|
||||
|
||||
def feed(self, markup):
|
||||
if isinstance(markup, bytes):
|
||||
markup = BytesIO(markup)
|
||||
elif isinstance(markup, str):
|
||||
markup = StringIO(markup)
|
||||
|
||||
# Call feed() at least once, even if the markup is empty,
|
||||
# or the parser won't be initialized.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
try:
|
||||
self.parser = self.parser_for(self.soup.original_encoding)
|
||||
self.parser.feed(data)
|
||||
while len(data) != 0:
|
||||
# Now call feed() on the rest of the data, chunk by chunk.
|
||||
data = markup.read(self.CHUNK_SIZE)
|
||||
if len(data) != 0:
|
||||
self.parser.feed(data)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
def close(self):
|
||||
self.nsmaps = [self.DEFAULT_NSMAPS_INVERTED]
|
||||
|
||||
def start(self, name, attrs, nsmap={}):
|
||||
# Make sure attrs is a mutable dict--lxml may send an immutable dictproxy.
|
||||
attrs = dict(attrs)
|
||||
nsprefix = None
|
||||
# Invert each namespace map as it comes in.
|
||||
if len(nsmap) == 0 and len(self.nsmaps) > 1:
|
||||
# There are no new namespaces for this tag, but
|
||||
# non-default namespaces are in play, so we need a
|
||||
# separate tag stack to know when they end.
|
||||
self.nsmaps.append(None)
|
||||
elif len(nsmap) > 0:
|
||||
# A new namespace mapping has come into play.
|
||||
|
||||
# First, Let the BeautifulSoup object know about it.
|
||||
self._register_namespaces(nsmap)
|
||||
|
||||
# Then, add it to our running list of inverted namespace
|
||||
# mappings.
|
||||
self.nsmaps.append(_invert(nsmap))
|
||||
|
||||
# Also treat the namespace mapping as a set of attributes on the
|
||||
# tag, so we can recreate it later.
|
||||
attrs = attrs.copy()
|
||||
for prefix, namespace in list(nsmap.items()):
|
||||
attribute = NamespacedAttribute(
|
||||
"xmlns", prefix, "http://www.w3.org/2000/xmlns/")
|
||||
attrs[attribute] = namespace
|
||||
|
||||
# Namespaces are in play. Find any attributes that came in
|
||||
# from lxml with namespaces attached to their names, and
|
||||
# turn then into NamespacedAttribute objects.
|
||||
new_attrs = {}
|
||||
for attr, value in list(attrs.items()):
|
||||
namespace, attr = self._getNsTag(attr)
|
||||
if namespace is None:
|
||||
new_attrs[attr] = value
|
||||
else:
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
attr = NamespacedAttribute(nsprefix, attr, namespace)
|
||||
new_attrs[attr] = value
|
||||
attrs = new_attrs
|
||||
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = self._prefix_for_namespace(namespace)
|
||||
self.soup.handle_starttag(name, namespace, nsprefix, attrs)
|
||||
|
||||
def _prefix_for_namespace(self, namespace):
|
||||
"""Find the currently active prefix for the given namespace."""
|
||||
if namespace is None:
|
||||
return None
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
return inverted_nsmap[namespace]
|
||||
return None
|
||||
|
||||
def end(self, name):
|
||||
self.soup.endData()
|
||||
completed_tag = self.soup.tagStack[-1]
|
||||
namespace, name = self._getNsTag(name)
|
||||
nsprefix = None
|
||||
if namespace is not None:
|
||||
for inverted_nsmap in reversed(self.nsmaps):
|
||||
if inverted_nsmap is not None and namespace in inverted_nsmap:
|
||||
nsprefix = inverted_nsmap[namespace]
|
||||
break
|
||||
self.soup.handle_endtag(name, nsprefix)
|
||||
if len(self.nsmaps) > 1:
|
||||
# This tag, or one of its parents, introduced a namespace
|
||||
# mapping, so pop it off the stack.
|
||||
self.nsmaps.pop()
|
||||
|
||||
def pi(self, target, data):
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(target + ' ' + data)
|
||||
self.soup.endData(self.processing_instruction_class)
|
||||
|
||||
def data(self, content):
|
||||
self.soup.handle_data(content)
|
||||
|
||||
def doctype(self, name, pubid, system):
|
||||
self.soup.endData()
|
||||
doctype = Doctype.for_name_and_ids(name, pubid, system)
|
||||
self.soup.object_was_parsed(doctype)
|
||||
|
||||
def comment(self, content):
|
||||
"Handle comments as Comment objects."
|
||||
self.soup.endData()
|
||||
self.soup.handle_data(content)
|
||||
self.soup.endData(Comment)
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return '<?xml version="1.0" encoding="utf-8"?>\n%s' % fragment
|
||||
|
||||
|
||||
class LXMLTreeBuilder(HTMLTreeBuilder, LXMLTreeBuilderForXML):
|
||||
|
||||
NAME = LXML
|
||||
ALTERNATE_NAMES = ["lxml-html"]
|
||||
|
||||
features = ALTERNATE_NAMES + [NAME, HTML, FAST, PERMISSIVE]
|
||||
is_xml = False
|
||||
processing_instruction_class = ProcessingInstruction
|
||||
|
||||
def default_parser(self, encoding):
|
||||
return etree.HTMLParser
|
||||
|
||||
def feed(self, markup):
|
||||
encoding = self.soup.original_encoding
|
||||
try:
|
||||
self.parser = self.parser_for(encoding)
|
||||
self.parser.feed(markup)
|
||||
self.parser.close()
|
||||
except (UnicodeDecodeError, LookupError, etree.ParserError) as e:
|
||||
raise ParserRejectedMarkup(str(e))
|
||||
|
||||
|
||||
def test_fragment_to_document(self, fragment):
|
||||
"""See `TreeBuilder`."""
|
||||
return '<html><body>%s</body></html>' % fragment
|
||||
@@ -1,850 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Beautiful Soup bonus library: Unicode, Dammit
|
||||
|
||||
This library converts a bytestream to Unicode through any means
|
||||
necessary. It is heavily based on code from Mark Pilgrim's Universal
|
||||
Feed Parser. It works best on XML and HTML, but it does not rewrite the
|
||||
XML or HTML to reflect a new encoding; that's the tree builder's job.
|
||||
"""
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import codecs
|
||||
from html.entities import codepoint2name
|
||||
import re
|
||||
import logging
|
||||
import string
|
||||
|
||||
# Import a library to autodetect character encodings.
|
||||
chardet_type = None
|
||||
try:
|
||||
# First try the fast C implementation.
|
||||
# PyPI package: cchardet
|
||||
import cchardet
|
||||
def chardet_dammit(s):
|
||||
return cchardet.detect(s)['encoding']
|
||||
except ImportError:
|
||||
try:
|
||||
# Fall back to the pure Python implementation
|
||||
# Debian package: python-chardet
|
||||
# PyPI package: chardet
|
||||
import chardet
|
||||
def chardet_dammit(s):
|
||||
return chardet.detect(s)['encoding']
|
||||
#import chardet.constants
|
||||
#chardet.constants._debug = 1
|
||||
except ImportError:
|
||||
# No chardet available.
|
||||
def chardet_dammit(s):
|
||||
return None
|
||||
|
||||
# Available from http://cjkpython.i18n.org/.
|
||||
try:
|
||||
import iconv_codec
|
||||
except ImportError:
|
||||
pass
|
||||
|
||||
xml_encoding_re = re.compile(
|
||||
'^<\\?.*encoding=[\'"](.*?)[\'"].*\\?>'.encode(), re.I)
|
||||
html_meta_re = re.compile(
|
||||
'<\\s*meta[^>]+charset\\s*=\\s*["\']?([^>]*?)[ /;\'">]'.encode(), re.I)
|
||||
|
||||
class EntitySubstitution(object):
|
||||
|
||||
"""Substitute XML or HTML entities for the corresponding characters."""
|
||||
|
||||
def _populate_class_variables():
|
||||
lookup = {}
|
||||
reverse_lookup = {}
|
||||
characters_for_re = []
|
||||
|
||||
# &apos is an XHTML entity and an HTML 5, but not an HTML 4
|
||||
# entity. We don't want to use it, but we want to recognize it on the way in.
|
||||
#
|
||||
# TODO: Ideally we would be able to recognize all HTML 5 named
|
||||
# entities, but that's a little tricky.
|
||||
extra = [(39, 'apos')]
|
||||
for codepoint, name in list(codepoint2name.items()) + extra:
|
||||
character = chr(codepoint)
|
||||
if codepoint not in (34, 39):
|
||||
# There's no point in turning the quotation mark into
|
||||
# " or the single quote into ', unless it
|
||||
# happens within an attribute value, which is handled
|
||||
# elsewhere.
|
||||
characters_for_re.append(character)
|
||||
lookup[character] = name
|
||||
# But we do want to recognize those entities on the way in and
|
||||
# convert them to Unicode characters.
|
||||
reverse_lookup[name] = character
|
||||
re_definition = "[%s]" % "".join(characters_for_re)
|
||||
return lookup, reverse_lookup, re.compile(re_definition)
|
||||
(CHARACTER_TO_HTML_ENTITY, HTML_ENTITY_TO_CHARACTER,
|
||||
CHARACTER_TO_HTML_ENTITY_RE) = _populate_class_variables()
|
||||
|
||||
CHARACTER_TO_XML_ENTITY = {
|
||||
"'": "apos",
|
||||
'"': "quot",
|
||||
"&": "amp",
|
||||
"<": "lt",
|
||||
">": "gt",
|
||||
}
|
||||
|
||||
BARE_AMPERSAND_OR_BRACKET = re.compile("([<>]|"
|
||||
"&(?!#\\d+;|#x[0-9a-fA-F]+;|\\w+;)"
|
||||
")")
|
||||
|
||||
AMPERSAND_OR_BRACKET = re.compile("([<>&])")
|
||||
|
||||
@classmethod
|
||||
def _substitute_html_entity(cls, matchobj):
|
||||
entity = cls.CHARACTER_TO_HTML_ENTITY.get(matchobj.group(0))
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def _substitute_xml_entity(cls, matchobj):
|
||||
"""Used with a regular expression to substitute the
|
||||
appropriate XML entity for an XML special character."""
|
||||
entity = cls.CHARACTER_TO_XML_ENTITY[matchobj.group(0)]
|
||||
return "&%s;" % entity
|
||||
|
||||
@classmethod
|
||||
def quoted_attribute_value(self, value):
|
||||
"""Make a value into a quoted XML attribute, possibly escaping it.
|
||||
|
||||
Most strings will be quoted using double quotes.
|
||||
|
||||
Bob's Bar -> "Bob's Bar"
|
||||
|
||||
If a string contains double quotes, it will be quoted using
|
||||
single quotes.
|
||||
|
||||
Welcome to "my bar" -> 'Welcome to "my bar"'
|
||||
|
||||
If a string contains both single and double quotes, the
|
||||
double quotes will be escaped, and the string will be quoted
|
||||
using double quotes.
|
||||
|
||||
Welcome to "Bob's Bar" -> "Welcome to "Bob's bar"
|
||||
"""
|
||||
quote_with = '"'
|
||||
if '"' in value:
|
||||
if "'" in value:
|
||||
# The string contains both single and double
|
||||
# quotes. Turn the double quotes into
|
||||
# entities. We quote the double quotes rather than
|
||||
# the single quotes because the entity name is
|
||||
# """ whether this is HTML or XML. If we
|
||||
# quoted the single quotes, we'd have to decide
|
||||
# between ' and &squot;.
|
||||
replace_with = """
|
||||
value = value.replace('"', replace_with)
|
||||
else:
|
||||
# There are double quotes but no single quotes.
|
||||
# We can use single quotes to quote the attribute.
|
||||
quote_with = "'"
|
||||
return quote_with + value + quote_with
|
||||
|
||||
@classmethod
|
||||
def substitute_xml(cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign
|
||||
will become <, the greater-than sign will become >,
|
||||
and any ampersands will become &. If you want ampersands
|
||||
that appear to be part of an entity definition to be left
|
||||
alone, use substitute_xml_containing_entities() instead.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets and ampersands.
|
||||
value = cls.AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_xml_containing_entities(
|
||||
cls, value, make_quoted_attribute=False):
|
||||
"""Substitute XML entities for special XML characters.
|
||||
|
||||
:param value: A string to be substituted. The less-than sign will
|
||||
become <, the greater-than sign will become >, and any
|
||||
ampersands that are not part of an entity defition will
|
||||
become &.
|
||||
|
||||
:param make_quoted_attribute: If True, then the string will be
|
||||
quoted, as befits an attribute value.
|
||||
"""
|
||||
# Escape angle brackets, and ampersands that aren't part of
|
||||
# entities.
|
||||
value = cls.BARE_AMPERSAND_OR_BRACKET.sub(
|
||||
cls._substitute_xml_entity, value)
|
||||
|
||||
if make_quoted_attribute:
|
||||
value = cls.quoted_attribute_value(value)
|
||||
return value
|
||||
|
||||
@classmethod
|
||||
def substitute_html(cls, s):
|
||||
"""Replace certain Unicode characters with named HTML entities.
|
||||
|
||||
This differs from data.encode(encoding, 'xmlcharrefreplace')
|
||||
in that the goal is to make the result more readable (to those
|
||||
with ASCII displays) rather than to recover from
|
||||
errors. There's absolutely nothing wrong with a UTF-8 string
|
||||
containg a LATIN SMALL LETTER E WITH ACUTE, but replacing that
|
||||
character with "é" will make it more readable to some
|
||||
people.
|
||||
"""
|
||||
return cls.CHARACTER_TO_HTML_ENTITY_RE.sub(
|
||||
cls._substitute_html_entity, s)
|
||||
|
||||
|
||||
class EncodingDetector:
|
||||
"""Suggests a number of possible encodings for a bytestring.
|
||||
|
||||
Order of precedence:
|
||||
|
||||
1. Encodings you specifically tell EncodingDetector to try first
|
||||
(the override_encodings argument to the constructor).
|
||||
|
||||
2. An encoding declared within the bytestring itself, either in an
|
||||
XML declaration (if the bytestring is to be interpreted as an XML
|
||||
document), or in a <meta> tag (if the bytestring is to be
|
||||
interpreted as an HTML document.)
|
||||
|
||||
3. An encoding detected through textual analysis by chardet,
|
||||
cchardet, or a similar external library.
|
||||
|
||||
4. UTF-8.
|
||||
|
||||
5. Windows-1252.
|
||||
"""
|
||||
def __init__(self, markup, override_encodings=None, is_html=False,
|
||||
exclude_encodings=None):
|
||||
self.override_encodings = override_encodings or []
|
||||
exclude_encodings = exclude_encodings or []
|
||||
self.exclude_encodings = set([x.lower() for x in exclude_encodings])
|
||||
self.chardet_encoding = None
|
||||
self.is_html = is_html
|
||||
self.declared_encoding = None
|
||||
|
||||
# First order of business: strip a byte-order mark.
|
||||
self.markup, self.sniffed_encoding = self.strip_byte_order_mark(markup)
|
||||
|
||||
def _usable(self, encoding, tried):
|
||||
if encoding is not None:
|
||||
encoding = encoding.lower()
|
||||
if encoding in self.exclude_encodings:
|
||||
return False
|
||||
if encoding not in tried:
|
||||
tried.add(encoding)
|
||||
return True
|
||||
return False
|
||||
|
||||
@property
|
||||
def encodings(self):
|
||||
"""Yield a number of encodings that might work for this markup."""
|
||||
tried = set()
|
||||
for e in self.override_encodings:
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
# Did the document originally start with a byte-order mark
|
||||
# that indicated its encoding?
|
||||
if self._usable(self.sniffed_encoding, tried):
|
||||
yield self.sniffed_encoding
|
||||
|
||||
# Look within the document for an XML or HTML encoding
|
||||
# declaration.
|
||||
if self.declared_encoding is None:
|
||||
self.declared_encoding = self.find_declared_encoding(
|
||||
self.markup, self.is_html)
|
||||
if self._usable(self.declared_encoding, tried):
|
||||
yield self.declared_encoding
|
||||
|
||||
# Use third-party character set detection to guess at the
|
||||
# encoding.
|
||||
if self.chardet_encoding is None:
|
||||
self.chardet_encoding = chardet_dammit(self.markup)
|
||||
if self._usable(self.chardet_encoding, tried):
|
||||
yield self.chardet_encoding
|
||||
|
||||
# As a last-ditch effort, try utf-8 and windows-1252.
|
||||
for e in ('utf-8', 'windows-1252'):
|
||||
if self._usable(e, tried):
|
||||
yield e
|
||||
|
||||
@classmethod
|
||||
def strip_byte_order_mark(cls, data):
|
||||
"""If a byte-order mark is present, strip it and return the encoding it implies."""
|
||||
encoding = None
|
||||
if isinstance(data, str):
|
||||
# Unicode data cannot have a byte-order mark.
|
||||
return data, encoding
|
||||
if (len(data) >= 4) and (data[:2] == b'\xfe\xff') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16be'
|
||||
data = data[2:]
|
||||
elif (len(data) >= 4) and (data[:2] == b'\xff\xfe') \
|
||||
and (data[2:4] != '\x00\x00'):
|
||||
encoding = 'utf-16le'
|
||||
data = data[2:]
|
||||
elif data[:3] == b'\xef\xbb\xbf':
|
||||
encoding = 'utf-8'
|
||||
data = data[3:]
|
||||
elif data[:4] == b'\x00\x00\xfe\xff':
|
||||
encoding = 'utf-32be'
|
||||
data = data[4:]
|
||||
elif data[:4] == b'\xff\xfe\x00\x00':
|
||||
encoding = 'utf-32le'
|
||||
data = data[4:]
|
||||
return data, encoding
|
||||
|
||||
@classmethod
|
||||
def find_declared_encoding(cls, markup, is_html=False, search_entire_document=False):
|
||||
"""Given a document, tries to find its declared encoding.
|
||||
|
||||
An XML encoding is declared at the beginning of the document.
|
||||
|
||||
An HTML encoding is declared in a <meta> tag, hopefully near the
|
||||
beginning of the document.
|
||||
"""
|
||||
if search_entire_document:
|
||||
xml_endpos = html_endpos = len(markup)
|
||||
else:
|
||||
xml_endpos = 1024
|
||||
html_endpos = max(2048, int(len(markup) * 0.05))
|
||||
|
||||
declared_encoding = None
|
||||
declared_encoding_match = xml_encoding_re.search(markup, endpos=xml_endpos)
|
||||
if not declared_encoding_match and is_html:
|
||||
declared_encoding_match = html_meta_re.search(markup, endpos=html_endpos)
|
||||
if declared_encoding_match is not None:
|
||||
declared_encoding = declared_encoding_match.groups()[0].decode(
|
||||
'ascii', 'replace')
|
||||
if declared_encoding:
|
||||
return declared_encoding.lower()
|
||||
return None
|
||||
|
||||
class UnicodeDammit:
|
||||
"""A class for detecting the encoding of a *ML document and
|
||||
converting it to a Unicode string. If the source encoding is
|
||||
windows-1252, can replace MS smart quotes with their HTML or XML
|
||||
equivalents."""
|
||||
|
||||
# This dictionary maps commonly seen values for "charset" in HTML
|
||||
# meta tags to the corresponding Python codec names. It only covers
|
||||
# values that aren't in Python's aliases and can't be determined
|
||||
# by the heuristics in find_codec.
|
||||
CHARSET_ALIASES = {"macintosh": "mac-roman",
|
||||
"x-sjis": "shift-jis"}
|
||||
|
||||
ENCODINGS_WITH_SMART_QUOTES = [
|
||||
"windows-1252",
|
||||
"iso-8859-1",
|
||||
"iso-8859-2",
|
||||
]
|
||||
|
||||
def __init__(self, markup, override_encodings=[],
|
||||
smart_quotes_to=None, is_html=False, exclude_encodings=[]):
|
||||
self.smart_quotes_to = smart_quotes_to
|
||||
self.tried_encodings = []
|
||||
self.contains_replacement_characters = False
|
||||
self.is_html = is_html
|
||||
self.log = logging.getLogger(__name__)
|
||||
self.detector = EncodingDetector(
|
||||
markup, override_encodings, is_html, exclude_encodings)
|
||||
|
||||
# Short-circuit if the data is in Unicode to begin with.
|
||||
if isinstance(markup, str) or markup == '':
|
||||
self.markup = markup
|
||||
self.unicode_markup = str(markup)
|
||||
self.original_encoding = None
|
||||
return
|
||||
|
||||
# The encoding detector may have stripped a byte-order mark.
|
||||
# Use the stripped markup from this point on.
|
||||
self.markup = self.detector.markup
|
||||
|
||||
u = None
|
||||
for encoding in self.detector.encodings:
|
||||
markup = self.detector.markup
|
||||
u = self._convert_from(encoding)
|
||||
if u is not None:
|
||||
break
|
||||
|
||||
if not u:
|
||||
# None of the encodings worked. As an absolute last resort,
|
||||
# try them again with character replacement.
|
||||
|
||||
for encoding in self.detector.encodings:
|
||||
if encoding != "ascii":
|
||||
u = self._convert_from(encoding, "replace")
|
||||
if u is not None:
|
||||
self.log.warning(
|
||||
"Some characters could not be decoded, and were "
|
||||
"replaced with REPLACEMENT CHARACTER."
|
||||
)
|
||||
self.contains_replacement_characters = True
|
||||
break
|
||||
|
||||
# If none of that worked, we could at this point force it to
|
||||
# ASCII, but that would destroy so much data that I think
|
||||
# giving up is better.
|
||||
self.unicode_markup = u
|
||||
if not u:
|
||||
self.original_encoding = None
|
||||
|
||||
def _sub_ms_char(self, match):
|
||||
"""Changes a MS smart quote character to an XML or HTML
|
||||
entity, or an ASCII character."""
|
||||
orig = match.group(1)
|
||||
if self.smart_quotes_to == 'ascii':
|
||||
sub = self.MS_CHARS_TO_ASCII.get(orig).encode()
|
||||
else:
|
||||
sub = self.MS_CHARS.get(orig)
|
||||
if type(sub) == tuple:
|
||||
if self.smart_quotes_to == 'xml':
|
||||
sub = '&#x'.encode() + sub[1].encode() + ';'.encode()
|
||||
else:
|
||||
sub = '&'.encode() + sub[0].encode() + ';'.encode()
|
||||
else:
|
||||
sub = sub.encode()
|
||||
return sub
|
||||
|
||||
def _convert_from(self, proposed, errors="strict"):
|
||||
proposed = self.find_codec(proposed)
|
||||
if not proposed or (proposed, errors) in self.tried_encodings:
|
||||
return None
|
||||
self.tried_encodings.append((proposed, errors))
|
||||
markup = self.markup
|
||||
# Convert smart quotes to HTML if coming from an encoding
|
||||
# that might have them.
|
||||
if (self.smart_quotes_to is not None
|
||||
and proposed in self.ENCODINGS_WITH_SMART_QUOTES):
|
||||
smart_quotes_re = b"([\x80-\x9f])"
|
||||
smart_quotes_compiled = re.compile(smart_quotes_re)
|
||||
markup = smart_quotes_compiled.sub(self._sub_ms_char, markup)
|
||||
|
||||
try:
|
||||
#print "Trying to convert document to %s (errors=%s)" % (
|
||||
# proposed, errors)
|
||||
u = self._to_unicode(markup, proposed, errors)
|
||||
self.markup = u
|
||||
self.original_encoding = proposed
|
||||
except Exception as e:
|
||||
#print "That didn't work!"
|
||||
#print e
|
||||
return None
|
||||
#print "Correct encoding: %s" % proposed
|
||||
return self.markup
|
||||
|
||||
def _to_unicode(self, data, encoding, errors="strict"):
|
||||
'''Given a string and its encoding, decodes the string into Unicode.
|
||||
%encoding is a string recognized by encodings.aliases'''
|
||||
return str(data, encoding, errors)
|
||||
|
||||
@property
|
||||
def declared_html_encoding(self):
|
||||
if not self.is_html:
|
||||
return None
|
||||
return self.detector.declared_encoding
|
||||
|
||||
def find_codec(self, charset):
|
||||
value = (self._codec(self.CHARSET_ALIASES.get(charset, charset))
|
||||
or (charset and self._codec(charset.replace("-", "")))
|
||||
or (charset and self._codec(charset.replace("-", "_")))
|
||||
or (charset and charset.lower())
|
||||
or charset
|
||||
)
|
||||
if value:
|
||||
return value.lower()
|
||||
return None
|
||||
|
||||
def _codec(self, charset):
|
||||
if not charset:
|
||||
return charset
|
||||
codec = None
|
||||
try:
|
||||
codecs.lookup(charset)
|
||||
codec = charset
|
||||
except (LookupError, ValueError):
|
||||
pass
|
||||
return codec
|
||||
|
||||
|
||||
# A partial mapping of ISO-Latin-1 to HTML entities/XML numeric entities.
|
||||
MS_CHARS = {b'\x80': ('euro', '20AC'),
|
||||
b'\x81': ' ',
|
||||
b'\x82': ('sbquo', '201A'),
|
||||
b'\x83': ('fnof', '192'),
|
||||
b'\x84': ('bdquo', '201E'),
|
||||
b'\x85': ('hellip', '2026'),
|
||||
b'\x86': ('dagger', '2020'),
|
||||
b'\x87': ('Dagger', '2021'),
|
||||
b'\x88': ('circ', '2C6'),
|
||||
b'\x89': ('permil', '2030'),
|
||||
b'\x8A': ('Scaron', '160'),
|
||||
b'\x8B': ('lsaquo', '2039'),
|
||||
b'\x8C': ('OElig', '152'),
|
||||
b'\x8D': '?',
|
||||
b'\x8E': ('#x17D', '17D'),
|
||||
b'\x8F': '?',
|
||||
b'\x90': '?',
|
||||
b'\x91': ('lsquo', '2018'),
|
||||
b'\x92': ('rsquo', '2019'),
|
||||
b'\x93': ('ldquo', '201C'),
|
||||
b'\x94': ('rdquo', '201D'),
|
||||
b'\x95': ('bull', '2022'),
|
||||
b'\x96': ('ndash', '2013'),
|
||||
b'\x97': ('mdash', '2014'),
|
||||
b'\x98': ('tilde', '2DC'),
|
||||
b'\x99': ('trade', '2122'),
|
||||
b'\x9a': ('scaron', '161'),
|
||||
b'\x9b': ('rsaquo', '203A'),
|
||||
b'\x9c': ('oelig', '153'),
|
||||
b'\x9d': '?',
|
||||
b'\x9e': ('#x17E', '17E'),
|
||||
b'\x9f': ('Yuml', ''),}
|
||||
|
||||
# A parochial partial mapping of ISO-Latin-1 to ASCII. Contains
|
||||
# horrors like stripping diacritical marks to turn á into a, but also
|
||||
# contains non-horrors like turning “ into ".
|
||||
MS_CHARS_TO_ASCII = {
|
||||
b'\x80' : 'EUR',
|
||||
b'\x81' : ' ',
|
||||
b'\x82' : ',',
|
||||
b'\x83' : 'f',
|
||||
b'\x84' : ',,',
|
||||
b'\x85' : '...',
|
||||
b'\x86' : '+',
|
||||
b'\x87' : '++',
|
||||
b'\x88' : '^',
|
||||
b'\x89' : '%',
|
||||
b'\x8a' : 'S',
|
||||
b'\x8b' : '<',
|
||||
b'\x8c' : 'OE',
|
||||
b'\x8d' : '?',
|
||||
b'\x8e' : 'Z',
|
||||
b'\x8f' : '?',
|
||||
b'\x90' : '?',
|
||||
b'\x91' : "'",
|
||||
b'\x92' : "'",
|
||||
b'\x93' : '"',
|
||||
b'\x94' : '"',
|
||||
b'\x95' : '*',
|
||||
b'\x96' : '-',
|
||||
b'\x97' : '--',
|
||||
b'\x98' : '~',
|
||||
b'\x99' : '(TM)',
|
||||
b'\x9a' : 's',
|
||||
b'\x9b' : '>',
|
||||
b'\x9c' : 'oe',
|
||||
b'\x9d' : '?',
|
||||
b'\x9e' : 'z',
|
||||
b'\x9f' : 'Y',
|
||||
b'\xa0' : ' ',
|
||||
b'\xa1' : '!',
|
||||
b'\xa2' : 'c',
|
||||
b'\xa3' : 'GBP',
|
||||
b'\xa4' : '$', #This approximation is especially parochial--this is the
|
||||
#generic currency symbol.
|
||||
b'\xa5' : 'YEN',
|
||||
b'\xa6' : '|',
|
||||
b'\xa7' : 'S',
|
||||
b'\xa8' : '..',
|
||||
b'\xa9' : '',
|
||||
b'\xaa' : '(th)',
|
||||
b'\xab' : '<<',
|
||||
b'\xac' : '!',
|
||||
b'\xad' : ' ',
|
||||
b'\xae' : '(R)',
|
||||
b'\xaf' : '-',
|
||||
b'\xb0' : 'o',
|
||||
b'\xb1' : '+-',
|
||||
b'\xb2' : '2',
|
||||
b'\xb3' : '3',
|
||||
b'\xb4' : ("'", 'acute'),
|
||||
b'\xb5' : 'u',
|
||||
b'\xb6' : 'P',
|
||||
b'\xb7' : '*',
|
||||
b'\xb8' : ',',
|
||||
b'\xb9' : '1',
|
||||
b'\xba' : '(th)',
|
||||
b'\xbb' : '>>',
|
||||
b'\xbc' : '1/4',
|
||||
b'\xbd' : '1/2',
|
||||
b'\xbe' : '3/4',
|
||||
b'\xbf' : '?',
|
||||
b'\xc0' : 'A',
|
||||
b'\xc1' : 'A',
|
||||
b'\xc2' : 'A',
|
||||
b'\xc3' : 'A',
|
||||
b'\xc4' : 'A',
|
||||
b'\xc5' : 'A',
|
||||
b'\xc6' : 'AE',
|
||||
b'\xc7' : 'C',
|
||||
b'\xc8' : 'E',
|
||||
b'\xc9' : 'E',
|
||||
b'\xca' : 'E',
|
||||
b'\xcb' : 'E',
|
||||
b'\xcc' : 'I',
|
||||
b'\xcd' : 'I',
|
||||
b'\xce' : 'I',
|
||||
b'\xcf' : 'I',
|
||||
b'\xd0' : 'D',
|
||||
b'\xd1' : 'N',
|
||||
b'\xd2' : 'O',
|
||||
b'\xd3' : 'O',
|
||||
b'\xd4' : 'O',
|
||||
b'\xd5' : 'O',
|
||||
b'\xd6' : 'O',
|
||||
b'\xd7' : '*',
|
||||
b'\xd8' : 'O',
|
||||
b'\xd9' : 'U',
|
||||
b'\xda' : 'U',
|
||||
b'\xdb' : 'U',
|
||||
b'\xdc' : 'U',
|
||||
b'\xdd' : 'Y',
|
||||
b'\xde' : 'b',
|
||||
b'\xdf' : 'B',
|
||||
b'\xe0' : 'a',
|
||||
b'\xe1' : 'a',
|
||||
b'\xe2' : 'a',
|
||||
b'\xe3' : 'a',
|
||||
b'\xe4' : 'a',
|
||||
b'\xe5' : 'a',
|
||||
b'\xe6' : 'ae',
|
||||
b'\xe7' : 'c',
|
||||
b'\xe8' : 'e',
|
||||
b'\xe9' : 'e',
|
||||
b'\xea' : 'e',
|
||||
b'\xeb' : 'e',
|
||||
b'\xec' : 'i',
|
||||
b'\xed' : 'i',
|
||||
b'\xee' : 'i',
|
||||
b'\xef' : 'i',
|
||||
b'\xf0' : 'o',
|
||||
b'\xf1' : 'n',
|
||||
b'\xf2' : 'o',
|
||||
b'\xf3' : 'o',
|
||||
b'\xf4' : 'o',
|
||||
b'\xf5' : 'o',
|
||||
b'\xf6' : 'o',
|
||||
b'\xf7' : '/',
|
||||
b'\xf8' : 'o',
|
||||
b'\xf9' : 'u',
|
||||
b'\xfa' : 'u',
|
||||
b'\xfb' : 'u',
|
||||
b'\xfc' : 'u',
|
||||
b'\xfd' : 'y',
|
||||
b'\xfe' : 'b',
|
||||
b'\xff' : 'y',
|
||||
}
|
||||
|
||||
# A map used when removing rogue Windows-1252/ISO-8859-1
|
||||
# characters in otherwise UTF-8 documents.
|
||||
#
|
||||
# Note that \x81, \x8d, \x8f, \x90, and \x9d are undefined in
|
||||
# Windows-1252.
|
||||
WINDOWS_1252_TO_UTF8 = {
|
||||
0x80 : b'\xe2\x82\xac', # €
|
||||
0x82 : b'\xe2\x80\x9a', # ‚
|
||||
0x83 : b'\xc6\x92', # ƒ
|
||||
0x84 : b'\xe2\x80\x9e', # „
|
||||
0x85 : b'\xe2\x80\xa6', # …
|
||||
0x86 : b'\xe2\x80\xa0', # †
|
||||
0x87 : b'\xe2\x80\xa1', # ‡
|
||||
0x88 : b'\xcb\x86', # ˆ
|
||||
0x89 : b'\xe2\x80\xb0', # ‰
|
||||
0x8a : b'\xc5\xa0', # Š
|
||||
0x8b : b'\xe2\x80\xb9', # ‹
|
||||
0x8c : b'\xc5\x92', # Œ
|
||||
0x8e : b'\xc5\xbd', # Ž
|
||||
0x91 : b'\xe2\x80\x98', # ‘
|
||||
0x92 : b'\xe2\x80\x99', # ’
|
||||
0x93 : b'\xe2\x80\x9c', # “
|
||||
0x94 : b'\xe2\x80\x9d', # ”
|
||||
0x95 : b'\xe2\x80\xa2', # •
|
||||
0x96 : b'\xe2\x80\x93', # –
|
||||
0x97 : b'\xe2\x80\x94', # —
|
||||
0x98 : b'\xcb\x9c', # ˜
|
||||
0x99 : b'\xe2\x84\xa2', # ™
|
||||
0x9a : b'\xc5\xa1', # š
|
||||
0x9b : b'\xe2\x80\xba', # ›
|
||||
0x9c : b'\xc5\x93', # œ
|
||||
0x9e : b'\xc5\xbe', # ž
|
||||
0x9f : b'\xc5\xb8', # Ÿ
|
||||
0xa0 : b'\xc2\xa0', #
|
||||
0xa1 : b'\xc2\xa1', # ¡
|
||||
0xa2 : b'\xc2\xa2', # ¢
|
||||
0xa3 : b'\xc2\xa3', # £
|
||||
0xa4 : b'\xc2\xa4', # ¤
|
||||
0xa5 : b'\xc2\xa5', # ¥
|
||||
0xa6 : b'\xc2\xa6', # ¦
|
||||
0xa7 : b'\xc2\xa7', # §
|
||||
0xa8 : b'\xc2\xa8', # ¨
|
||||
0xa9 : b'\xc2\xa9', # ©
|
||||
0xaa : b'\xc2\xaa', # ª
|
||||
0xab : b'\xc2\xab', # «
|
||||
0xac : b'\xc2\xac', # ¬
|
||||
0xad : b'\xc2\xad', #
|
||||
0xae : b'\xc2\xae', # ®
|
||||
0xaf : b'\xc2\xaf', # ¯
|
||||
0xb0 : b'\xc2\xb0', # °
|
||||
0xb1 : b'\xc2\xb1', # ±
|
||||
0xb2 : b'\xc2\xb2', # ²
|
||||
0xb3 : b'\xc2\xb3', # ³
|
||||
0xb4 : b'\xc2\xb4', # ´
|
||||
0xb5 : b'\xc2\xb5', # µ
|
||||
0xb6 : b'\xc2\xb6', # ¶
|
||||
0xb7 : b'\xc2\xb7', # ·
|
||||
0xb8 : b'\xc2\xb8', # ¸
|
||||
0xb9 : b'\xc2\xb9', # ¹
|
||||
0xba : b'\xc2\xba', # º
|
||||
0xbb : b'\xc2\xbb', # »
|
||||
0xbc : b'\xc2\xbc', # ¼
|
||||
0xbd : b'\xc2\xbd', # ½
|
||||
0xbe : b'\xc2\xbe', # ¾
|
||||
0xbf : b'\xc2\xbf', # ¿
|
||||
0xc0 : b'\xc3\x80', # À
|
||||
0xc1 : b'\xc3\x81', # Á
|
||||
0xc2 : b'\xc3\x82', # Â
|
||||
0xc3 : b'\xc3\x83', # Ã
|
||||
0xc4 : b'\xc3\x84', # Ä
|
||||
0xc5 : b'\xc3\x85', # Å
|
||||
0xc6 : b'\xc3\x86', # Æ
|
||||
0xc7 : b'\xc3\x87', # Ç
|
||||
0xc8 : b'\xc3\x88', # È
|
||||
0xc9 : b'\xc3\x89', # É
|
||||
0xca : b'\xc3\x8a', # Ê
|
||||
0xcb : b'\xc3\x8b', # Ë
|
||||
0xcc : b'\xc3\x8c', # Ì
|
||||
0xcd : b'\xc3\x8d', # Í
|
||||
0xce : b'\xc3\x8e', # Î
|
||||
0xcf : b'\xc3\x8f', # Ï
|
||||
0xd0 : b'\xc3\x90', # Ð
|
||||
0xd1 : b'\xc3\x91', # Ñ
|
||||
0xd2 : b'\xc3\x92', # Ò
|
||||
0xd3 : b'\xc3\x93', # Ó
|
||||
0xd4 : b'\xc3\x94', # Ô
|
||||
0xd5 : b'\xc3\x95', # Õ
|
||||
0xd6 : b'\xc3\x96', # Ö
|
||||
0xd7 : b'\xc3\x97', # ×
|
||||
0xd8 : b'\xc3\x98', # Ø
|
||||
0xd9 : b'\xc3\x99', # Ù
|
||||
0xda : b'\xc3\x9a', # Ú
|
||||
0xdb : b'\xc3\x9b', # Û
|
||||
0xdc : b'\xc3\x9c', # Ü
|
||||
0xdd : b'\xc3\x9d', # Ý
|
||||
0xde : b'\xc3\x9e', # Þ
|
||||
0xdf : b'\xc3\x9f', # ß
|
||||
0xe0 : b'\xc3\xa0', # à
|
||||
0xe1 : b'\xa1', # á
|
||||
0xe2 : b'\xc3\xa2', # â
|
||||
0xe3 : b'\xc3\xa3', # ã
|
||||
0xe4 : b'\xc3\xa4', # ä
|
||||
0xe5 : b'\xc3\xa5', # å
|
||||
0xe6 : b'\xc3\xa6', # æ
|
||||
0xe7 : b'\xc3\xa7', # ç
|
||||
0xe8 : b'\xc3\xa8', # è
|
||||
0xe9 : b'\xc3\xa9', # é
|
||||
0xea : b'\xc3\xaa', # ê
|
||||
0xeb : b'\xc3\xab', # ë
|
||||
0xec : b'\xc3\xac', # ì
|
||||
0xed : b'\xc3\xad', # í
|
||||
0xee : b'\xc3\xae', # î
|
||||
0xef : b'\xc3\xaf', # ï
|
||||
0xf0 : b'\xc3\xb0', # ð
|
||||
0xf1 : b'\xc3\xb1', # ñ
|
||||
0xf2 : b'\xc3\xb2', # ò
|
||||
0xf3 : b'\xc3\xb3', # ó
|
||||
0xf4 : b'\xc3\xb4', # ô
|
||||
0xf5 : b'\xc3\xb5', # õ
|
||||
0xf6 : b'\xc3\xb6', # ö
|
||||
0xf7 : b'\xc3\xb7', # ÷
|
||||
0xf8 : b'\xc3\xb8', # ø
|
||||
0xf9 : b'\xc3\xb9', # ù
|
||||
0xfa : b'\xc3\xba', # ú
|
||||
0xfb : b'\xc3\xbb', # û
|
||||
0xfc : b'\xc3\xbc', # ü
|
||||
0xfd : b'\xc3\xbd', # ý
|
||||
0xfe : b'\xc3\xbe', # þ
|
||||
}
|
||||
|
||||
MULTIBYTE_MARKERS_AND_SIZES = [
|
||||
(0xc2, 0xdf, 2), # 2-byte characters start with a byte C2-DF
|
||||
(0xe0, 0xef, 3), # 3-byte characters start with E0-EF
|
||||
(0xf0, 0xf4, 4), # 4-byte characters start with F0-F4
|
||||
]
|
||||
|
||||
FIRST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[0][0]
|
||||
LAST_MULTIBYTE_MARKER = MULTIBYTE_MARKERS_AND_SIZES[-1][1]
|
||||
|
||||
@classmethod
|
||||
def detwingle(cls, in_bytes, main_encoding="utf8",
|
||||
embedded_encoding="windows-1252"):
|
||||
"""Fix characters from one encoding embedded in some other encoding.
|
||||
|
||||
Currently the only situation supported is Windows-1252 (or its
|
||||
subset ISO-8859-1), embedded in UTF-8.
|
||||
|
||||
The input must be a bytestring. If you've already converted
|
||||
the document to Unicode, you're too late.
|
||||
|
||||
The output is a bytestring in which `embedded_encoding`
|
||||
characters have been converted to their `main_encoding`
|
||||
equivalents.
|
||||
"""
|
||||
if embedded_encoding.replace('_', '-').lower() not in (
|
||||
'windows-1252', 'windows_1252'):
|
||||
raise NotImplementedError(
|
||||
"Windows-1252 and ISO-8859-1 are the only currently supported "
|
||||
"embedded encodings.")
|
||||
|
||||
if main_encoding.lower() not in ('utf8', 'utf-8'):
|
||||
raise NotImplementedError(
|
||||
"UTF-8 is the only currently supported main encoding.")
|
||||
|
||||
byte_chunks = []
|
||||
|
||||
chunk_start = 0
|
||||
pos = 0
|
||||
while pos < len(in_bytes):
|
||||
byte = in_bytes[pos]
|
||||
if not isinstance(byte, int):
|
||||
# Python 2.x
|
||||
byte = ord(byte)
|
||||
if (byte >= cls.FIRST_MULTIBYTE_MARKER
|
||||
and byte <= cls.LAST_MULTIBYTE_MARKER):
|
||||
# This is the start of a UTF-8 multibyte character. Skip
|
||||
# to the end.
|
||||
for start, end, size in cls.MULTIBYTE_MARKERS_AND_SIZES:
|
||||
if byte >= start and byte <= end:
|
||||
pos += size
|
||||
break
|
||||
elif byte >= 0x80 and byte in cls.WINDOWS_1252_TO_UTF8:
|
||||
# We found a Windows-1252 character!
|
||||
# Save the string up to this point as a chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:pos])
|
||||
|
||||
# Now translate the Windows-1252 character into UTF-8
|
||||
# and add it as another, one-byte chunk.
|
||||
byte_chunks.append(cls.WINDOWS_1252_TO_UTF8[byte])
|
||||
pos += 1
|
||||
chunk_start = pos
|
||||
else:
|
||||
# Go on to the next character.
|
||||
pos += 1
|
||||
if chunk_start == 0:
|
||||
# The string is unchanged.
|
||||
return in_bytes
|
||||
else:
|
||||
# Store the final chunk.
|
||||
byte_chunks.append(in_bytes[chunk_start:])
|
||||
return b''.join(byte_chunks)
|
||||
|
||||
@@ -1,224 +0,0 @@
|
||||
"""Diagnostic functions, mainly for use when doing tech support."""
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import cProfile
|
||||
from io import StringIO
|
||||
from html.parser import HTMLParser
|
||||
import bs4
|
||||
from bs4 import BeautifulSoup, __version__
|
||||
from bs4.builder import builder_registry
|
||||
|
||||
import os
|
||||
import pstats
|
||||
import random
|
||||
import tempfile
|
||||
import time
|
||||
import traceback
|
||||
import sys
|
||||
import cProfile
|
||||
|
||||
def diagnose(data):
|
||||
"""Diagnostic suite for isolating common problems."""
|
||||
print("Diagnostic running on Beautiful Soup %s" % __version__)
|
||||
print("Python version %s" % sys.version)
|
||||
|
||||
basic_parsers = ["html.parser", "html5lib", "lxml"]
|
||||
for name in basic_parsers:
|
||||
for builder in builder_registry.builders:
|
||||
if name in builder.features:
|
||||
break
|
||||
else:
|
||||
basic_parsers.remove(name)
|
||||
print((
|
||||
"I noticed that %s is not installed. Installing it may help." %
|
||||
name))
|
||||
|
||||
if 'lxml' in basic_parsers:
|
||||
basic_parsers.append("lxml-xml")
|
||||
try:
|
||||
from lxml import etree
|
||||
print("Found lxml version %s" % ".".join(map(str,etree.LXML_VERSION)))
|
||||
except ImportError as e:
|
||||
print (
|
||||
"lxml is not installed or couldn't be imported.")
|
||||
|
||||
|
||||
if 'html5lib' in basic_parsers:
|
||||
try:
|
||||
import html5lib
|
||||
print("Found html5lib version %s" % html5lib.__version__)
|
||||
except ImportError as e:
|
||||
print (
|
||||
"html5lib is not installed or couldn't be imported.")
|
||||
|
||||
if hasattr(data, 'read'):
|
||||
data = data.read()
|
||||
elif data.startswith("http:") or data.startswith("https:"):
|
||||
print('"%s" looks like a URL. Beautiful Soup is not an HTTP client.' % data)
|
||||
print("You need to use some other library to get the document behind the URL, and feed that document to Beautiful Soup.")
|
||||
return
|
||||
else:
|
||||
try:
|
||||
if os.path.exists(data):
|
||||
print('"%s" looks like a filename. Reading data from the file.' % data)
|
||||
with open(data) as fp:
|
||||
data = fp.read()
|
||||
except ValueError:
|
||||
# This can happen on some platforms when the 'filename' is
|
||||
# too long. Assume it's data and not a filename.
|
||||
pass
|
||||
print()
|
||||
|
||||
for parser in basic_parsers:
|
||||
print("Trying to parse your markup with %s" % parser)
|
||||
success = False
|
||||
try:
|
||||
soup = BeautifulSoup(data, features=parser)
|
||||
success = True
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print("Here's what %s did with the markup:" % parser)
|
||||
print(soup.prettify())
|
||||
|
||||
print("-" * 80)
|
||||
|
||||
def lxml_trace(data, html=True, **kwargs):
|
||||
"""Print out the lxml events that occur during parsing.
|
||||
|
||||
This lets you see how lxml parses a document when no Beautiful
|
||||
Soup code is running.
|
||||
"""
|
||||
from lxml import etree
|
||||
for event, element in etree.iterparse(StringIO(data), html=html, **kwargs):
|
||||
print(("%s, %4s, %s" % (event, element.tag, element.text)))
|
||||
|
||||
class AnnouncingParser(HTMLParser):
|
||||
"""Announces HTMLParser parse events, without doing anything else."""
|
||||
|
||||
def _p(self, s):
|
||||
print(s)
|
||||
|
||||
def handle_starttag(self, name, attrs):
|
||||
self._p("%s START" % name)
|
||||
|
||||
def handle_endtag(self, name):
|
||||
self._p("%s END" % name)
|
||||
|
||||
def handle_data(self, data):
|
||||
self._p("%s DATA" % data)
|
||||
|
||||
def handle_charref(self, name):
|
||||
self._p("%s CHARREF" % name)
|
||||
|
||||
def handle_entityref(self, name):
|
||||
self._p("%s ENTITYREF" % name)
|
||||
|
||||
def handle_comment(self, data):
|
||||
self._p("%s COMMENT" % data)
|
||||
|
||||
def handle_decl(self, data):
|
||||
self._p("%s DECL" % data)
|
||||
|
||||
def unknown_decl(self, data):
|
||||
self._p("%s UNKNOWN-DECL" % data)
|
||||
|
||||
def handle_pi(self, data):
|
||||
self._p("%s PI" % data)
|
||||
|
||||
def htmlparser_trace(data):
|
||||
"""Print out the HTMLParser events that occur during parsing.
|
||||
|
||||
This lets you see how HTMLParser parses a document when no
|
||||
Beautiful Soup code is running.
|
||||
"""
|
||||
parser = AnnouncingParser()
|
||||
parser.feed(data)
|
||||
|
||||
_vowels = "aeiou"
|
||||
_consonants = "bcdfghjklmnpqrstvwxyz"
|
||||
|
||||
def rword(length=5):
|
||||
"Generate a random word-like string."
|
||||
s = ''
|
||||
for i in range(length):
|
||||
if i % 2 == 0:
|
||||
t = _consonants
|
||||
else:
|
||||
t = _vowels
|
||||
s += random.choice(t)
|
||||
return s
|
||||
|
||||
def rsentence(length=4):
|
||||
"Generate a random sentence-like string."
|
||||
return " ".join(rword(random.randint(4,9)) for i in list(range(length)))
|
||||
|
||||
def rdoc(num_elements=1000):
|
||||
"""Randomly generate an invalid HTML document."""
|
||||
tag_names = ['p', 'div', 'span', 'i', 'b', 'script', 'table']
|
||||
elements = []
|
||||
for i in range(num_elements):
|
||||
choice = random.randint(0,3)
|
||||
if choice == 0:
|
||||
# New tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("<%s>" % tag_name)
|
||||
elif choice == 1:
|
||||
elements.append(rsentence(random.randint(1,4)))
|
||||
elif choice == 2:
|
||||
# Close a tag.
|
||||
tag_name = random.choice(tag_names)
|
||||
elements.append("</%s>" % tag_name)
|
||||
return "<html>" + "\n".join(elements) + "</html>"
|
||||
|
||||
def benchmark_parsers(num_elements=100000):
|
||||
"""Very basic head-to-head performance benchmark."""
|
||||
print("Comparative parser benchmark on Beautiful Soup %s" % __version__)
|
||||
data = rdoc(num_elements)
|
||||
print("Generated a large invalid HTML document (%d bytes)." % len(data))
|
||||
|
||||
for parser in ["lxml", ["lxml", "html"], "html5lib", "html.parser"]:
|
||||
success = False
|
||||
try:
|
||||
a = time.time()
|
||||
soup = BeautifulSoup(data, parser)
|
||||
b = time.time()
|
||||
success = True
|
||||
except Exception as e:
|
||||
print("%s could not parse the markup." % parser)
|
||||
traceback.print_exc()
|
||||
if success:
|
||||
print("BS4+%s parsed the markup in %.2fs." % (parser, b-a))
|
||||
|
||||
from lxml import etree
|
||||
a = time.time()
|
||||
etree.HTML(data)
|
||||
b = time.time()
|
||||
print("Raw lxml parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
import html5lib
|
||||
parser = html5lib.HTMLParser()
|
||||
a = time.time()
|
||||
parser.parse(data)
|
||||
b = time.time()
|
||||
print("Raw html5lib parsed the markup in %.2fs." % (b-a))
|
||||
|
||||
def profile(num_elements=100000, parser="lxml"):
|
||||
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
|
||||
data = rdoc(num_elements)
|
||||
vars = dict(bs4=bs4, data=data, parser=parser)
|
||||
cProfile.runctx('bs4.BeautifulSoup(data, parser)' , vars, vars, filename)
|
||||
|
||||
stats = pstats.Stats(filename)
|
||||
# stats.strip_dirs()
|
||||
stats.sort_stats("cumulative")
|
||||
stats.print_stats('_html5lib|bs4', 50)
|
||||
|
||||
if __name__ == '__main__':
|
||||
diagnose(sys.stdin.read())
|
||||
1579
libs3/bs4/element.py
1579
libs3/bs4/element.py
File diff suppressed because it is too large
Load Diff
@@ -1,99 +0,0 @@
|
||||
from bs4.dammit import EntitySubstitution
|
||||
|
||||
class Formatter(EntitySubstitution):
|
||||
"""Describes a strategy to use when outputting a parse tree to a string.
|
||||
|
||||
Some parts of this strategy come from the distinction between
|
||||
HTML4, HTML5, and XML. Others are configurable by the user.
|
||||
"""
|
||||
# Registries of XML and HTML formatters.
|
||||
XML_FORMATTERS = {}
|
||||
HTML_FORMATTERS = {}
|
||||
|
||||
HTML = 'html'
|
||||
XML = 'xml'
|
||||
|
||||
HTML_DEFAULTS = dict(
|
||||
cdata_containing_tags=set(["script", "style"]),
|
||||
)
|
||||
|
||||
def _default(self, language, value, kwarg):
|
||||
if value is not None:
|
||||
return value
|
||||
if language == self.XML:
|
||||
return set()
|
||||
return self.HTML_DEFAULTS[kwarg]
|
||||
|
||||
def __init__(
|
||||
self, language=None, entity_substitution=None,
|
||||
void_element_close_prefix='/', cdata_containing_tags=None,
|
||||
):
|
||||
"""
|
||||
|
||||
:param void_element_close_prefix: By default, represent void
|
||||
elements as <tag/> rather than <tag>
|
||||
"""
|
||||
self.language = language
|
||||
self.entity_substitution = entity_substitution
|
||||
self.void_element_close_prefix = void_element_close_prefix
|
||||
self.cdata_containing_tags = self._default(
|
||||
language, cdata_containing_tags, 'cdata_containing_tags'
|
||||
)
|
||||
|
||||
def substitute(self, ns):
|
||||
"""Process a string that needs to undergo entity substitution."""
|
||||
if not self.entity_substitution:
|
||||
return ns
|
||||
from .element import NavigableString
|
||||
if (isinstance(ns, NavigableString)
|
||||
and ns.parent is not None
|
||||
and ns.parent.name in self.cdata_containing_tags):
|
||||
# Do nothing.
|
||||
return ns
|
||||
# Substitute.
|
||||
return self.entity_substitution(ns)
|
||||
|
||||
def attribute_value(self, value):
|
||||
"""Process the value of an attribute."""
|
||||
return self.substitute(value)
|
||||
|
||||
def attributes(self, tag):
|
||||
"""Reorder a tag's attributes however you want."""
|
||||
return sorted(tag.attrs.items())
|
||||
|
||||
|
||||
class HTMLFormatter(Formatter):
|
||||
REGISTRY = {}
|
||||
def __init__(self, *args, **kwargs):
|
||||
return super(HTMLFormatter, self).__init__(self.HTML, *args, **kwargs)
|
||||
|
||||
|
||||
class XMLFormatter(Formatter):
|
||||
REGISTRY = {}
|
||||
def __init__(self, *args, **kwargs):
|
||||
return super(XMLFormatter, self).__init__(self.XML, *args, **kwargs)
|
||||
|
||||
|
||||
# Set up aliases for the default formatters.
|
||||
HTMLFormatter.REGISTRY['html'] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
HTMLFormatter.REGISTRY["html5"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html,
|
||||
void_element_close_prefix = None
|
||||
)
|
||||
HTMLFormatter.REGISTRY["minimal"] = HTMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
HTMLFormatter.REGISTRY[None] = HTMLFormatter(
|
||||
entity_substitution=None
|
||||
)
|
||||
XMLFormatter.REGISTRY["html"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_html
|
||||
)
|
||||
XMLFormatter.REGISTRY["minimal"] = XMLFormatter(
|
||||
entity_substitution=EntitySubstitution.substitute_xml
|
||||
)
|
||||
XMLFormatter.REGISTRY[None] = Formatter(
|
||||
Formatter(Formatter.XML, entity_substitution=None)
|
||||
)
|
||||
@@ -1,992 +0,0 @@
|
||||
# encoding: utf-8
|
||||
"""Helper classes for tests."""
|
||||
|
||||
# Use of this source code is governed by the MIT license.
|
||||
__license__ = "MIT"
|
||||
|
||||
import pickle
|
||||
import copy
|
||||
import functools
|
||||
import unittest
|
||||
from unittest import TestCase
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
Comment,
|
||||
ContentMetaAttributeValue,
|
||||
Doctype,
|
||||
SoupStrainer,
|
||||
Tag
|
||||
)
|
||||
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
BAD_DOCUMENT = """A bare string
|
||||
<!DOCTYPE xsl:stylesheet SYSTEM "htmlent.dtd">
|
||||
<!DOCTYPE xsl:stylesheet PUBLIC "htmlent.dtd">
|
||||
<div><![CDATA[A CDATA section where it doesn't belong]]></div>
|
||||
<div><svg><![CDATA[HTML5 does allow CDATA sections in SVG]]></svg></div>
|
||||
<div>A <meta> tag</div>
|
||||
<div>A <br> tag that supposedly has contents.</br></div>
|
||||
<div>AT&T</div>
|
||||
<div><textarea>Within a textarea, markup like <b> tags and <&<& should be treated as literal</textarea></div>
|
||||
<div><script>if (i < 2) { alert("<b>Markup within script tags should be treated as literal.</b>"); }</script></div>
|
||||
<div>This numeric entity is missing the final semicolon: <x t="piñata"></div>
|
||||
<div><a href="http://example.com/</a> that attribute value never got closed</div>
|
||||
<div><a href="foo</a>, </a><a href="bar">that attribute value was closed by the subsequent tag</a></div>
|
||||
<! This document starts with a bogus declaration ><div>a</div>
|
||||
<div>This document contains <!an incomplete declaration <div>(do you see it?)</div>
|
||||
<div>This document ends with <!an incomplete declaration
|
||||
<div><a style={height:21px;}>That attribute value was bogus</a></div>
|
||||
<! DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">The doctype is invalid because it contains extra whitespace
|
||||
<div><table><td nowrap>That boolean attribute had no value</td></table></div>
|
||||
<div>Here's a nonexistent entity: &#foo; (do you see it?)</div>
|
||||
<div>This document ends before the entity finishes: >
|
||||
<div><p>Paragraphs shouldn't contain block display elements, but this one does: <dl><dt>you see?</dt></p>
|
||||
<b b="20" a="1" b="10" a="2" a="3" a="4">Multiple values for the same attribute.</b>
|
||||
<div><table><tr><td>Here's a table</td></tr></table></div>
|
||||
<div><table id="1"><tr><td>Here's a nested table:<table id="2"><tr><td>foo</td></tr></table></td></div>
|
||||
<div>This tag contains nothing but whitespace: <b> </b></div>
|
||||
<div><blockquote><p><b>This p tag is cut off by</blockquote></p>the end of the blockquote tag</div>
|
||||
<div><table><div>This table contains bare markup</div></table></div>
|
||||
<div><div id="1">\n <a href="link1">This link is never closed.\n</div>\n<div id="2">\n <div id="3">\n <a href="link2">This link is closed.</a>\n </div>\n</div></div>
|
||||
<div>This document contains a <!DOCTYPE surprise>surprise doctype</div>
|
||||
<div><a><B><Cd><EFG>Mixed case tags are folded to lowercase</efg></CD></b></A></div>
|
||||
<div><our\u2603>Tag name contains Unicode characters</our\u2603></div>
|
||||
<div><a \u2603="snowman">Attribute name contains Unicode characters</a></div>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
|
||||
"""
|
||||
|
||||
|
||||
class SoupTest(unittest.TestCase):
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return default_builder
|
||||
|
||||
def soup(self, markup, **kwargs):
|
||||
"""Build a Beautiful Soup object from markup."""
|
||||
builder = kwargs.pop('builder', self.default_builder)
|
||||
return BeautifulSoup(markup, builder=builder, **kwargs)
|
||||
|
||||
def document_for(self, markup, **kwargs):
|
||||
"""Turn an HTML fragment into a document.
|
||||
|
||||
The details depend on the builder.
|
||||
"""
|
||||
return self.default_builder(**kwargs).test_fragment_to_document(markup)
|
||||
|
||||
def assertSoupEquals(self, to_parse, compare_parsed_to=None):
|
||||
builder = self.default_builder
|
||||
obj = BeautifulSoup(to_parse, builder=builder)
|
||||
if compare_parsed_to is None:
|
||||
compare_parsed_to = to_parse
|
||||
|
||||
self.assertEqual(obj.decode(), self.document_for(compare_parsed_to))
|
||||
|
||||
def assertConnectedness(self, element):
|
||||
"""Ensure that next_element and previous_element are properly
|
||||
set for all descendants of the given element.
|
||||
"""
|
||||
earlier = None
|
||||
for e in element.descendants:
|
||||
if earlier:
|
||||
self.assertEqual(e, earlier.next_element)
|
||||
self.assertEqual(earlier, e.previous_element)
|
||||
earlier = e
|
||||
|
||||
def linkage_validator(self, el, _recursive_call=False):
|
||||
"""Ensure proper linkage throughout the document."""
|
||||
descendant = None
|
||||
# Document element should have no previous element or previous sibling.
|
||||
# It also shouldn't have a next sibling.
|
||||
if el.parent is None:
|
||||
assert el.previous_element is None,\
|
||||
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||
el, el.previous_element, None
|
||||
)
|
||||
assert el.previous_sibling is None,\
|
||||
"Bad previous_sibling\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||
el, el.previous_sibling, None
|
||||
)
|
||||
assert el.next_sibling is None,\
|
||||
"Bad next_sibling\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||
el, el.next_sibling, None
|
||||
)
|
||||
|
||||
idx = 0
|
||||
child = None
|
||||
last_child = None
|
||||
last_idx = len(el.contents) - 1
|
||||
for child in el.contents:
|
||||
descendant = None
|
||||
|
||||
# Parent should link next element to their first child
|
||||
# That child should have no previous sibling
|
||||
if idx == 0:
|
||||
if el.parent is not None:
|
||||
assert el.next_element is child,\
|
||||
"Bad next_element\nNODE: {}\nNEXT: {}\nEXPECTED: {}".format(
|
||||
el, el.next_element, child
|
||||
)
|
||||
assert child.previous_element is el,\
|
||||
"Bad previous_element\nNODE: {}\nPREV: {}\nEXPECTED: {}".format(
|
||||
child, child.previous_element, el
|
||||
)
|
||||
assert child.previous_sibling is None,\
|
||||
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED: {}".format(
|
||||
child, child.previous_sibling, None
|
||||
)
|
||||
|
||||
# If not the first child, previous index should link as sibling to this index
|
||||
# Previous element should match the last index or the last bubbled up descendant
|
||||
else:
|
||||
assert child.previous_sibling is el.contents[idx - 1],\
|
||||
"Bad previous_sibling\nNODE: {}\nPREV {}\nEXPECTED {}".format(
|
||||
child, child.previous_sibling, el.contents[idx - 1]
|
||||
)
|
||||
assert el.contents[idx - 1].next_sibling is child,\
|
||||
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
el.contents[idx - 1], el.contents[idx - 1].next_sibling, child
|
||||
)
|
||||
|
||||
if last_child is not None:
|
||||
assert child.previous_element is last_child,\
|
||||
"Bad previous_element\nNODE: {}\nPREV {}\nEXPECTED {}\nCONTENTS {}".format(
|
||||
child, child.previous_element, last_child, child.parent.contents
|
||||
)
|
||||
assert last_child.next_element is child,\
|
||||
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
last_child, last_child.next_element, child
|
||||
)
|
||||
|
||||
if isinstance(child, Tag) and child.contents:
|
||||
descendant = self.linkage_validator(child, True)
|
||||
# A bubbled up descendant should have no next siblings
|
||||
assert descendant.next_sibling is None,\
|
||||
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
descendant, descendant.next_sibling, None
|
||||
)
|
||||
|
||||
# Mark last child as either the bubbled up descendant or the current child
|
||||
if descendant is not None:
|
||||
last_child = descendant
|
||||
else:
|
||||
last_child = child
|
||||
|
||||
# If last child, there are non next siblings
|
||||
if idx == last_idx:
|
||||
assert child.next_sibling is None,\
|
||||
"Bad next_sibling\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
child, child.next_sibling, None
|
||||
)
|
||||
idx += 1
|
||||
|
||||
child = descendant if descendant is not None else child
|
||||
if child is None:
|
||||
child = el
|
||||
|
||||
if not _recursive_call and child is not None:
|
||||
target = el
|
||||
while True:
|
||||
if target is None:
|
||||
assert child.next_element is None, \
|
||||
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
child, child.next_element, None
|
||||
)
|
||||
break
|
||||
elif target.next_sibling is not None:
|
||||
assert child.next_element is target.next_sibling, \
|
||||
"Bad next_element\nNODE: {}\nNEXT {}\nEXPECTED {}".format(
|
||||
child, child.next_element, target.next_sibling
|
||||
)
|
||||
break
|
||||
target = target.parent
|
||||
|
||||
# We are done, so nothing to return
|
||||
return None
|
||||
else:
|
||||
# Return the child to the recursive caller
|
||||
return child
|
||||
|
||||
|
||||
class HTMLTreeBuilderSmokeTest(object):
|
||||
|
||||
"""A basic test of a treebuilder's competence.
|
||||
|
||||
Any HTML treebuilder, present or future, should be able to pass
|
||||
these tests. With invalid markup, there's room for interpretation,
|
||||
and different parsers can handle it differently. But with the
|
||||
markup in these tests, there's not much room for interpretation.
|
||||
"""
|
||||
|
||||
def test_empty_element_tags(self):
|
||||
"""Verify that all HTML4 and HTML5 empty element (aka void element) tags
|
||||
are handled correctly.
|
||||
"""
|
||||
for name in [
|
||||
'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input', 'keygen', 'link', 'menuitem', 'meta', 'param', 'source', 'track', 'wbr',
|
||||
'spacer', 'frame'
|
||||
]:
|
||||
soup = self.soup("")
|
||||
new_tag = soup.new_tag(name)
|
||||
self.assertEqual(True, new_tag.is_empty_element)
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def assertDoctypeHandled(self, doctype_fragment):
|
||||
"""Assert that a given doctype string is handled correctly."""
|
||||
doctype_str, soup = self._document_with_doctype(doctype_fragment)
|
||||
|
||||
# Make sure a Doctype object was created.
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual(doctype.__class__, Doctype)
|
||||
self.assertEqual(doctype, doctype_fragment)
|
||||
self.assertEqual(str(soup)[:len(doctype_str)], doctype_str)
|
||||
|
||||
# Make sure that the doctype was correctly associated with the
|
||||
# parse tree and that the rest of the document parsed.
|
||||
self.assertEqual(soup.p.contents[0], 'foo')
|
||||
|
||||
def _document_with_doctype(self, doctype_fragment):
|
||||
"""Generate and parse a document with the given doctype."""
|
||||
doctype = '<!DOCTYPE %s>' % doctype_fragment
|
||||
markup = doctype + '\n<p>foo</p>'
|
||||
soup = self.soup(markup)
|
||||
return doctype, soup
|
||||
|
||||
def test_normal_doctypes(self):
|
||||
"""Make sure normal, everyday HTML doctypes are handled correctly."""
|
||||
self.assertDoctypeHandled("html")
|
||||
self.assertDoctypeHandled(
|
||||
'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"')
|
||||
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_public_doctype_with_url(self):
|
||||
doctype = 'html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"'
|
||||
self.assertDoctypeHandled(doctype)
|
||||
|
||||
def test_system_doctype(self):
|
||||
self.assertDoctypeHandled('foo SYSTEM "http://www.example.com/"')
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# We can handle a namespaced doctype with a system ID.
|
||||
self.assertDoctypeHandled('xsl:stylesheet SYSTEM "htmlent.dtd"')
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# Test a namespaced doctype with a public id.
|
||||
self.assertDoctypeHandled('xsl:stylesheet PUBLIC "htmlent.dtd"')
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out more or less the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8").replace(b"\n", b""),
|
||||
markup.replace(b"\n", b""))
|
||||
|
||||
def test_namespaced_html(self):
|
||||
"""When a namespaced XML document is parsed as HTML it should
|
||||
be treated as HTML with weird tag names.
|
||||
"""
|
||||
markup = b"""<ns1:foo>content</ns1:foo><ns1:foo/><ns2:foo/>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(2, len(soup.find_all("ns1:foo")))
|
||||
|
||||
def test_processing_instruction(self):
|
||||
# We test both Unicode and bytestring to verify that
|
||||
# process_markup correctly sets processing_instruction_class
|
||||
# even when the markup is already Unicode and there is no
|
||||
# need to process anything.
|
||||
markup = """<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.decode())
|
||||
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_deepcopy(self):
|
||||
"""Make sure you can copy the tree builder.
|
||||
|
||||
This is important because the builder is part of a
|
||||
BeautifulSoup object, and we want to be able to copy that.
|
||||
"""
|
||||
copy.deepcopy(self.default_builder)
|
||||
|
||||
def test_p_tag_is_never_empty_element(self):
|
||||
"""A <p> tag is never designated as an empty-element tag.
|
||||
|
||||
Even if the markup shows it as an empty-element tag, it
|
||||
shouldn't be presented that way.
|
||||
"""
|
||||
soup = self.soup("<p/>")
|
||||
self.assertFalse(soup.p.is_empty_element)
|
||||
self.assertEqual(str(soup.p), "<p></p>")
|
||||
|
||||
def test_unclosed_tags_get_closed(self):
|
||||
"""A tag that's not closed by the end of the document should be closed.
|
||||
|
||||
This applies to all tags except empty-element tags.
|
||||
"""
|
||||
self.assertSoupEquals("<p>", "<p></p>")
|
||||
self.assertSoupEquals("<b>", "<b></b>")
|
||||
|
||||
self.assertSoupEquals("<br>", "<br/>")
|
||||
|
||||
def test_br_is_always_empty_element_tag(self):
|
||||
"""A <br> tag is designated as an empty-element tag.
|
||||
|
||||
Some parsers treat <br></br> as one <br/> tag, some parsers as
|
||||
two tags, but it should always be an empty-element tag.
|
||||
"""
|
||||
soup = self.soup("<br></br>")
|
||||
self.assertTrue(soup.br.is_empty_element)
|
||||
self.assertEqual(str(soup.br), "<br/>")
|
||||
|
||||
def test_nested_formatting_elements(self):
|
||||
self.assertSoupEquals("<em><em></em></em>")
|
||||
|
||||
def test_double_head(self):
|
||||
html = '''<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
<title>Ordinary HEAD element test</title>
|
||||
</head>
|
||||
<script type="text/javascript">
|
||||
alert("Help!");
|
||||
</script>
|
||||
<body>
|
||||
Hello, world!
|
||||
</body>
|
||||
</html>
|
||||
'''
|
||||
soup = self.soup(html)
|
||||
self.assertEqual("text/javascript", soup.find('script')['type'])
|
||||
|
||||
def test_comment(self):
|
||||
# Comments are represented as Comment objects.
|
||||
markup = "<p>foo<!--foobar-->baz</p>"
|
||||
self.assertSoupEquals(markup)
|
||||
|
||||
soup = self.soup(markup)
|
||||
comment = soup.find(text="foobar")
|
||||
self.assertEqual(comment.__class__, Comment)
|
||||
|
||||
# The comment is properly integrated into the tree.
|
||||
foo = soup.find(text="foo")
|
||||
self.assertEqual(comment, foo.next_element)
|
||||
baz = soup.find(text="baz")
|
||||
self.assertEqual(comment, baz.previous_element)
|
||||
|
||||
def test_preserved_whitespace_in_pre_and_textarea(self):
|
||||
"""Whitespace must be preserved in <pre> and <textarea> tags,
|
||||
even if that would mean not prettifying the markup.
|
||||
"""
|
||||
pre_markup = "<pre> </pre>"
|
||||
textarea_markup = "<textarea> woo\nwoo </textarea>"
|
||||
self.assertSoupEquals(pre_markup)
|
||||
self.assertSoupEquals(textarea_markup)
|
||||
|
||||
soup = self.soup(pre_markup)
|
||||
self.assertEqual(soup.pre.prettify(), pre_markup)
|
||||
|
||||
soup = self.soup(textarea_markup)
|
||||
self.assertEqual(soup.textarea.prettify(), textarea_markup)
|
||||
|
||||
soup = self.soup("<textarea></textarea>")
|
||||
self.assertEqual(soup.textarea.prettify(), "<textarea></textarea>")
|
||||
|
||||
def test_nested_inline_elements(self):
|
||||
"""Inline elements can be nested indefinitely."""
|
||||
b_tag = "<b>Inside a B tag</b>"
|
||||
self.assertSoupEquals(b_tag)
|
||||
|
||||
nested_b_tag = "<p>A <i>nested <b>tag</b></i></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
double_nested_b_tag = "<p>A <a>doubly <i>nested <b>tag</b></i></a></p>"
|
||||
self.assertSoupEquals(nested_b_tag)
|
||||
|
||||
def test_nested_block_level_elements(self):
|
||||
"""Block elements can be nested."""
|
||||
soup = self.soup('<blockquote><p><b>Foo</b></p></blockquote>')
|
||||
blockquote = soup.blockquote
|
||||
self.assertEqual(blockquote.p.b.string, 'Foo')
|
||||
self.assertEqual(blockquote.b.string, 'Foo')
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""One table can go inside another one."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tr><td>foo</td></tr></table>'
|
||||
'</td></tr></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_multivalued_attribute_with_whitespace(self):
|
||||
# Whitespace separating the values of a multi-valued attribute
|
||||
# should be ignored.
|
||||
|
||||
markup = '<div class=" foo bar "></a>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.div['class'])
|
||||
|
||||
# If you search by the literal name of the class it's like the whitespace
|
||||
# wasn't there.
|
||||
self.assertEqual(soup.div, soup.find('div', class_="foo bar"))
|
||||
|
||||
def test_deeply_nested_multivalued_attribute(self):
|
||||
# html5lib can set the attributes of the same tag many times
|
||||
# as it rearranges the tree. This has caused problems with
|
||||
# multivalued attributes.
|
||||
markup = '<table><div><div class="css"></div></div></table>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["css"], soup.div.div['class'])
|
||||
|
||||
def test_multivalued_attribute_on_html(self):
|
||||
# html5lib uses a different API to set the attributes ot the
|
||||
# <html> tag. This has caused problems with multivalued
|
||||
# attributes.
|
||||
markup = '<html class="a b"></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(["a", "b"], soup.html['class'])
|
||||
|
||||
def test_angle_brackets_in_attribute_values_are_escaped(self):
|
||||
self.assertSoupEquals('<a b="<a>"></a>', '<a b="<a>"></a>')
|
||||
|
||||
def test_strings_resembling_character_entity_references(self):
|
||||
# "&T" and "&p" look like incomplete character entities, but they are
|
||||
# not.
|
||||
self.assertSoupEquals(
|
||||
"<p>• AT&T is in the s&p 500</p>",
|
||||
"<p>\u2022 AT&T is in the s&p 500</p>"
|
||||
)
|
||||
|
||||
def test_apos_entity(self):
|
||||
self.assertSoupEquals(
|
||||
"<p>Bob's Bar</p>",
|
||||
"<p>Bob's Bar</p>",
|
||||
)
|
||||
|
||||
def test_entities_in_foreign_document_encoding(self):
|
||||
# “ and ” are invalid numeric entities referencing
|
||||
# Windows-1252 characters. - references a character common
|
||||
# to Windows-1252 and Unicode, and ☃ references a
|
||||
# character only found in Unicode.
|
||||
#
|
||||
# All of these entities should be converted to Unicode
|
||||
# characters.
|
||||
markup = "<p>“Hello” -☃</p>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("“Hello” -☃", soup.p.string)
|
||||
|
||||
def test_entities_in_attributes_converted_to_unicode(self):
|
||||
expect = '<p id="pi\N{LATIN SMALL LETTER N WITH TILDE}ata"></p>'
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
self.assertSoupEquals('<p id="piñata"></p>', expect)
|
||||
|
||||
def test_entities_in_text_converted_to_unicode(self):
|
||||
expect = '<p>pi\N{LATIN SMALL LETTER N WITH TILDE}ata</p>'
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
self.assertSoupEquals("<p>piñata</p>", expect)
|
||||
|
||||
def test_quot_entity_converted_to_quotation_mark(self):
|
||||
self.assertSoupEquals("<p>I said "good day!"</p>",
|
||||
'<p>I said "good day!"</p>')
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
expect = "\N{REPLACEMENT CHARACTER}"
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
self.assertSoupEquals("�", expect)
|
||||
|
||||
def test_multipart_strings(self):
|
||||
"Mostly to prevent a recurrence of a bug in the html5lib treebuilder."
|
||||
soup = self.soup("<html><h2>\nfoo</h2><p></p></html>")
|
||||
self.assertEqual("p", soup.h2.string.next_element.name)
|
||||
self.assertEqual("p", soup.p.name)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_empty_element_tags(self):
|
||||
"""Verify consistent handling of empty-element tags,
|
||||
no matter how they come in through the markup.
|
||||
"""
|
||||
self.assertSoupEquals('<br/><br/><br/>', "<br/><br/><br/>")
|
||||
self.assertSoupEquals('<br /><br /><br />', "<br/><br/><br/>")
|
||||
|
||||
def test_head_tag_between_head_and_body(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<html><head></head>
|
||||
<link></link>
|
||||
<body>foo</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertNotEqual(None, soup.html.body)
|
||||
self.assertConnectedness(soup)
|
||||
|
||||
def test_multiple_copies_of_a_tag(self):
|
||||
"Prevent recurrence of a bug in the html5lib treebuilder."
|
||||
content = """<!DOCTYPE html>
|
||||
<html>
|
||||
<body>
|
||||
<article id="a" >
|
||||
<div><a href="1"></div>
|
||||
<footer>
|
||||
<a href="2"></a>
|
||||
</footer>
|
||||
</article>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(content)
|
||||
self.assertConnectedness(soup.article)
|
||||
|
||||
def test_basic_namespaces(self):
|
||||
"""Parsers don't need to *understand* namespaces, but at the
|
||||
very least they should not choke on namespaces or lose
|
||||
data."""
|
||||
|
||||
markup = b'<html xmlns="http://www.w3.org/1999/xhtml" xmlns:mathml="http://www.w3.org/1998/Math/MathML" xmlns:svg="http://www.w3.org/2000/svg"><head></head><body><mathml:msqrt>4</mathml:msqrt><b svg:fill="red"></b></body></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode())
|
||||
html = soup.html
|
||||
self.assertEqual('http://www.w3.org/1999/xhtml', soup.html['xmlns'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/1998/Math/MathML', soup.html['xmlns:mathml'])
|
||||
self.assertEqual(
|
||||
'http://www.w3.org/2000/svg', soup.html['xmlns:svg'])
|
||||
|
||||
def test_multivalued_attribute_value_becomes_list(self):
|
||||
markup = b'<a class="foo bar">'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(['foo', 'bar'], soup.a['class'])
|
||||
|
||||
#
|
||||
# Generally speaking, tests below this point are more tests of
|
||||
# Beautiful Soup than tests of the tree builders. But parsers are
|
||||
# weird, so we run these tests separately for every tree builder
|
||||
# to detect any differences between them.
|
||||
#
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
# A seemingly innocuous document... but it's in Unicode! And
|
||||
# it contains characters that can't be represented in the
|
||||
# encoding found in the declaration! The horror!
|
||||
markup = '<html><head><meta encoding="euc-jp"></head><body>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</body>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual('Sacr\xe9 bleu!', soup.body.string)
|
||||
|
||||
def test_soupstrainer(self):
|
||||
"""Parsers should be able to work with SoupStrainers."""
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup("A <b>bold</b> <meta/> <i>statement</i>",
|
||||
parse_only=strainer)
|
||||
self.assertEqual(soup.decode(), "<b>bold</b>")
|
||||
|
||||
def test_single_quote_attribute_values_become_double_quotes(self):
|
||||
self.assertSoupEquals("<foo attr='bar'></foo>",
|
||||
'<foo attr="bar"></foo>')
|
||||
|
||||
def test_attribute_values_with_nested_quotes_are_left_alone(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
self.assertSoupEquals(text)
|
||||
|
||||
def test_attribute_values_with_double_nested_quotes_get_quoted(self):
|
||||
text = """<foo attr='bar "brawls" happen'>a</foo>"""
|
||||
soup = self.soup(text)
|
||||
soup.foo['attr'] = 'Brawls happen at "Bob\'s Bar"'
|
||||
self.assertSoupEquals(
|
||||
soup.foo.decode(),
|
||||
"""<foo attr="Brawls happen at "Bob\'s Bar"">a</foo>""")
|
||||
|
||||
def test_ampersand_in_attribute_value_gets_escaped(self):
|
||||
self.assertSoupEquals('<this is="really messed up & stuff"></this>',
|
||||
'<this is="really messed up & stuff"></this>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>',
|
||||
'<a href="http://example.org?a=1&b=2;3">foo</a>')
|
||||
|
||||
def test_escaped_ampersand_in_attribute_value_is_left_alone(self):
|
||||
self.assertSoupEquals('<a href="http://example.org?a=1&b=2;3"></a>')
|
||||
|
||||
def test_entities_in_strings_converted_during_parsing(self):
|
||||
# Both XML and HTML entities are converted to Unicode characters
|
||||
# during parsing.
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>"
|
||||
self.assertSoupEquals(text, expected)
|
||||
|
||||
def test_smart_quotes_converted_on_the_way_in(self):
|
||||
# Microsoft smart quotes are converted to Unicode characters during
|
||||
# parsing.
|
||||
quote = b"<p>\x91Foo\x92</p>"
|
||||
soup = self.soup(quote)
|
||||
self.assertEqual(
|
||||
soup.p.string,
|
||||
"\N{LEFT SINGLE QUOTATION MARK}Foo\N{RIGHT SINGLE QUOTATION MARK}")
|
||||
|
||||
def test_non_breaking_spaces_converted_on_the_way_in(self):
|
||||
soup = self.soup("<a> </a>")
|
||||
self.assertEqual(soup.a.string, "\N{NO-BREAK SPACE}" * 2)
|
||||
|
||||
def test_entities_converted_on_the_way_out(self):
|
||||
text = "<p><<sacré bleu!>></p>"
|
||||
expected = "<p><<sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!>></p>".encode("utf-8")
|
||||
soup = self.soup(text)
|
||||
self.assertEqual(soup.p.encode("utf-8"), expected)
|
||||
|
||||
def test_real_iso_latin_document(self):
|
||||
# Smoke test of interrelated functionality, using an
|
||||
# easy-to-understand document.
|
||||
|
||||
# Here it is in Unicode. Note that it claims to be in ISO-Latin-1.
|
||||
unicode_html = '<html><head><meta content="text/html; charset=ISO-Latin-1" http-equiv="Content-type"/></head><body><p>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</p></body></html>'
|
||||
|
||||
# That's because we're going to encode it into ISO-Latin-1, and use
|
||||
# that to test.
|
||||
iso_latin_html = unicode_html.encode("iso-8859-1")
|
||||
|
||||
# Parse the ISO-Latin-1 HTML.
|
||||
soup = self.soup(iso_latin_html)
|
||||
# Encode it to UTF-8.
|
||||
result = soup.encode("utf-8")
|
||||
|
||||
# What do we expect the result to look like? Well, it would
|
||||
# look like unicode_html, except that the META tag would say
|
||||
# UTF-8 instead of ISO-Latin-1.
|
||||
expected = unicode_html.replace("ISO-Latin-1", "utf-8")
|
||||
|
||||
# And, of course, it would be in UTF-8, not Unicode.
|
||||
expected = expected.encode("utf-8")
|
||||
|
||||
# Ta-da!
|
||||
self.assertEqual(result, expected)
|
||||
|
||||
def test_real_shift_jis_document(self):
|
||||
# Smoke test to make sure the parser can handle a document in
|
||||
# Shift-JIS encoding, without choking.
|
||||
shift_jis_html = (
|
||||
b'<html><head></head><body><pre>'
|
||||
b'\x82\xb1\x82\xea\x82\xcdShift-JIS\x82\xc5\x83R\x81[\x83f'
|
||||
b'\x83B\x83\x93\x83O\x82\xb3\x82\xea\x82\xbd\x93\xfa\x96{\x8c'
|
||||
b'\xea\x82\xcc\x83t\x83@\x83C\x83\x8b\x82\xc5\x82\xb7\x81B'
|
||||
b'</pre></body></html>')
|
||||
unicode_html = shift_jis_html.decode("shift-jis")
|
||||
soup = self.soup(unicode_html)
|
||||
|
||||
# Make sure the parse tree is correctly encoded to various
|
||||
# encodings.
|
||||
self.assertEqual(soup.encode("utf-8"), unicode_html.encode("utf-8"))
|
||||
self.assertEqual(soup.encode("euc_jp"), unicode_html.encode("euc_jp"))
|
||||
|
||||
def test_real_hebrew_document(self):
|
||||
# A real-world test to make sure we can convert ISO-8859-9 (a
|
||||
# Hebrew encoding) to UTF-8.
|
||||
hebrew_document = b'<html><head><title>Hebrew (ISO 8859-8) in Visual Directionality</title></head><body><h1>Hebrew (ISO 8859-8) in Visual Directionality</h1>\xed\xe5\xec\xf9</body></html>'
|
||||
soup = self.soup(
|
||||
hebrew_document, from_encoding="iso8859-8")
|
||||
# Some tree builders call it iso8859-8, others call it iso-8859-9.
|
||||
# That's not a difference we really care about.
|
||||
assert soup.original_encoding in ('iso8859-8', 'iso-8859-8')
|
||||
self.assertEqual(
|
||||
soup.encode('utf-8'),
|
||||
hebrew_document.decode("iso8859-8").encode("utf-8"))
|
||||
|
||||
def test_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta content="text/html; charset=x-sjis" '
|
||||
'http-equiv="Content-type"/>')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', {'http-equiv': 'Content-type'})
|
||||
content = parsed_meta['content']
|
||||
self.assertEqual('text/html; charset=x-sjis', content)
|
||||
|
||||
# But that value is actually a ContentMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(content, ContentMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('text/html; charset=utf8', content.encode("utf8"))
|
||||
|
||||
# For the rest of the story, see TestSubstitutions in
|
||||
# test_tree.py.
|
||||
|
||||
def test_html5_style_meta_tag_reflects_current_encoding(self):
|
||||
# Here's the <meta> tag saying that a document is
|
||||
# encoded in Shift-JIS.
|
||||
meta_tag = ('<meta id="encoding" charset="x-sjis" />')
|
||||
|
||||
# Here's a document incorporating that meta tag.
|
||||
shift_jis_html = (
|
||||
'<html><head>\n%s\n'
|
||||
'<meta http-equiv="Content-language" content="ja"/>'
|
||||
'</head><body>Shift-JIS markup goes here.') % meta_tag
|
||||
soup = self.soup(shift_jis_html)
|
||||
|
||||
# Parse the document, and the charset is seemingly unaffected.
|
||||
parsed_meta = soup.find('meta', id="encoding")
|
||||
charset = parsed_meta['charset']
|
||||
self.assertEqual('x-sjis', charset)
|
||||
|
||||
# But that value is actually a CharsetMetaAttributeValue object.
|
||||
self.assertTrue(isinstance(charset, CharsetMetaAttributeValue))
|
||||
|
||||
# And it will take on a value that reflects its current
|
||||
# encoding.
|
||||
self.assertEqual('utf8', charset.encode("utf8"))
|
||||
|
||||
def test_tag_with_no_attributes_can_have_attributes_added(self):
|
||||
data = self.soup("<a>text</a>")
|
||||
data.a['foo'] = 'bar'
|
||||
self.assertEqual('<a foo="bar">text</a>', data.a.decode())
|
||||
|
||||
def test_worst_case(self):
|
||||
"""Test the worst case (currently) for linking issues."""
|
||||
|
||||
soup = self.soup(BAD_DOCUMENT)
|
||||
self.linkage_validator(soup)
|
||||
|
||||
|
||||
class XMLTreeBuilderSmokeTest(object):
|
||||
|
||||
def test_pickle_and_unpickle_identity(self):
|
||||
# Pickling a tree, then unpickling it, yields a tree identical
|
||||
# to the original.
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertEqual(loaded.__class__, BeautifulSoup)
|
||||
self.assertEqual(loaded.decode(), tree.decode())
|
||||
|
||||
def test_docstring_generated(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode(), b'<?xml version="1.0" encoding="utf-8"?>\n<root/>')
|
||||
|
||||
def test_xml_declaration(self):
|
||||
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<foo/>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_processing_instruction(self):
|
||||
markup = b"""<?xml version="1.0" encoding="utf8"?>\n<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(markup, soup.encode("utf8"))
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
"""A real XHTML document should come out *exactly* the same as it went in."""
|
||||
markup = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN">
|
||||
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||
<head><title>Hello.</title></head>
|
||||
<body>Goodbye.</body>
|
||||
</html>"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
soup.encode("utf-8"), markup)
|
||||
|
||||
def test_nested_namespaces(self):
|
||||
doc = b"""<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
|
||||
<parent xmlns="http://ns1/">
|
||||
<child xmlns="http://ns2/" xmlns:ns3="http://ns3/">
|
||||
<grandchild ns3:attr="value" xmlns="http://ns4/"/>
|
||||
</child>
|
||||
</parent>"""
|
||||
soup = self.soup(doc)
|
||||
self.assertEqual(doc, soup.encode())
|
||||
|
||||
def test_formatter_processes_script_tag_for_xml_documents(self):
|
||||
doc = """
|
||||
<script type="text/javascript">
|
||||
</script>
|
||||
"""
|
||||
soup = BeautifulSoup(doc, "lxml-xml")
|
||||
# lxml would have stripped this while parsing, but we can add
|
||||
# it later.
|
||||
soup.script.string = 'console.log("< < hey > > ");'
|
||||
encoded = soup.encode()
|
||||
self.assertTrue(b"< < hey > >" in encoded)
|
||||
|
||||
def test_can_parse_unicode_document(self):
|
||||
markup = '<?xml version="1.0" encoding="euc-jp"><root>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</root>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual('Sacr\xe9 bleu!', soup.root.string)
|
||||
|
||||
def test_popping_namespaced_tag(self):
|
||||
markup = '<rss xmlns:dc="foo"><dc:creator>b</dc:creator><dc:date>2012-07-02T20:33:42Z</dc:date><dc:rights>c</dc:rights><image>d</image></rss>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(
|
||||
str(soup.rss), markup)
|
||||
|
||||
def test_docstring_includes_correct_encoding(self):
|
||||
soup = self.soup("<root/>")
|
||||
self.assertEqual(
|
||||
soup.encode("latin1"),
|
||||
b'<?xml version="1.0" encoding="latin1"?>\n<root/>')
|
||||
|
||||
def test_large_xml_document(self):
|
||||
"""A large XML document should come out the same as it went in."""
|
||||
markup = (b'<?xml version="1.0" encoding="utf-8"?>\n<root>'
|
||||
+ b'0' * (2**12)
|
||||
+ b'</root>')
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(soup.encode("utf-8"), markup)
|
||||
|
||||
|
||||
def test_tags_are_empty_element_if_and_only_if_they_are_empty(self):
|
||||
self.assertSoupEquals("<p>", "<p/>")
|
||||
self.assertSoupEquals("<p>foo</p>")
|
||||
|
||||
def test_namespaces_are_preserved(self):
|
||||
markup = '<root xmlns:a="http://example.com/" xmlns:b="http://example.net/"><a:foo>This tag is in the a namespace</a:foo><b:foo>This tag is in the b namespace</b:foo></root>'
|
||||
soup = self.soup(markup)
|
||||
root = soup.root
|
||||
self.assertEqual("http://example.com/", root['xmlns:a'])
|
||||
self.assertEqual("http://example.net/", root['xmlns:b'])
|
||||
|
||||
def test_closing_namespaced_tag(self):
|
||||
markup = '<p xmlns:dc="http://purl.org/dc/elements/1.1/"><dc:date>20010504</dc:date></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(str(soup.p), markup)
|
||||
|
||||
def test_namespaced_attributes(self):
|
||||
markup = '<foo xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"><bar xsi:schemaLocation="http://www.example.com"/></foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(str(soup.foo), markup)
|
||||
|
||||
def test_namespaced_attributes_xml_namespace(self):
|
||||
markup = '<foo xml:lang="fr">bar</foo>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual(str(soup.foo), markup)
|
||||
|
||||
def test_find_by_prefixed_name(self):
|
||||
doc = """<?xml version="1.0" encoding="utf-8"?>
|
||||
<Document xmlns="http://example.com/ns0"
|
||||
xmlns:ns1="http://example.com/ns1"
|
||||
xmlns:ns2="http://example.com/ns2"
|
||||
<ns1:tag>foo</ns1:tag>
|
||||
<ns1:tag>bar</ns1:tag>
|
||||
<ns2:tag key="value">baz</ns2:tag>
|
||||
</Document>
|
||||
"""
|
||||
soup = self.soup(doc)
|
||||
|
||||
# There are three <tag> tags.
|
||||
self.assertEqual(3, len(soup.find_all('tag')))
|
||||
|
||||
# But two of them are ns1:tag and one of them is ns2:tag.
|
||||
self.assertEqual(2, len(soup.find_all('ns1:tag')))
|
||||
self.assertEqual(1, len(soup.find_all('ns2:tag')))
|
||||
|
||||
self.assertEqual(1, len(soup.find_all('ns2:tag', key='value')))
|
||||
self.assertEqual(3, len(soup.find_all(['ns1:tag', 'ns2:tag'])))
|
||||
|
||||
def test_copy_tag_preserves_namespace(self):
|
||||
xml = """<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
|
||||
<w:document xmlns:w="http://example.com/ns0"/>"""
|
||||
|
||||
soup = self.soup(xml)
|
||||
tag = soup.document
|
||||
duplicate = copy.copy(tag)
|
||||
|
||||
# The two tags have the same namespace prefix.
|
||||
self.assertEqual(tag.prefix, duplicate.prefix)
|
||||
|
||||
def test_worst_case(self):
|
||||
"""Test the worst case (currently) for linking issues."""
|
||||
|
||||
soup = self.soup(BAD_DOCUMENT)
|
||||
self.linkage_validator(soup)
|
||||
|
||||
|
||||
class HTML5TreeBuilderSmokeTest(HTMLTreeBuilderSmokeTest):
|
||||
"""Smoke test for a tree builder that supports HTML5."""
|
||||
|
||||
def test_real_xhtml_document(self):
|
||||
# Since XHTML is not HTML5, HTML5 parsers are not tested to handle
|
||||
# XHTML documents in any particular way.
|
||||
pass
|
||||
|
||||
def test_html_tags_have_namespace(self):
|
||||
markup = "<a>"
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("http://www.w3.org/1999/xhtml", soup.a.namespace)
|
||||
|
||||
def test_svg_tags_have_namespace(self):
|
||||
markup = '<svg><circle/></svg>'
|
||||
soup = self.soup(markup)
|
||||
namespace = "http://www.w3.org/2000/svg"
|
||||
self.assertEqual(namespace, soup.svg.namespace)
|
||||
self.assertEqual(namespace, soup.circle.namespace)
|
||||
|
||||
|
||||
def test_mathml_tags_have_namespace(self):
|
||||
markup = '<math><msqrt>5</msqrt></math>'
|
||||
soup = self.soup(markup)
|
||||
namespace = 'http://www.w3.org/1998/Math/MathML'
|
||||
self.assertEqual(namespace, soup.math.namespace)
|
||||
self.assertEqual(namespace, soup.msqrt.namespace)
|
||||
|
||||
def test_xml_declaration_becomes_comment(self):
|
||||
markup = '<?xml version="1.0" encoding="utf-8"?><html></html>'
|
||||
soup = self.soup(markup)
|
||||
self.assertTrue(isinstance(soup.contents[0], Comment))
|
||||
self.assertEqual(soup.contents[0], '?xml version="1.0" encoding="utf-8"?')
|
||||
self.assertEqual("html", soup.contents[0].next_element.name)
|
||||
|
||||
def skipIf(condition, reason):
|
||||
def nothing(test, *args, **kwargs):
|
||||
return None
|
||||
|
||||
def decorator(test_item):
|
||||
if condition:
|
||||
return nothing
|
||||
else:
|
||||
return test_item
|
||||
|
||||
return decorator
|
||||
@@ -1 +0,0 @@
|
||||
"The beautifulsoup tests."
|
||||
@@ -1,147 +0,0 @@
|
||||
"""Tests of the builder registry."""
|
||||
|
||||
import unittest
|
||||
import warnings
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
from bs4.builder import (
|
||||
builder_registry as registry,
|
||||
HTMLParserTreeBuilder,
|
||||
TreeBuilderRegistry,
|
||||
)
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError:
|
||||
HTML5LIB_PRESENT = False
|
||||
|
||||
try:
|
||||
from bs4.builder import (
|
||||
LXMLTreeBuilderForXML,
|
||||
LXMLTreeBuilder,
|
||||
)
|
||||
LXML_PRESENT = True
|
||||
except ImportError:
|
||||
LXML_PRESENT = False
|
||||
|
||||
|
||||
class BuiltInRegistryTest(unittest.TestCase):
|
||||
"""Test the built-in registry with the default builders registered."""
|
||||
|
||||
def test_combination(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('fast', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('permissive', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('strict', 'html'),
|
||||
HTMLParserTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib', 'html'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
def test_lookup_by_markup_type(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), LXMLTreeBuilder)
|
||||
self.assertEqual(registry.lookup('xml'), LXMLTreeBuilderForXML)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('xml'), None)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html'), HTML5TreeBuilder)
|
||||
else:
|
||||
self.assertEqual(registry.lookup('html'), HTMLParserTreeBuilder)
|
||||
|
||||
def test_named_library(self):
|
||||
if LXML_PRESENT:
|
||||
self.assertEqual(registry.lookup('lxml', 'xml'),
|
||||
LXMLTreeBuilderForXML)
|
||||
self.assertEqual(registry.lookup('lxml', 'html'),
|
||||
LXMLTreeBuilder)
|
||||
if HTML5LIB_PRESENT:
|
||||
self.assertEqual(registry.lookup('html5lib'),
|
||||
HTML5TreeBuilder)
|
||||
|
||||
self.assertEqual(registry.lookup('html.parser'),
|
||||
HTMLParserTreeBuilder)
|
||||
|
||||
def test_beautifulsoup_constructor_does_lookup(self):
|
||||
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
# This will create a warning about not explicitly
|
||||
# specifying a parser, but we'll ignore it.
|
||||
|
||||
# You can pass in a string.
|
||||
BeautifulSoup("", features="html")
|
||||
# Or a list of strings.
|
||||
BeautifulSoup("", features=["html", "fast"])
|
||||
|
||||
# You'll get an exception if BS can't find an appropriate
|
||||
# builder.
|
||||
self.assertRaises(ValueError, BeautifulSoup,
|
||||
"", features="no-such-feature")
|
||||
|
||||
class RegistryTest(unittest.TestCase):
|
||||
"""Test the TreeBuilderRegistry class in general."""
|
||||
|
||||
def setUp(self):
|
||||
self.registry = TreeBuilderRegistry()
|
||||
|
||||
def builder_for_features(self, *feature_list):
|
||||
cls = type('Builder_' + '_'.join(feature_list),
|
||||
(object,), {'features' : feature_list})
|
||||
|
||||
self.registry.register(cls)
|
||||
return cls
|
||||
|
||||
def test_register_with_no_features(self):
|
||||
builder = self.builder_for_features()
|
||||
|
||||
# Since the builder advertises no features, you can't find it
|
||||
# by looking up features.
|
||||
self.assertEqual(self.registry.lookup('foo'), None)
|
||||
|
||||
# But you can find it by doing a lookup with no features, if
|
||||
# this happens to be the only registered builder.
|
||||
self.assertEqual(self.registry.lookup(), builder)
|
||||
|
||||
def test_register_with_features_makes_lookup_succeed(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('foo'), builder)
|
||||
self.assertEqual(self.registry.lookup('bar'), builder)
|
||||
|
||||
def test_lookup_fails_when_no_builder_implements_feature(self):
|
||||
builder = self.builder_for_features('foo', 'bar')
|
||||
self.assertEqual(self.registry.lookup('baz'), None)
|
||||
|
||||
def test_lookup_gets_most_recent_registration_when_no_feature_specified(self):
|
||||
builder1 = self.builder_for_features('foo')
|
||||
builder2 = self.builder_for_features('bar')
|
||||
self.assertEqual(self.registry.lookup(), builder2)
|
||||
|
||||
def test_lookup_fails_when_no_tree_builders_registered(self):
|
||||
self.assertEqual(self.registry.lookup(), None)
|
||||
|
||||
def test_lookup_gets_most_recent_builder_supporting_all_features(self):
|
||||
has_one = self.builder_for_features('foo')
|
||||
has_the_other = self.builder_for_features('bar')
|
||||
has_both_early = self.builder_for_features('foo', 'bar', 'baz')
|
||||
has_both_late = self.builder_for_features('foo', 'bar', 'quux')
|
||||
lacks_one = self.builder_for_features('bar')
|
||||
has_the_other = self.builder_for_features('foo')
|
||||
|
||||
# There are two builders featuring 'foo' and 'bar', but
|
||||
# the one that also features 'quux' was registered later.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar'),
|
||||
has_both_late)
|
||||
|
||||
# There is only one builder featuring 'foo', 'bar', and 'baz'.
|
||||
self.assertEqual(self.registry.lookup('foo', 'bar', 'baz'),
|
||||
has_both_early)
|
||||
|
||||
def test_lookup_fails_when_cannot_reconcile_requested_features(self):
|
||||
builder1 = self.builder_for_features('foo', 'bar')
|
||||
builder2 = self.builder_for_features('foo', 'baz')
|
||||
self.assertEqual(self.registry.lookup('bar', 'baz'), None)
|
||||
@@ -1,36 +0,0 @@
|
||||
"Test harness for doctests."
|
||||
|
||||
# pylint: disable-msg=E0611,W0142
|
||||
|
||||
__metaclass__ = type
|
||||
__all__ = [
|
||||
'additional_tests',
|
||||
]
|
||||
|
||||
import atexit
|
||||
import doctest
|
||||
import os
|
||||
#from pkg_resources import (
|
||||
# resource_filename, resource_exists, resource_listdir, cleanup_resources)
|
||||
import unittest
|
||||
|
||||
DOCTEST_FLAGS = (
|
||||
doctest.ELLIPSIS |
|
||||
doctest.NORMALIZE_WHITESPACE |
|
||||
doctest.REPORT_NDIFF)
|
||||
|
||||
|
||||
# def additional_tests():
|
||||
# "Run the doc tests (README.txt and docs/*, if any exist)"
|
||||
# doctest_files = [
|
||||
# os.path.abspath(resource_filename('bs4', 'README.txt'))]
|
||||
# if resource_exists('bs4', 'docs'):
|
||||
# for name in resource_listdir('bs4', 'docs'):
|
||||
# if name.endswith('.txt'):
|
||||
# doctest_files.append(
|
||||
# os.path.abspath(
|
||||
# resource_filename('bs4', 'docs/%s' % name)))
|
||||
# kwargs = dict(module_relative=False, optionflags=DOCTEST_FLAGS)
|
||||
# atexit.register(cleanup_resources)
|
||||
# return unittest.TestSuite((
|
||||
# doctest.DocFileSuite(*doctest_files, **kwargs)))
|
||||
@@ -1,170 +0,0 @@
|
||||
"""Tests to ensure that the html5lib tree builder generates good trees."""
|
||||
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import HTML5TreeBuilder
|
||||
HTML5LIB_PRESENT = True
|
||||
except ImportError as e:
|
||||
HTML5LIB_PRESENT = False
|
||||
from bs4.element import SoupStrainer
|
||||
from bs4.testing import (
|
||||
HTML5TreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not HTML5LIB_PRESENT,
|
||||
"html5lib seems not to be present, not testing its tree builder.")
|
||||
class HTML5LibBuilderSmokeTest(SoupTest, HTML5TreeBuilderSmokeTest):
|
||||
"""See ``HTML5TreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return HTML5TreeBuilder
|
||||
|
||||
def test_soupstrainer(self):
|
||||
# The html5lib tree builder does not support SoupStrainers.
|
||||
strainer = SoupStrainer("b")
|
||||
markup = "<p>A <b>bold</b> statement.</p>"
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(
|
||||
soup.decode(), self.document_for(markup))
|
||||
|
||||
self.assertTrue(
|
||||
"the html5lib tree builder doesn't support parse_only" in
|
||||
str(w[0].message))
|
||||
|
||||
def test_correctly_nested_tables(self):
|
||||
"""html5lib inserts <tbody> tags where other parsers don't."""
|
||||
markup = ('<table id="1">'
|
||||
'<tr>'
|
||||
"<td>Here's another table:"
|
||||
'<table id="2">'
|
||||
'<tr><td>foo</td></tr>'
|
||||
'</table></td>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
markup,
|
||||
'<table id="1"><tbody><tr><td>Here\'s another table:'
|
||||
'<table id="2"><tbody><tr><td>foo</td></tr></tbody></table>'
|
||||
'</td></tr></tbody></table>')
|
||||
|
||||
self.assertSoupEquals(
|
||||
"<table><thead><tr><td>Foo</td></tr></thead>"
|
||||
"<tbody><tr><td>Bar</td></tr></tbody>"
|
||||
"<tfoot><tr><td>Baz</td></tr></tfoot></table>")
|
||||
|
||||
def test_xml_declaration_followed_by_doctype(self):
|
||||
markup = '''<?xml version="1.0" encoding="utf-8"?>
|
||||
<!DOCTYPE html>
|
||||
<html>
|
||||
<head>
|
||||
</head>
|
||||
<body>
|
||||
<p>foo</p>
|
||||
</body>
|
||||
</html>'''
|
||||
soup = self.soup(markup)
|
||||
# Verify that we can reach the <p> tag; this means the tree is connected.
|
||||
self.assertEqual(b"<p>foo</p>", soup.p.encode())
|
||||
|
||||
def test_reparented_markup(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p></body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
|
||||
def test_reparented_markup_ends_with_whitespace(self):
|
||||
markup = '<p><em>foo</p>\n<p>bar<a></a></em></p>\n'
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("<body><p><em>foo</em></p><em>\n</em><p><em>bar<a></a></em></p>\n</body>", soup.body.decode())
|
||||
self.assertEqual(2, len(soup.find_all('p')))
|
||||
|
||||
def test_reparented_markup_containing_identical_whitespace_nodes(self):
|
||||
"""Verify that we keep the two whitespace nodes in this
|
||||
document distinct when reparenting the adjacent <tbody> tags.
|
||||
"""
|
||||
markup = '<table> <tbody><tbody><ims></tbody> </table>'
|
||||
soup = self.soup(markup)
|
||||
space1, space2 = soup.find_all(string=' ')
|
||||
tbody1, tbody2 = soup.find_all('tbody')
|
||||
assert space1.next_element is tbody1
|
||||
assert tbody2.next_element is space2
|
||||
|
||||
def test_reparented_markup_containing_children(self):
|
||||
markup = '<div><a>aftermath<p><noscript>target</noscript>aftermath</a></p></div>'
|
||||
soup = self.soup(markup)
|
||||
noscript = soup.noscript
|
||||
self.assertEqual("target", noscript.next_element)
|
||||
target = soup.find(string='target')
|
||||
|
||||
# The 'aftermath' string was duplicated; we want the second one.
|
||||
final_aftermath = soup.find_all(string='aftermath')[-1]
|
||||
|
||||
# The <noscript> tag was moved beneath a copy of the <a> tag,
|
||||
# but the 'target' string within is still connected to the
|
||||
# (second) 'aftermath' string.
|
||||
self.assertEqual(final_aftermath, target.next_element)
|
||||
self.assertEqual(target, final_aftermath.previous_element)
|
||||
|
||||
def test_processing_instruction(self):
|
||||
"""Processing instructions become comments."""
|
||||
markup = b"""<?PITarget PIContent?>"""
|
||||
soup = self.soup(markup)
|
||||
assert str(soup).startswith("<!--?PITarget PIContent?-->")
|
||||
|
||||
def test_cloned_multivalue_node(self):
|
||||
markup = b"""<a class="my_class"><p></a>"""
|
||||
soup = self.soup(markup)
|
||||
a1, a2 = soup.find_all('a')
|
||||
self.assertEqual(a1, a2)
|
||||
assert a1 is not a2
|
||||
|
||||
def test_foster_parenting(self):
|
||||
markup = b"""<table><td></tbody>A"""
|
||||
soup = self.soup(markup)
|
||||
self.assertEqual("<body>A<table><tbody><tr><td></td></tr></tbody></table></body>", soup.body.decode())
|
||||
|
||||
def test_extraction(self):
|
||||
"""
|
||||
Test that extraction does not destroy the tree.
|
||||
|
||||
https://bugs.launchpad.net/beautifulsoup/+bug/1782928
|
||||
"""
|
||||
|
||||
markup = """
|
||||
<html><head></head>
|
||||
<style>
|
||||
</style><script></script><body><p>hello</p></body></html>
|
||||
"""
|
||||
soup = self.soup(markup)
|
||||
[s.extract() for s in soup('script')]
|
||||
[s.extract() for s in soup('style')]
|
||||
|
||||
self.assertEqual(len(soup.find_all("p")), 1)
|
||||
|
||||
def test_empty_comment(self):
|
||||
"""
|
||||
Test that empty comment does not break structure.
|
||||
|
||||
https://bugs.launchpad.net/beautifulsoup/+bug/1806598
|
||||
"""
|
||||
|
||||
markup = """
|
||||
<html>
|
||||
<body>
|
||||
<form>
|
||||
<!----><input type="text">
|
||||
</form>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
soup = self.soup(markup)
|
||||
inputs = []
|
||||
for form in soup.find_all('form'):
|
||||
inputs.extend(form.find_all('input'))
|
||||
self.assertEqual(len(inputs), 1)
|
||||
@@ -1,47 +0,0 @@
|
||||
"""Tests to ensure that the html.parser tree builder generates good
|
||||
trees."""
|
||||
|
||||
from pdb import set_trace
|
||||
import pickle
|
||||
from bs4.testing import SoupTest, HTMLTreeBuilderSmokeTest
|
||||
from bs4.builder import HTMLParserTreeBuilder
|
||||
from bs4.builder._htmlparser import BeautifulSoupHTMLParser
|
||||
|
||||
class HTMLParserTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
|
||||
default_builder = HTMLParserTreeBuilder
|
||||
|
||||
def test_namespaced_system_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_namespaced_public_doctype(self):
|
||||
# html.parser can't handle namespaced doctypes, so skip this one.
|
||||
pass
|
||||
|
||||
def test_builder_is_pickled(self):
|
||||
"""Unlike most tree builders, HTMLParserTreeBuilder and will
|
||||
be restored after pickling.
|
||||
"""
|
||||
tree = self.soup("<a><b>foo</a>")
|
||||
dumped = pickle.dumps(tree, 2)
|
||||
loaded = pickle.loads(dumped)
|
||||
self.assertTrue(isinstance(loaded.builder, type(tree.builder)))
|
||||
|
||||
def test_redundant_empty_element_closing_tags(self):
|
||||
self.assertSoupEquals('<br></br><br></br><br></br>', "<br/><br/><br/>")
|
||||
self.assertSoupEquals('</br></br></br>', "")
|
||||
|
||||
def test_empty_element(self):
|
||||
# This verifies that any buffered data present when the parser
|
||||
# finishes working is handled.
|
||||
self.assertSoupEquals("foo &# bar", "foo &# bar")
|
||||
|
||||
|
||||
class TestHTMLParserSubclass(SoupTest):
|
||||
def test_error(self):
|
||||
"""Verify that our HTMLParser subclass implements error() in a way
|
||||
that doesn't cause a crash.
|
||||
"""
|
||||
parser = BeautifulSoupHTMLParser()
|
||||
parser.error("don't crash")
|
||||
@@ -1,100 +0,0 @@
|
||||
"""Tests to ensure that the lxml tree builder generates good trees."""
|
||||
|
||||
import re
|
||||
import warnings
|
||||
|
||||
try:
|
||||
import lxml.etree
|
||||
LXML_PRESENT = True
|
||||
LXML_VERSION = lxml.etree.LXML_VERSION
|
||||
except ImportError as e:
|
||||
LXML_PRESENT = False
|
||||
LXML_VERSION = (0,)
|
||||
|
||||
if LXML_PRESENT:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import Comment, Doctype, SoupStrainer
|
||||
from bs4.testing import skipIf
|
||||
from bs4.tests import test_htmlparser
|
||||
from bs4.testing import (
|
||||
HTMLTreeBuilderSmokeTest,
|
||||
XMLTreeBuilderSmokeTest,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its tree builder.")
|
||||
class LXMLTreeBuilderSmokeTest(SoupTest, HTMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilder
|
||||
|
||||
def test_out_of_range_entity(self):
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
self.assertSoupEquals(
|
||||
"<p>foo�bar</p>", "<p>foobar</p>")
|
||||
|
||||
def test_entities_in_foreign_document_encoding(self):
|
||||
# We can't implement this case correctly because by the time we
|
||||
# hear about markup like "“", it's been (incorrectly) converted into
|
||||
# a string like u'\x93'
|
||||
pass
|
||||
|
||||
# In lxml < 2.3.5, an empty doctype causes a segfault. Skip this
|
||||
# test if an old version of lxml is installed.
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT or LXML_VERSION < (2,3,5,0),
|
||||
"Skipping doctype test for old version of lxml to avoid segfault.")
|
||||
def test_empty_doctype(self):
|
||||
soup = self.soup("<!DOCTYPE>")
|
||||
doctype = soup.contents[0]
|
||||
self.assertEqual("", doctype.strip())
|
||||
|
||||
def test_beautifulstonesoup_is_xml_parser(self):
|
||||
# Make sure that the deprecated BSS class uses an xml builder
|
||||
# if one is installed.
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = BeautifulStoneSoup("<b />")
|
||||
self.assertEqual("<b/>", str(soup.b))
|
||||
self.assertTrue("BeautifulStoneSoup class is deprecated" in str(w[0].message))
|
||||
|
||||
@skipIf(
|
||||
not LXML_PRESENT,
|
||||
"lxml seems not to be present, not testing its XML tree builder.")
|
||||
class LXMLXMLTreeBuilderSmokeTest(SoupTest, XMLTreeBuilderSmokeTest):
|
||||
"""See ``HTMLTreeBuilderSmokeTest``."""
|
||||
|
||||
@property
|
||||
def default_builder(self):
|
||||
return LXMLTreeBuilderForXML
|
||||
|
||||
def test_namespace_indexing(self):
|
||||
# We should not track un-prefixed namespaces as we can only hold one
|
||||
# and it will be recognized as the default namespace by soupsieve,
|
||||
# which may be confusing in some situations. When no namespace is provided
|
||||
# for a selector, the default namespace (if defined) is assumed.
|
||||
|
||||
soup = self.soup(
|
||||
'<?xml version="1.1"?>\n'
|
||||
'<root>'
|
||||
'<tag xmlns="http://unprefixed-namespace.com">content</tag>'
|
||||
'<prefix:tag xmlns:prefix="http://prefixed-namespace.com">content</tag>'
|
||||
'</root>'
|
||||
)
|
||||
self.assertEqual(
|
||||
soup._namespaces,
|
||||
{'xml': 'http://www.w3.org/XML/1998/namespace', 'prefix': 'http://prefixed-namespace.com'}
|
||||
)
|
||||
@@ -1,567 +0,0 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
"""Tests of Beautiful Soup as a whole."""
|
||||
|
||||
from pdb import set_trace
|
||||
import logging
|
||||
import unittest
|
||||
import sys
|
||||
import tempfile
|
||||
|
||||
from bs4 import (
|
||||
BeautifulSoup,
|
||||
BeautifulStoneSoup,
|
||||
)
|
||||
from bs4.element import (
|
||||
CharsetMetaAttributeValue,
|
||||
ContentMetaAttributeValue,
|
||||
SoupStrainer,
|
||||
NamespacedAttribute,
|
||||
)
|
||||
import bs4.dammit
|
||||
from bs4.dammit import (
|
||||
EntitySubstitution,
|
||||
UnicodeDammit,
|
||||
EncodingDetector,
|
||||
)
|
||||
from bs4.testing import (
|
||||
default_builder,
|
||||
SoupTest,
|
||||
skipIf,
|
||||
)
|
||||
import warnings
|
||||
|
||||
try:
|
||||
from bs4.builder import LXMLTreeBuilder, LXMLTreeBuilderForXML
|
||||
LXML_PRESENT = True
|
||||
except ImportError as e:
|
||||
LXML_PRESENT = False
|
||||
|
||||
PYTHON_3_PRE_3_2 = (sys.version_info[0] == 3 and sys.version_info < (3,2))
|
||||
|
||||
class TestConstructor(SoupTest):
|
||||
|
||||
def test_short_unicode_input(self):
|
||||
data = "<h1>éé</h1>"
|
||||
soup = self.soup(data)
|
||||
self.assertEqual("éé", soup.h1.string)
|
||||
|
||||
def test_embedded_null(self):
|
||||
data = "<h1>foo\0bar</h1>"
|
||||
soup = self.soup(data)
|
||||
self.assertEqual("foo\0bar", soup.h1.string)
|
||||
|
||||
def test_exclude_encodings(self):
|
||||
utf8_data = "Räksmörgås".encode("utf-8")
|
||||
soup = self.soup(utf8_data, exclude_encodings=["utf-8"])
|
||||
self.assertEqual("windows-1252", soup.original_encoding)
|
||||
|
||||
def test_custom_builder_class(self):
|
||||
# Verify that you can pass in a custom Builder class and
|
||||
# it'll be instantiated with the appropriate keyword arguments.
|
||||
class Mock(object):
|
||||
def __init__(self, **kwargs):
|
||||
self.called_with = kwargs
|
||||
self.is_xml = True
|
||||
def initialize_soup(self, soup):
|
||||
pass
|
||||
def prepare_markup(self, *args, **kwargs):
|
||||
return ''
|
||||
|
||||
kwargs = dict(
|
||||
var="value",
|
||||
# This is a deprecated BS3-era keyword argument, which
|
||||
# will be stripped out.
|
||||
convertEntities=True,
|
||||
)
|
||||
with warnings.catch_warnings(record=True):
|
||||
soup = BeautifulSoup('', builder=Mock, **kwargs)
|
||||
assert isinstance(soup.builder, Mock)
|
||||
self.assertEqual(dict(var="value"), soup.builder.called_with)
|
||||
|
||||
# You can also instantiate the TreeBuilder yourself. In this
|
||||
# case, that specific object is used and any keyword arguments
|
||||
# to the BeautifulSoup constructor are ignored.
|
||||
builder = Mock(**kwargs)
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = BeautifulSoup(
|
||||
'', builder=builder, ignored_value=True,
|
||||
)
|
||||
msg = str(w[0].message)
|
||||
assert msg.startswith("Keyword arguments to the BeautifulSoup constructor will be ignored.")
|
||||
self.assertEqual(builder, soup.builder)
|
||||
self.assertEqual(kwargs, builder.called_with)
|
||||
|
||||
def test_cdata_list_attributes(self):
|
||||
# Most attribute values are represented as scalars, but the
|
||||
# HTML standard says that some attributes, like 'class' have
|
||||
# space-separated lists as values.
|
||||
markup = '<a id=" an id " class=" a class "></a>'
|
||||
soup = self.soup(markup)
|
||||
|
||||
# Note that the spaces are stripped for 'class' but not for 'id'.
|
||||
a = soup.a
|
||||
self.assertEqual(" an id ", a['id'])
|
||||
self.assertEqual(["a", "class"], a['class'])
|
||||
|
||||
# TreeBuilder takes an argument called 'mutli_valued_attributes' which lets
|
||||
# you customize or disable this. As always, you can customize the TreeBuilder
|
||||
# by passing in a keyword argument to the BeautifulSoup constructor.
|
||||
soup = self.soup(markup, builder=default_builder, multi_valued_attributes=None)
|
||||
self.assertEqual(" a class ", soup.a['class'])
|
||||
|
||||
# Here are two ways of saying that `id` is a multi-valued
|
||||
# attribute in this context, but 'class' is not.
|
||||
for switcheroo in ({'*': 'id'}, {'a': 'id'}):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
# This will create a warning about not explicitly
|
||||
# specifying a parser, but we'll ignore it.
|
||||
soup = self.soup(markup, builder=None, multi_valued_attributes=switcheroo)
|
||||
a = soup.a
|
||||
self.assertEqual(["an", "id"], a['id'])
|
||||
self.assertEqual(" a class ", a['class'])
|
||||
|
||||
|
||||
class TestWarnings(SoupTest):
|
||||
|
||||
def _no_parser_specified(self, s, is_there=True):
|
||||
v = s.startswith(BeautifulSoup.NO_PARSER_SPECIFIED_WARNING[:80])
|
||||
self.assertTrue(v)
|
||||
|
||||
def test_warning_if_no_parser_specified(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>")
|
||||
msg = str(w[0].message)
|
||||
self._assert_no_parser_specified(msg)
|
||||
|
||||
def test_warning_if_parser_specified_too_vague(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", "html")
|
||||
msg = str(w[0].message)
|
||||
self._assert_no_parser_specified(msg)
|
||||
|
||||
def test_no_warning_if_explicit_parser_specified(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", "html.parser")
|
||||
self.assertEqual([], w)
|
||||
|
||||
def test_parseOnlyThese_renamed_to_parse_only(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup("<a><b></b></a>", parseOnlyThese=SoupStrainer("b"))
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("parseOnlyThese" in msg)
|
||||
self.assertTrue("parse_only" in msg)
|
||||
self.assertEqual(b"<b></b>", soup.encode())
|
||||
|
||||
def test_fromEncoding_renamed_to_from_encoding(self):
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
utf8 = b"\xc3\xa9"
|
||||
soup = self.soup(utf8, fromEncoding="utf8")
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("fromEncoding" in msg)
|
||||
self.assertTrue("from_encoding" in msg)
|
||||
self.assertEqual("utf8", soup.original_encoding)
|
||||
|
||||
def test_unrecognized_keyword_argument(self):
|
||||
self.assertRaises(
|
||||
TypeError, self.soup, "<a>", no_such_argument=True)
|
||||
|
||||
class TestWarnings(SoupTest):
|
||||
|
||||
def test_disk_file_warning(self):
|
||||
filehandle = tempfile.NamedTemporaryFile()
|
||||
filename = filehandle.name
|
||||
try:
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(filename)
|
||||
msg = str(w[0].message)
|
||||
self.assertTrue("looks like a filename" in msg)
|
||||
finally:
|
||||
filehandle.close()
|
||||
|
||||
# The file no longer exists, so Beautiful Soup will no longer issue the warning.
|
||||
with warnings.catch_warnings(record=True) as w:
|
||||
soup = self.soup(filename)
|
||||
self.assertEqual(0, len(w))
|
||||
|
||||
def test_url_warning_with_bytes_url(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup(b"http://www.crummybytes.com/")
|
||||
# Be aware this isn't the only warning that can be raised during
|
||||
# execution..
|
||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_unicode_url(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
# note - this url must differ from the bytes one otherwise
|
||||
# python's warnings system swallows the second warning
|
||||
soup = self.soup("http://www.crummyunicode.com/")
|
||||
self.assertTrue(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_bytes_and_space(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup(b"http://www.crummybytes.com/ is great")
|
||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
def test_url_warning_with_unicode_and_space(self):
|
||||
with warnings.catch_warnings(record=True) as warning_list:
|
||||
soup = self.soup("http://www.crummyuncode.com/ is great")
|
||||
self.assertFalse(any("looks like a URL" in str(w.message)
|
||||
for w in warning_list))
|
||||
|
||||
|
||||
class TestSelectiveParsing(SoupTest):
|
||||
|
||||
def test_parse_with_soupstrainer(self):
|
||||
markup = "No<b>Yes</b><a>No<b>Yes <c>Yes</c></b>"
|
||||
strainer = SoupStrainer("b")
|
||||
soup = self.soup(markup, parse_only=strainer)
|
||||
self.assertEqual(soup.encode(), b"<b>Yes</b><b>Yes <c>Yes</c></b>")
|
||||
|
||||
|
||||
class TestEntitySubstitution(unittest.TestCase):
|
||||
"""Standalone tests of the EntitySubstitution class."""
|
||||
def setUp(self):
|
||||
self.sub = EntitySubstitution
|
||||
|
||||
def test_simple_html_substitution(self):
|
||||
# Unicode characters corresponding to named HTML entites
|
||||
# are substituted, and no others.
|
||||
s = "foo\u2200\N{SNOWMAN}\u00f5bar"
|
||||
self.assertEqual(self.sub.substitute_html(s),
|
||||
"foo∀\N{SNOWMAN}õbar")
|
||||
|
||||
def test_smart_quote_substitution(self):
|
||||
# MS smart quotes are a common source of frustration, so we
|
||||
# give them a special test.
|
||||
quotes = b"\x91\x92foo\x93\x94"
|
||||
dammit = UnicodeDammit(quotes)
|
||||
self.assertEqual(self.sub.substitute_html(dammit.markup),
|
||||
"‘’foo“”")
|
||||
|
||||
def test_xml_converstion_includes_no_quotes_if_make_quoted_attribute_is_false(self):
|
||||
s = 'Welcome to "my bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(s, False), s)
|
||||
|
||||
def test_xml_attribute_quoting_normally_uses_double_quotes(self):
|
||||
self.assertEqual(self.sub.substitute_xml("Welcome", True),
|
||||
'"Welcome"')
|
||||
self.assertEqual(self.sub.substitute_xml("Bob's Bar", True),
|
||||
'"Bob\'s Bar"')
|
||||
|
||||
def test_xml_attribute_quoting_uses_single_quotes_when_value_contains_double_quotes(self):
|
||||
s = 'Welcome to "my bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(s, True),
|
||||
"'Welcome to \"my bar\"'")
|
||||
|
||||
def test_xml_attribute_quoting_escapes_single_quotes_when_value_contains_both_single_and_double_quotes(self):
|
||||
s = 'Welcome to "Bob\'s Bar"'
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml(s, True),
|
||||
'"Welcome to "Bob\'s Bar""')
|
||||
|
||||
def test_xml_quotes_arent_escaped_when_value_is_not_being_quoted(self):
|
||||
quoted = 'Welcome to "Bob\'s Bar"'
|
||||
self.assertEqual(self.sub.substitute_xml(quoted), quoted)
|
||||
|
||||
def test_xml_quoting_handles_angle_brackets(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml("foo<bar>"),
|
||||
"foo<bar>")
|
||||
|
||||
def test_xml_quoting_handles_ampersands(self):
|
||||
self.assertEqual(self.sub.substitute_xml("AT&T"), "AT&T")
|
||||
|
||||
def test_xml_quoting_including_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml("ÁT&T"),
|
||||
"&Aacute;T&T")
|
||||
|
||||
def test_xml_quoting_ignoring_ampersands_when_they_are_part_of_an_entity(self):
|
||||
self.assertEqual(
|
||||
self.sub.substitute_xml_containing_entities("ÁT&T"),
|
||||
"ÁT&T")
|
||||
|
||||
def test_quotes_not_html_substituted(self):
|
||||
"""There's no need to do this except inside attribute values."""
|
||||
text = 'Bob\'s "bar"'
|
||||
self.assertEqual(self.sub.substitute_html(text), text)
|
||||
|
||||
|
||||
class TestEncodingConversion(SoupTest):
|
||||
# Test Beautiful Soup's ability to decode and encode from various
|
||||
# encodings.
|
||||
|
||||
def setUp(self):
|
||||
super(TestEncodingConversion, self).setUp()
|
||||
self.unicode_data = '<html><head><meta charset="utf-8"/></head><body><foo>Sacr\N{LATIN SMALL LETTER E WITH ACUTE} bleu!</foo></body></html>'
|
||||
self.utf8_data = self.unicode_data.encode("utf-8")
|
||||
# Just so you know what it looks like.
|
||||
self.assertEqual(
|
||||
self.utf8_data,
|
||||
b'<html><head><meta charset="utf-8"/></head><body><foo>Sacr\xc3\xa9 bleu!</foo></body></html>')
|
||||
|
||||
def test_ascii_in_unicode_out(self):
|
||||
# ASCII input is converted to Unicode. The original_encoding
|
||||
# attribute is set to 'utf-8', a superset of ASCII.
|
||||
chardet = bs4.dammit.chardet_dammit
|
||||
logging.disable(logging.WARNING)
|
||||
try:
|
||||
def noop(str):
|
||||
return None
|
||||
# Disable chardet, which will realize that the ASCII is ASCII.
|
||||
bs4.dammit.chardet_dammit = noop
|
||||
ascii = b"<foo>a</foo>"
|
||||
soup_from_ascii = self.soup(ascii)
|
||||
unicode_output = soup_from_ascii.decode()
|
||||
self.assertTrue(isinstance(unicode_output, str))
|
||||
self.assertEqual(unicode_output, self.document_for(ascii.decode()))
|
||||
self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8")
|
||||
finally:
|
||||
logging.disable(logging.NOTSET)
|
||||
bs4.dammit.chardet_dammit = chardet
|
||||
|
||||
def test_unicode_in_unicode_out(self):
|
||||
# Unicode input is left alone. The original_encoding attribute
|
||||
# is not set.
|
||||
soup_from_unicode = self.soup(self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.decode(), self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!')
|
||||
self.assertEqual(soup_from_unicode.original_encoding, None)
|
||||
|
||||
def test_utf8_in_unicode_out(self):
|
||||
# UTF-8 input is converted to Unicode. The original_encoding
|
||||
# attribute is set.
|
||||
soup_from_utf8 = self.soup(self.utf8_data)
|
||||
self.assertEqual(soup_from_utf8.decode(), self.unicode_data)
|
||||
self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
|
||||
|
||||
def test_utf8_out(self):
|
||||
# The internal data structures can be encoded as UTF-8.
|
||||
soup_from_unicode = self.soup(self.unicode_data)
|
||||
self.assertEqual(soup_from_unicode.encode('utf-8'), self.utf8_data)
|
||||
|
||||
@skipIf(
|
||||
PYTHON_3_PRE_3_2,
|
||||
"Bad HTMLParser detected; skipping test of non-ASCII characters in attribute name.")
|
||||
def test_attribute_name_containing_unicode_characters(self):
|
||||
markup = '<div><a \N{SNOWMAN}="snowman"></a></div>'
|
||||
self.assertEqual(self.soup(markup).div.encode("utf8"), markup.encode("utf8"))
|
||||
|
||||
class TestUnicodeDammit(unittest.TestCase):
|
||||
"""Standalone tests of UnicodeDammit."""
|
||||
|
||||
def test_unicode_input(self):
|
||||
markup = "I'm already Unicode! \N{SNOWMAN}"
|
||||
dammit = UnicodeDammit(markup)
|
||||
self.assertEqual(dammit.unicode_markup, markup)
|
||||
|
||||
def test_smart_quotes_to_unicode(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup)
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, "<foo>\u2018\u2019\u201c\u201d</foo>")
|
||||
|
||||
def test_smart_quotes_to_xml_entities(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="xml")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
||||
|
||||
def test_smart_quotes_to_html_entities(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="html")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, "<foo>‘’“”</foo>")
|
||||
|
||||
def test_smart_quotes_to_ascii(self):
|
||||
markup = b"<foo>\x91\x92\x93\x94</foo>"
|
||||
dammit = UnicodeDammit(markup, smart_quotes_to="ascii")
|
||||
self.assertEqual(
|
||||
dammit.unicode_markup, """<foo>''""</foo>""")
|
||||
|
||||
def test_detect_utf8(self):
|
||||
utf8 = b"Sacr\xc3\xa9 bleu! \xe2\x98\x83"
|
||||
dammit = UnicodeDammit(utf8)
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
self.assertEqual(dammit.unicode_markup, 'Sacr\xe9 bleu! \N{SNOWMAN}')
|
||||
|
||||
|
||||
def test_convert_hebrew(self):
|
||||
hebrew = b"\xed\xe5\xec\xf9"
|
||||
dammit = UnicodeDammit(hebrew, ["iso-8859-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8')
|
||||
self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
|
||||
|
||||
def test_dont_see_smart_quotes_where_there_are_none(self):
|
||||
utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch"
|
||||
dammit = UnicodeDammit(utf_8)
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
|
||||
|
||||
def test_ignore_inappropriate_codecs(self):
|
||||
utf8_data = "Räksmörgås".encode("utf-8")
|
||||
dammit = UnicodeDammit(utf8_data, ["iso-8859-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
|
||||
def test_ignore_invalid_codecs(self):
|
||||
utf8_data = "Räksmörgås".encode("utf-8")
|
||||
for bad_encoding in ['.utf8', '...', 'utF---16.!']:
|
||||
dammit = UnicodeDammit(utf8_data, [bad_encoding])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
|
||||
|
||||
def test_exclude_encodings(self):
|
||||
# This is UTF-8.
|
||||
utf8_data = "Räksmörgås".encode("utf-8")
|
||||
|
||||
# But if we exclude UTF-8 from consideration, the guess is
|
||||
# Windows-1252.
|
||||
dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"])
|
||||
self.assertEqual(dammit.original_encoding.lower(), 'windows-1252')
|
||||
|
||||
# And if we exclude that, there is no valid guess at all.
|
||||
dammit = UnicodeDammit(
|
||||
utf8_data, exclude_encodings=["utf-8", "windows-1252"])
|
||||
self.assertEqual(dammit.original_encoding, None)
|
||||
|
||||
def test_encoding_detector_replaces_junk_in_encoding_name_with_replacement_character(self):
|
||||
detected = EncodingDetector(
|
||||
b'<?xml version="1.0" encoding="UTF-\xdb" ?>')
|
||||
encodings = list(detected.encodings)
|
||||
assert 'utf-\N{REPLACEMENT CHARACTER}' in encodings
|
||||
|
||||
def test_detect_html5_style_meta_tag(self):
|
||||
|
||||
for data in (
|
||||
b'<html><meta charset="euc-jp" /></html>',
|
||||
b"<html><meta charset='euc-jp' /></html>",
|
||||
b"<html><meta charset=euc-jp /></html>",
|
||||
b"<html><meta charset=euc-jp/></html>"):
|
||||
dammit = UnicodeDammit(data, is_html=True)
|
||||
self.assertEqual(
|
||||
"euc-jp", dammit.original_encoding)
|
||||
|
||||
def test_last_ditch_entity_replacement(self):
|
||||
# This is a UTF-8 document that contains bytestrings
|
||||
# completely incompatible with UTF-8 (ie. encoded with some other
|
||||
# encoding).
|
||||
#
|
||||
# Since there is no consistent encoding for the document,
|
||||
# Unicode, Dammit will eventually encode the document as UTF-8
|
||||
# and encode the incompatible characters as REPLACEMENT
|
||||
# CHARACTER.
|
||||
#
|
||||
# If chardet is installed, it will detect that the document
|
||||
# can be converted into ISO-8859-1 without errors. This happens
|
||||
# to be the wrong encoding, but it is a consistent encoding, so the
|
||||
# code we're testing here won't run.
|
||||
#
|
||||
# So we temporarily disable chardet if it's present.
|
||||
doc = b"""\357\273\277<?xml version="1.0" encoding="UTF-8"?>
|
||||
<html><b>\330\250\330\252\330\261</b>
|
||||
<i>\310\322\321\220\312\321\355\344</i></html>"""
|
||||
chardet = bs4.dammit.chardet_dammit
|
||||
logging.disable(logging.WARNING)
|
||||
try:
|
||||
def noop(str):
|
||||
return None
|
||||
bs4.dammit.chardet_dammit = noop
|
||||
dammit = UnicodeDammit(doc)
|
||||
self.assertEqual(True, dammit.contains_replacement_characters)
|
||||
self.assertTrue("\ufffd" in dammit.unicode_markup)
|
||||
|
||||
soup = BeautifulSoup(doc, "html.parser")
|
||||
self.assertTrue(soup.contains_replacement_characters)
|
||||
finally:
|
||||
logging.disable(logging.NOTSET)
|
||||
bs4.dammit.chardet_dammit = chardet
|
||||
|
||||
def test_byte_order_mark_removed(self):
|
||||
# A document written in UTF-16LE will have its byte order marker stripped.
|
||||
data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00'
|
||||
dammit = UnicodeDammit(data)
|
||||
self.assertEqual("<a>áé</a>", dammit.unicode_markup)
|
||||
self.assertEqual("utf-16le", dammit.original_encoding)
|
||||
|
||||
def test_detwingle(self):
|
||||
# Here's a UTF8 document.
|
||||
utf8 = ("\N{SNOWMAN}" * 3).encode("utf8")
|
||||
|
||||
# Here's a Windows-1252 document.
|
||||
windows_1252 = (
|
||||
"\N{LEFT DOUBLE QUOTATION MARK}Hi, I like Windows!"
|
||||
"\N{RIGHT DOUBLE QUOTATION MARK}").encode("windows_1252")
|
||||
|
||||
# Through some unholy alchemy, they've been stuck together.
|
||||
doc = utf8 + windows_1252 + utf8
|
||||
|
||||
# The document can't be turned into UTF-8:
|
||||
self.assertRaises(UnicodeDecodeError, doc.decode, "utf8")
|
||||
|
||||
# Unicode, Dammit thinks the whole document is Windows-1252,
|
||||
# and decodes it into "☃☃☃“Hi, I like Windows!”☃☃☃"
|
||||
|
||||
# But if we run it through fix_embedded_windows_1252, it's fixed:
|
||||
|
||||
fixed = UnicodeDammit.detwingle(doc)
|
||||
self.assertEqual(
|
||||
"☃☃☃“Hi, I like Windows!”☃☃☃", fixed.decode("utf8"))
|
||||
|
||||
def test_detwingle_ignores_multibyte_characters(self):
|
||||
# Each of these characters has a UTF-8 representation ending
|
||||
# in \x93. \x93 is a smart quote if interpreted as
|
||||
# Windows-1252. But our code knows to skip over multibyte
|
||||
# UTF-8 characters, so they'll survive the process unscathed.
|
||||
for tricky_unicode_char in (
|
||||
"\N{LATIN SMALL LIGATURE OE}", # 2-byte char '\xc5\x93'
|
||||
"\N{LATIN SUBSCRIPT SMALL LETTER X}", # 3-byte char '\xe2\x82\x93'
|
||||
"\xf0\x90\x90\x93", # This is a CJK character, not sure which one.
|
||||
):
|
||||
input = tricky_unicode_char.encode("utf8")
|
||||
self.assertTrue(input.endswith(b'\x93'))
|
||||
output = UnicodeDammit.detwingle(input)
|
||||
self.assertEqual(output, input)
|
||||
|
||||
class TestNamedspacedAttribute(SoupTest):
|
||||
|
||||
def test_name_may_be_none(self):
|
||||
a = NamespacedAttribute("xmlns", None)
|
||||
self.assertEqual(a, "xmlns")
|
||||
|
||||
def test_attribute_is_equivalent_to_colon_separated_string(self):
|
||||
a = NamespacedAttribute("a", "b")
|
||||
self.assertEqual("a:b", a)
|
||||
|
||||
def test_attributes_are_equivalent_if_prefix_and_name_identical(self):
|
||||
a = NamespacedAttribute("a", "b", "c")
|
||||
b = NamespacedAttribute("a", "b", "c")
|
||||
self.assertEqual(a, b)
|
||||
|
||||
# The actual namespace is not considered.
|
||||
c = NamespacedAttribute("a", "b", None)
|
||||
self.assertEqual(a, c)
|
||||
|
||||
# But name and prefix are important.
|
||||
d = NamespacedAttribute("a", "z", "c")
|
||||
self.assertNotEqual(a, d)
|
||||
|
||||
e = NamespacedAttribute("z", "b", "c")
|
||||
self.assertNotEqual(a, e)
|
||||
|
||||
|
||||
class TestAttributeValueWithCharsetSubstitution(unittest.TestCase):
|
||||
|
||||
def test_content_meta_attribute_value(self):
|
||||
value = CharsetMetaAttributeValue("euc-jp")
|
||||
self.assertEqual("euc-jp", value)
|
||||
self.assertEqual("euc-jp", value.original_value)
|
||||
self.assertEqual("utf8", value.encode("utf8"))
|
||||
|
||||
|
||||
def test_content_meta_attribute_value(self):
|
||||
value = ContentMetaAttributeValue("text/html; charset=euc-jp")
|
||||
self.assertEqual("text/html; charset=euc-jp", value)
|
||||
self.assertEqual("text/html; charset=euc-jp", value.original_value)
|
||||
self.assertEqual("text/html; charset=utf8", value.encode("utf8"))
|
||||
File diff suppressed because it is too large
Load Diff
@@ -1,25 +0,0 @@
|
||||
import sys
|
||||
|
||||
from .client import Client
|
||||
from .middleware import WSGIApp, Middleware
|
||||
from .server import Server
|
||||
if sys.version_info >= (3, 5): # pragma: no cover
|
||||
from .asyncio_server import AsyncServer
|
||||
from .asyncio_client import AsyncClient
|
||||
from .async_drivers.asgi import ASGIApp
|
||||
try:
|
||||
from .async_drivers.tornado import get_tornado_handler
|
||||
except ImportError:
|
||||
get_tornado_handler = None
|
||||
else: # pragma: no cover
|
||||
AsyncServer = None
|
||||
AsyncClient = None
|
||||
get_tornado_handler = None
|
||||
ASGIApp = None
|
||||
|
||||
__version__ = '3.11.2'
|
||||
|
||||
__all__ = ['__version__', 'Server', 'WSGIApp', 'Middleware', 'Client']
|
||||
if AsyncServer is not None: # pragma: no cover
|
||||
__all__ += ['AsyncServer', 'ASGIApp', 'get_tornado_handler',
|
||||
'AsyncClient'],
|
||||
@@ -1,128 +0,0 @@
|
||||
import asyncio
|
||||
import sys
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
from aiohttp.web import Response, WebSocketResponse
|
||||
import six
|
||||
|
||||
|
||||
def create_route(app, engineio_server, engineio_endpoint):
|
||||
"""This function sets up the engine.io endpoint as a route for the
|
||||
application.
|
||||
|
||||
Note that both GET and POST requests must be hooked up on the engine.io
|
||||
endpoint.
|
||||
"""
|
||||
app.router.add_get(engineio_endpoint, engineio_server.handle_request)
|
||||
app.router.add_post(engineio_endpoint, engineio_server.handle_request)
|
||||
app.router.add_route('OPTIONS', engineio_endpoint,
|
||||
engineio_server.handle_request)
|
||||
|
||||
|
||||
def translate_request(request):
|
||||
"""This function takes the arguments passed to the request handler and
|
||||
uses them to generate a WSGI compatible environ dictionary.
|
||||
"""
|
||||
message = request._message
|
||||
payload = request._payload
|
||||
|
||||
uri_parts = urlsplit(message.path)
|
||||
environ = {
|
||||
'wsgi.input': payload,
|
||||
'wsgi.errors': sys.stderr,
|
||||
'wsgi.version': (1, 0),
|
||||
'wsgi.async': True,
|
||||
'wsgi.multithread': False,
|
||||
'wsgi.multiprocess': False,
|
||||
'wsgi.run_once': False,
|
||||
'SERVER_SOFTWARE': 'aiohttp',
|
||||
'REQUEST_METHOD': message.method,
|
||||
'QUERY_STRING': uri_parts.query or '',
|
||||
'RAW_URI': message.path,
|
||||
'SERVER_PROTOCOL': 'HTTP/%s.%s' % message.version,
|
||||
'REMOTE_ADDR': '127.0.0.1',
|
||||
'REMOTE_PORT': '0',
|
||||
'SERVER_NAME': 'aiohttp',
|
||||
'SERVER_PORT': '0',
|
||||
'aiohttp.request': request
|
||||
}
|
||||
|
||||
for hdr_name, hdr_value in message.headers.items():
|
||||
hdr_name = hdr_name.upper()
|
||||
if hdr_name == 'CONTENT-TYPE':
|
||||
environ['CONTENT_TYPE'] = hdr_value
|
||||
continue
|
||||
elif hdr_name == 'CONTENT-LENGTH':
|
||||
environ['CONTENT_LENGTH'] = hdr_value
|
||||
continue
|
||||
|
||||
key = 'HTTP_%s' % hdr_name.replace('-', '_')
|
||||
if key in environ:
|
||||
hdr_value = '%s,%s' % (environ[key], hdr_value)
|
||||
|
||||
environ[key] = hdr_value
|
||||
|
||||
environ['wsgi.url_scheme'] = environ.get('HTTP_X_FORWARDED_PROTO', 'http')
|
||||
|
||||
path_info = uri_parts.path
|
||||
|
||||
environ['PATH_INFO'] = path_info
|
||||
environ['SCRIPT_NAME'] = ''
|
||||
|
||||
return environ
|
||||
|
||||
|
||||
def make_response(status, headers, payload, environ):
|
||||
"""This function generates an appropriate response object for this async
|
||||
mode.
|
||||
"""
|
||||
return Response(body=payload, status=int(status.split()[0]),
|
||||
headers=headers)
|
||||
|
||||
|
||||
class WebSocket(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides a aiohttp WebSocket interface that is
|
||||
somewhat compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
self._sock = None
|
||||
|
||||
async def __call__(self, environ):
|
||||
request = environ['aiohttp.request']
|
||||
self._sock = WebSocketResponse()
|
||||
await self._sock.prepare(request)
|
||||
|
||||
self.environ = environ
|
||||
await self.handler(self)
|
||||
return self._sock
|
||||
|
||||
async def close(self):
|
||||
await self._sock.close()
|
||||
|
||||
async def send(self, message):
|
||||
if isinstance(message, bytes):
|
||||
f = self._sock.send_bytes
|
||||
else:
|
||||
f = self._sock.send_str
|
||||
if asyncio.iscoroutinefunction(f):
|
||||
await f(message)
|
||||
else:
|
||||
f(message)
|
||||
|
||||
async def wait(self):
|
||||
msg = await self._sock.receive()
|
||||
if not isinstance(msg.data, six.binary_type) and \
|
||||
not isinstance(msg.data, six.text_type):
|
||||
raise IOError()
|
||||
return msg.data
|
||||
|
||||
|
||||
_async = {
|
||||
'asyncio': True,
|
||||
'create_route': create_route,
|
||||
'translate_request': translate_request,
|
||||
'make_response': make_response,
|
||||
'websocket': WebSocket,
|
||||
}
|
||||
@@ -1,214 +0,0 @@
|
||||
import os
|
||||
import sys
|
||||
|
||||
from engineio.static_files import get_static_file
|
||||
|
||||
|
||||
class ASGIApp:
|
||||
"""ASGI application middleware for Engine.IO.
|
||||
|
||||
This middleware dispatches traffic to an Engine.IO application. It can
|
||||
also serve a list of static files to the client, or forward unrelated
|
||||
HTTP traffic to another ASGI application.
|
||||
|
||||
:param engineio_server: The Engine.IO server. Must be an instance of the
|
||||
``engineio.AsyncServer`` class.
|
||||
:param static_files: A dictionary with static file mapping rules. See the
|
||||
documentation for details on this argument.
|
||||
:param other_asgi_app: A separate ASGI app that receives all other traffic.
|
||||
:param engineio_path: The endpoint where the Engine.IO application should
|
||||
be installed. The default value is appropriate for
|
||||
most cases.
|
||||
|
||||
Example usage::
|
||||
|
||||
import engineio
|
||||
import uvicorn
|
||||
|
||||
eio = engineio.AsyncServer()
|
||||
app = engineio.ASGIApp(eio, static_files={
|
||||
'/': {'content_type': 'text/html', 'filename': 'index.html'},
|
||||
'/index.html': {'content_type': 'text/html',
|
||||
'filename': 'index.html'},
|
||||
})
|
||||
uvicorn.run(app, '127.0.0.1', 5000)
|
||||
"""
|
||||
def __init__(self, engineio_server, other_asgi_app=None,
|
||||
static_files=None, engineio_path='engine.io'):
|
||||
self.engineio_server = engineio_server
|
||||
self.other_asgi_app = other_asgi_app
|
||||
self.engineio_path = engineio_path.strip('/')
|
||||
self.static_files = static_files or {}
|
||||
|
||||
async def __call__(self, scope, receive, send):
|
||||
if scope['type'] in ['http', 'websocket'] and \
|
||||
scope['path'].startswith('/{0}/'.format(self.engineio_path)):
|
||||
await self.engineio_server.handle_request(scope, receive, send)
|
||||
else:
|
||||
static_file = get_static_file(scope['path'], self.static_files) \
|
||||
if scope['type'] == 'http' and self.static_files else None
|
||||
if static_file:
|
||||
await self.serve_static_file(static_file, receive, send)
|
||||
elif self.other_asgi_app is not None:
|
||||
await self.other_asgi_app(scope, receive, send)
|
||||
elif scope['type'] == 'lifespan':
|
||||
await self.lifespan(receive, send)
|
||||
else:
|
||||
await self.not_found(receive, send)
|
||||
|
||||
async def serve_static_file(self, static_file, receive,
|
||||
send): # pragma: no cover
|
||||
event = await receive()
|
||||
if event['type'] == 'http.request':
|
||||
if os.path.exists(static_file['filename']):
|
||||
with open(static_file['filename'], 'rb') as f:
|
||||
payload = f.read()
|
||||
await send({'type': 'http.response.start',
|
||||
'status': 200,
|
||||
'headers': [(b'Content-Type', static_file[
|
||||
'content_type'].encode('utf-8'))]})
|
||||
await send({'type': 'http.response.body',
|
||||
'body': payload})
|
||||
else:
|
||||
await self.not_found(receive, send)
|
||||
|
||||
async def lifespan(self, receive, send):
|
||||
event = await receive()
|
||||
if event['type'] == 'lifespan.startup':
|
||||
await send({'type': 'lifespan.startup.complete'})
|
||||
elif event['type'] == 'lifespan.shutdown':
|
||||
await send({'type': 'lifespan.shutdown.complete'})
|
||||
|
||||
async def not_found(self, receive, send):
|
||||
"""Return a 404 Not Found error to the client."""
|
||||
await send({'type': 'http.response.start',
|
||||
'status': 404,
|
||||
'headers': [(b'Content-Type', b'text/plain')]})
|
||||
await send({'type': 'http.response.body',
|
||||
'body': b'Not Found'})
|
||||
|
||||
|
||||
async def translate_request(scope, receive, send):
|
||||
class AwaitablePayload(object): # pragma: no cover
|
||||
def __init__(self, payload):
|
||||
self.payload = payload or b''
|
||||
|
||||
async def read(self, length=None):
|
||||
if length is None:
|
||||
r = self.payload
|
||||
self.payload = b''
|
||||
else:
|
||||
r = self.payload[:length]
|
||||
self.payload = self.payload[length:]
|
||||
return r
|
||||
|
||||
event = await receive()
|
||||
payload = b''
|
||||
if event['type'] == 'http.request':
|
||||
payload += event.get('body') or b''
|
||||
while event.get('more_body'):
|
||||
event = await receive()
|
||||
if event['type'] == 'http.request':
|
||||
payload += event.get('body') or b''
|
||||
elif event['type'] == 'websocket.connect':
|
||||
await send({'type': 'websocket.accept'})
|
||||
else:
|
||||
return {}
|
||||
|
||||
raw_uri = scope['path'].encode('utf-8')
|
||||
if 'query_string' in scope and scope['query_string']:
|
||||
raw_uri += b'?' + scope['query_string']
|
||||
environ = {
|
||||
'wsgi.input': AwaitablePayload(payload),
|
||||
'wsgi.errors': sys.stderr,
|
||||
'wsgi.version': (1, 0),
|
||||
'wsgi.async': True,
|
||||
'wsgi.multithread': False,
|
||||
'wsgi.multiprocess': False,
|
||||
'wsgi.run_once': False,
|
||||
'SERVER_SOFTWARE': 'asgi',
|
||||
'REQUEST_METHOD': scope.get('method', 'GET'),
|
||||
'PATH_INFO': scope['path'],
|
||||
'QUERY_STRING': scope.get('query_string', b'').decode('utf-8'),
|
||||
'RAW_URI': raw_uri.decode('utf-8'),
|
||||
'SCRIPT_NAME': '',
|
||||
'SERVER_PROTOCOL': 'HTTP/1.1',
|
||||
'REMOTE_ADDR': '127.0.0.1',
|
||||
'REMOTE_PORT': '0',
|
||||
'SERVER_NAME': 'asgi',
|
||||
'SERVER_PORT': '0',
|
||||
'asgi.receive': receive,
|
||||
'asgi.send': send,
|
||||
}
|
||||
|
||||
for hdr_name, hdr_value in scope['headers']:
|
||||
hdr_name = hdr_name.upper().decode('utf-8')
|
||||
hdr_value = hdr_value.decode('utf-8')
|
||||
if hdr_name == 'CONTENT-TYPE':
|
||||
environ['CONTENT_TYPE'] = hdr_value
|
||||
continue
|
||||
elif hdr_name == 'CONTENT-LENGTH':
|
||||
environ['CONTENT_LENGTH'] = hdr_value
|
||||
continue
|
||||
|
||||
key = 'HTTP_%s' % hdr_name.replace('-', '_')
|
||||
if key in environ:
|
||||
hdr_value = '%s,%s' % (environ[key], hdr_value)
|
||||
|
||||
environ[key] = hdr_value
|
||||
|
||||
environ['wsgi.url_scheme'] = environ.get('HTTP_X_FORWARDED_PROTO', 'http')
|
||||
return environ
|
||||
|
||||
|
||||
async def make_response(status, headers, payload, environ):
|
||||
headers = [(h[0].encode('utf-8'), h[1].encode('utf-8')) for h in headers]
|
||||
await environ['asgi.send']({'type': 'http.response.start',
|
||||
'status': int(status.split(' ')[0]),
|
||||
'headers': headers})
|
||||
await environ['asgi.send']({'type': 'http.response.body',
|
||||
'body': payload})
|
||||
|
||||
|
||||
class WebSocket(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides an asgi WebSocket interface that is
|
||||
somewhat compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
self.asgi_receive = None
|
||||
self.asgi_send = None
|
||||
|
||||
async def __call__(self, environ):
|
||||
self.asgi_receive = environ['asgi.receive']
|
||||
self.asgi_send = environ['asgi.send']
|
||||
await self.handler(self)
|
||||
|
||||
async def close(self):
|
||||
await self.asgi_send({'type': 'websocket.close'})
|
||||
|
||||
async def send(self, message):
|
||||
msg_bytes = None
|
||||
msg_text = None
|
||||
if isinstance(message, bytes):
|
||||
msg_bytes = message
|
||||
else:
|
||||
msg_text = message
|
||||
await self.asgi_send({'type': 'websocket.send',
|
||||
'bytes': msg_bytes,
|
||||
'text': msg_text})
|
||||
|
||||
async def wait(self):
|
||||
event = await self.asgi_receive()
|
||||
if event['type'] != 'websocket.receive':
|
||||
raise IOError()
|
||||
return event.get('bytes') or event.get('text')
|
||||
|
||||
|
||||
_async = {
|
||||
'asyncio': True,
|
||||
'translate_request': translate_request,
|
||||
'make_response': make_response,
|
||||
'websocket': WebSocket,
|
||||
}
|
||||
@@ -1,30 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
from eventlet.green.threading import Thread, Event
|
||||
from eventlet import queue
|
||||
from eventlet import sleep
|
||||
from eventlet.websocket import WebSocketWSGI as _WebSocketWSGI
|
||||
|
||||
|
||||
class WebSocketWSGI(_WebSocketWSGI):
|
||||
def __init__(self, *args, **kwargs):
|
||||
super(WebSocketWSGI, self).__init__(*args, **kwargs)
|
||||
self._sock = None
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
if 'eventlet.input' not in environ:
|
||||
raise RuntimeError('You need to use the eventlet server. '
|
||||
'See the Deployment section of the '
|
||||
'documentation for more information.')
|
||||
self._sock = environ['eventlet.input'].get_socket()
|
||||
return super(WebSocketWSGI, self).__call__(environ, start_response)
|
||||
|
||||
|
||||
_async = {
|
||||
'thread': Thread,
|
||||
'queue': queue.Queue,
|
||||
'queue_empty': queue.Empty,
|
||||
'event': Event,
|
||||
'websocket': WebSocketWSGI,
|
||||
'sleep': sleep,
|
||||
}
|
||||
@@ -1,63 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import gevent
|
||||
from gevent import queue
|
||||
from gevent.event import Event
|
||||
try:
|
||||
import geventwebsocket # noqa
|
||||
_websocket_available = True
|
||||
except ImportError:
|
||||
_websocket_available = False
|
||||
|
||||
|
||||
class Thread(gevent.Greenlet): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides gevent Greenlet interface that is compatible
|
||||
with the standard library's Thread class.
|
||||
"""
|
||||
def __init__(self, target, args=[], kwargs={}):
|
||||
super(Thread, self).__init__(target, *args, **kwargs)
|
||||
|
||||
def _run(self):
|
||||
return self.run()
|
||||
|
||||
|
||||
class WebSocketWSGI(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides a gevent WebSocket interface that is
|
||||
compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, app):
|
||||
self.app = app
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
if 'wsgi.websocket' not in environ:
|
||||
raise RuntimeError('You need to use the gevent-websocket server. '
|
||||
'See the Deployment section of the '
|
||||
'documentation for more information.')
|
||||
self._sock = environ['wsgi.websocket']
|
||||
self.environ = environ
|
||||
self.version = self._sock.version
|
||||
self.path = self._sock.path
|
||||
self.origin = self._sock.origin
|
||||
self.protocol = self._sock.protocol
|
||||
return self.app(self)
|
||||
|
||||
def close(self):
|
||||
return self._sock.close()
|
||||
|
||||
def send(self, message):
|
||||
return self._sock.send(message)
|
||||
|
||||
def wait(self):
|
||||
return self._sock.receive()
|
||||
|
||||
|
||||
_async = {
|
||||
'thread': Thread,
|
||||
'queue': queue.JoinableQueue,
|
||||
'queue_empty': queue.Empty,
|
||||
'event': Event,
|
||||
'websocket': WebSocketWSGI if _websocket_available else None,
|
||||
'sleep': gevent.sleep,
|
||||
}
|
||||
@@ -1,156 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
|
||||
import six
|
||||
|
||||
import gevent
|
||||
from gevent import queue
|
||||
from gevent.event import Event
|
||||
import uwsgi
|
||||
_websocket_available = hasattr(uwsgi, 'websocket_handshake')
|
||||
|
||||
|
||||
class Thread(gevent.Greenlet): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides gevent Greenlet interface that is compatible
|
||||
with the standard library's Thread class.
|
||||
"""
|
||||
def __init__(self, target, args=[], kwargs={}):
|
||||
super(Thread, self).__init__(target, *args, **kwargs)
|
||||
|
||||
def _run(self):
|
||||
return self.run()
|
||||
|
||||
|
||||
class uWSGIWebSocket(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides a uWSGI WebSocket interface that is
|
||||
compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, app):
|
||||
self.app = app
|
||||
self._sock = None
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
self._sock = uwsgi.connection_fd()
|
||||
self.environ = environ
|
||||
|
||||
uwsgi.websocket_handshake()
|
||||
|
||||
self._req_ctx = None
|
||||
if hasattr(uwsgi, 'request_context'):
|
||||
# uWSGI >= 2.1.x with support for api access across-greenlets
|
||||
self._req_ctx = uwsgi.request_context()
|
||||
else:
|
||||
# use event and queue for sending messages
|
||||
from gevent.event import Event
|
||||
from gevent.queue import Queue
|
||||
from gevent.select import select
|
||||
self._event = Event()
|
||||
self._send_queue = Queue()
|
||||
|
||||
# spawn a select greenlet
|
||||
def select_greenlet_runner(fd, event):
|
||||
"""Sets event when data becomes available to read on fd."""
|
||||
while True:
|
||||
event.set()
|
||||
try:
|
||||
select([fd], [], [])[0]
|
||||
except ValueError:
|
||||
break
|
||||
self._select_greenlet = gevent.spawn(
|
||||
select_greenlet_runner,
|
||||
self._sock,
|
||||
self._event)
|
||||
|
||||
self.app(self)
|
||||
|
||||
def close(self):
|
||||
"""Disconnects uWSGI from the client."""
|
||||
uwsgi.disconnect()
|
||||
if self._req_ctx is None:
|
||||
# better kill it here in case wait() is not called again
|
||||
self._select_greenlet.kill()
|
||||
self._event.set()
|
||||
|
||||
def _send(self, msg):
|
||||
"""Transmits message either in binary or UTF-8 text mode,
|
||||
depending on its type."""
|
||||
if isinstance(msg, six.binary_type):
|
||||
method = uwsgi.websocket_send_binary
|
||||
else:
|
||||
method = uwsgi.websocket_send
|
||||
if self._req_ctx is not None:
|
||||
method(msg, request_context=self._req_ctx)
|
||||
else:
|
||||
method(msg)
|
||||
|
||||
def _decode_received(self, msg):
|
||||
"""Returns either bytes or str, depending on message type."""
|
||||
if not isinstance(msg, six.binary_type):
|
||||
# already decoded - do nothing
|
||||
return msg
|
||||
# only decode from utf-8 if message is not binary data
|
||||
type = six.byte2int(msg[0:1])
|
||||
if type >= 48: # no binary
|
||||
return msg.decode('utf-8')
|
||||
# binary message, don't try to decode
|
||||
return msg
|
||||
|
||||
def send(self, msg):
|
||||
"""Queues a message for sending. Real transmission is done in
|
||||
wait method.
|
||||
Sends directly if uWSGI version is new enough."""
|
||||
if self._req_ctx is not None:
|
||||
self._send(msg)
|
||||
else:
|
||||
self._send_queue.put(msg)
|
||||
self._event.set()
|
||||
|
||||
def wait(self):
|
||||
"""Waits and returns received messages.
|
||||
If running in compatibility mode for older uWSGI versions,
|
||||
it also sends messages that have been queued by send().
|
||||
A return value of None means that connection was closed.
|
||||
This must be called repeatedly. For uWSGI < 2.1.x it must
|
||||
be called from the main greenlet."""
|
||||
while True:
|
||||
if self._req_ctx is not None:
|
||||
try:
|
||||
msg = uwsgi.websocket_recv(request_context=self._req_ctx)
|
||||
except IOError: # connection closed
|
||||
return None
|
||||
return self._decode_received(msg)
|
||||
else:
|
||||
# we wake up at least every 3 seconds to let uWSGI
|
||||
# do its ping/ponging
|
||||
event_set = self._event.wait(timeout=3)
|
||||
if event_set:
|
||||
self._event.clear()
|
||||
# maybe there is something to send
|
||||
msgs = []
|
||||
while True:
|
||||
try:
|
||||
msgs.append(self._send_queue.get(block=False))
|
||||
except gevent.queue.Empty:
|
||||
break
|
||||
for msg in msgs:
|
||||
self._send(msg)
|
||||
# maybe there is something to receive, if not, at least
|
||||
# ensure uWSGI does its ping/ponging
|
||||
try:
|
||||
msg = uwsgi.websocket_recv_nb()
|
||||
except IOError: # connection closed
|
||||
self._select_greenlet.kill()
|
||||
return None
|
||||
if msg: # message available
|
||||
return self._decode_received(msg)
|
||||
|
||||
|
||||
_async = {
|
||||
'thread': Thread,
|
||||
'queue': queue.JoinableQueue,
|
||||
'queue_empty': queue.Empty,
|
||||
'event': Event,
|
||||
'websocket': uWSGIWebSocket if _websocket_available else None,
|
||||
'sleep': gevent.sleep,
|
||||
}
|
||||
@@ -1,144 +0,0 @@
|
||||
import sys
|
||||
from urllib.parse import urlsplit
|
||||
|
||||
from sanic.response import HTTPResponse
|
||||
try:
|
||||
from sanic.websocket import WebSocketProtocol
|
||||
except ImportError:
|
||||
# the installed version of sanic does not have websocket support
|
||||
WebSocketProtocol = None
|
||||
import six
|
||||
|
||||
|
||||
def create_route(app, engineio_server, engineio_endpoint):
|
||||
"""This function sets up the engine.io endpoint as a route for the
|
||||
application.
|
||||
|
||||
Note that both GET and POST requests must be hooked up on the engine.io
|
||||
endpoint.
|
||||
"""
|
||||
app.add_route(engineio_server.handle_request, engineio_endpoint,
|
||||
methods=['GET', 'POST', 'OPTIONS'])
|
||||
try:
|
||||
app.enable_websocket()
|
||||
except AttributeError:
|
||||
# ignore, this version does not support websocket
|
||||
pass
|
||||
|
||||
|
||||
def translate_request(request):
|
||||
"""This function takes the arguments passed to the request handler and
|
||||
uses them to generate a WSGI compatible environ dictionary.
|
||||
"""
|
||||
class AwaitablePayload(object):
|
||||
def __init__(self, payload):
|
||||
self.payload = payload or b''
|
||||
|
||||
async def read(self, length=None):
|
||||
if length is None:
|
||||
r = self.payload
|
||||
self.payload = b''
|
||||
else:
|
||||
r = self.payload[:length]
|
||||
self.payload = self.payload[length:]
|
||||
return r
|
||||
|
||||
uri_parts = urlsplit(request.url)
|
||||
environ = {
|
||||
'wsgi.input': AwaitablePayload(request.body),
|
||||
'wsgi.errors': sys.stderr,
|
||||
'wsgi.version': (1, 0),
|
||||
'wsgi.async': True,
|
||||
'wsgi.multithread': False,
|
||||
'wsgi.multiprocess': False,
|
||||
'wsgi.run_once': False,
|
||||
'SERVER_SOFTWARE': 'sanic',
|
||||
'REQUEST_METHOD': request.method,
|
||||
'QUERY_STRING': uri_parts.query or '',
|
||||
'RAW_URI': request.url,
|
||||
'SERVER_PROTOCOL': 'HTTP/' + request.version,
|
||||
'REMOTE_ADDR': '127.0.0.1',
|
||||
'REMOTE_PORT': '0',
|
||||
'SERVER_NAME': 'sanic',
|
||||
'SERVER_PORT': '0',
|
||||
'sanic.request': request
|
||||
}
|
||||
|
||||
for hdr_name, hdr_value in request.headers.items():
|
||||
hdr_name = hdr_name.upper()
|
||||
if hdr_name == 'CONTENT-TYPE':
|
||||
environ['CONTENT_TYPE'] = hdr_value
|
||||
continue
|
||||
elif hdr_name == 'CONTENT-LENGTH':
|
||||
environ['CONTENT_LENGTH'] = hdr_value
|
||||
continue
|
||||
|
||||
key = 'HTTP_%s' % hdr_name.replace('-', '_')
|
||||
if key in environ:
|
||||
hdr_value = '%s,%s' % (environ[key], hdr_value)
|
||||
|
||||
environ[key] = hdr_value
|
||||
|
||||
environ['wsgi.url_scheme'] = environ.get('HTTP_X_FORWARDED_PROTO', 'http')
|
||||
|
||||
path_info = uri_parts.path
|
||||
|
||||
environ['PATH_INFO'] = path_info
|
||||
environ['SCRIPT_NAME'] = ''
|
||||
|
||||
return environ
|
||||
|
||||
|
||||
def make_response(status, headers, payload, environ):
|
||||
"""This function generates an appropriate response object for this async
|
||||
mode.
|
||||
"""
|
||||
headers_dict = {}
|
||||
content_type = None
|
||||
for h in headers:
|
||||
if h[0].lower() == 'content-type':
|
||||
content_type = h[1]
|
||||
else:
|
||||
headers_dict[h[0]] = h[1]
|
||||
return HTTPResponse(body_bytes=payload, content_type=content_type,
|
||||
status=int(status.split()[0]), headers=headers_dict)
|
||||
|
||||
|
||||
class WebSocket(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides a sanic WebSocket interface that is
|
||||
somewhat compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
self._sock = None
|
||||
|
||||
async def __call__(self, environ):
|
||||
request = environ['sanic.request']
|
||||
protocol = request.transport.get_protocol()
|
||||
self._sock = await protocol.websocket_handshake(request)
|
||||
|
||||
self.environ = environ
|
||||
await self.handler(self)
|
||||
|
||||
async def close(self):
|
||||
await self._sock.close()
|
||||
|
||||
async def send(self, message):
|
||||
await self._sock.send(message)
|
||||
|
||||
async def wait(self):
|
||||
data = await self._sock.recv()
|
||||
if not isinstance(data, six.binary_type) and \
|
||||
not isinstance(data, six.text_type):
|
||||
raise IOError()
|
||||
return data
|
||||
|
||||
|
||||
_async = {
|
||||
'asyncio': True,
|
||||
'create_route': create_route,
|
||||
'translate_request': translate_request,
|
||||
'make_response': make_response,
|
||||
'websocket': WebSocket if WebSocketProtocol else None,
|
||||
}
|
||||
@@ -1,17 +0,0 @@
|
||||
from __future__ import absolute_import
|
||||
import threading
|
||||
import time
|
||||
|
||||
try:
|
||||
import queue
|
||||
except ImportError: # pragma: no cover
|
||||
import Queue as queue
|
||||
|
||||
_async = {
|
||||
'thread': threading.Thread,
|
||||
'queue': queue.Queue,
|
||||
'queue_empty': queue.Empty,
|
||||
'event': threading.Event,
|
||||
'websocket': None,
|
||||
'sleep': time.sleep,
|
||||
}
|
||||
@@ -1,184 +0,0 @@
|
||||
import asyncio
|
||||
import sys
|
||||
from urllib.parse import urlsplit
|
||||
from .. import exceptions
|
||||
|
||||
import tornado.web
|
||||
import tornado.websocket
|
||||
import six
|
||||
|
||||
|
||||
def get_tornado_handler(engineio_server):
|
||||
class Handler(tornado.websocket.WebSocketHandler): # pragma: no cover
|
||||
def __init__(self, *args, **kwargs):
|
||||
super().__init__(*args, **kwargs)
|
||||
if isinstance(engineio_server.cors_allowed_origins,
|
||||
six.string_types):
|
||||
if engineio_server.cors_allowed_origins == '*':
|
||||
self.allowed_origins = None
|
||||
else:
|
||||
self.allowed_origins = [
|
||||
engineio_server.cors_allowed_origins]
|
||||
else:
|
||||
self.allowed_origins = engineio_server.cors_allowed_origins
|
||||
self.receive_queue = asyncio.Queue()
|
||||
|
||||
async def get(self, *args, **kwargs):
|
||||
if self.request.headers.get('Upgrade', '').lower() == 'websocket':
|
||||
ret = super().get(*args, **kwargs)
|
||||
if asyncio.iscoroutine(ret):
|
||||
await ret
|
||||
else:
|
||||
await engineio_server.handle_request(self)
|
||||
|
||||
async def open(self, *args, **kwargs):
|
||||
# this is the handler for the websocket request
|
||||
asyncio.ensure_future(engineio_server.handle_request(self))
|
||||
|
||||
async def post(self, *args, **kwargs):
|
||||
await engineio_server.handle_request(self)
|
||||
|
||||
async def options(self, *args, **kwargs):
|
||||
await engineio_server.handle_request(self)
|
||||
|
||||
async def on_message(self, message):
|
||||
await self.receive_queue.put(message)
|
||||
|
||||
async def get_next_message(self):
|
||||
return await self.receive_queue.get()
|
||||
|
||||
def on_close(self):
|
||||
self.receive_queue.put_nowait(None)
|
||||
|
||||
def check_origin(self, origin):
|
||||
if self.allowed_origins is None or origin in self.allowed_origins:
|
||||
return True
|
||||
return super().check_origin(origin)
|
||||
|
||||
def get_compression_options(self):
|
||||
# enable compression
|
||||
return {}
|
||||
|
||||
return Handler
|
||||
|
||||
|
||||
def translate_request(handler):
|
||||
"""This function takes the arguments passed to the request handler and
|
||||
uses them to generate a WSGI compatible environ dictionary.
|
||||
"""
|
||||
class AwaitablePayload(object):
|
||||
def __init__(self, payload):
|
||||
self.payload = payload or b''
|
||||
|
||||
async def read(self, length=None):
|
||||
if length is None:
|
||||
r = self.payload
|
||||
self.payload = b''
|
||||
else:
|
||||
r = self.payload[:length]
|
||||
self.payload = self.payload[length:]
|
||||
return r
|
||||
|
||||
payload = handler.request.body
|
||||
|
||||
uri_parts = urlsplit(handler.request.path)
|
||||
full_uri = handler.request.path
|
||||
if handler.request.query: # pragma: no cover
|
||||
full_uri += '?' + handler.request.query
|
||||
environ = {
|
||||
'wsgi.input': AwaitablePayload(payload),
|
||||
'wsgi.errors': sys.stderr,
|
||||
'wsgi.version': (1, 0),
|
||||
'wsgi.async': True,
|
||||
'wsgi.multithread': False,
|
||||
'wsgi.multiprocess': False,
|
||||
'wsgi.run_once': False,
|
||||
'SERVER_SOFTWARE': 'aiohttp',
|
||||
'REQUEST_METHOD': handler.request.method,
|
||||
'QUERY_STRING': handler.request.query or '',
|
||||
'RAW_URI': full_uri,
|
||||
'SERVER_PROTOCOL': 'HTTP/%s' % handler.request.version,
|
||||
'REMOTE_ADDR': '127.0.0.1',
|
||||
'REMOTE_PORT': '0',
|
||||
'SERVER_NAME': 'aiohttp',
|
||||
'SERVER_PORT': '0',
|
||||
'tornado.handler': handler
|
||||
}
|
||||
|
||||
for hdr_name, hdr_value in handler.request.headers.items():
|
||||
hdr_name = hdr_name.upper()
|
||||
if hdr_name == 'CONTENT-TYPE':
|
||||
environ['CONTENT_TYPE'] = hdr_value
|
||||
continue
|
||||
elif hdr_name == 'CONTENT-LENGTH':
|
||||
environ['CONTENT_LENGTH'] = hdr_value
|
||||
continue
|
||||
|
||||
key = 'HTTP_%s' % hdr_name.replace('-', '_')
|
||||
environ[key] = hdr_value
|
||||
|
||||
environ['wsgi.url_scheme'] = environ.get('HTTP_X_FORWARDED_PROTO', 'http')
|
||||
|
||||
path_info = uri_parts.path
|
||||
|
||||
environ['PATH_INFO'] = path_info
|
||||
environ['SCRIPT_NAME'] = ''
|
||||
|
||||
return environ
|
||||
|
||||
|
||||
def make_response(status, headers, payload, environ):
|
||||
"""This function generates an appropriate response object for this async
|
||||
mode.
|
||||
"""
|
||||
tornado_handler = environ['tornado.handler']
|
||||
try:
|
||||
tornado_handler.set_status(int(status.split()[0]))
|
||||
except RuntimeError: # pragma: no cover
|
||||
# for websocket connections Tornado does not accept a response, since
|
||||
# it already emitted the 101 status code
|
||||
return
|
||||
for header, value in headers:
|
||||
tornado_handler.set_header(header, value)
|
||||
tornado_handler.write(payload)
|
||||
tornado_handler.finish()
|
||||
|
||||
|
||||
class WebSocket(object): # pragma: no cover
|
||||
"""
|
||||
This wrapper class provides a tornado WebSocket interface that is
|
||||
somewhat compatible with eventlet's implementation.
|
||||
"""
|
||||
def __init__(self, handler):
|
||||
self.handler = handler
|
||||
self.tornado_handler = None
|
||||
|
||||
async def __call__(self, environ):
|
||||
self.tornado_handler = environ['tornado.handler']
|
||||
self.environ = environ
|
||||
await self.handler(self)
|
||||
|
||||
async def close(self):
|
||||
self.tornado_handler.close()
|
||||
|
||||
async def send(self, message):
|
||||
try:
|
||||
self.tornado_handler.write_message(
|
||||
message, binary=isinstance(message, bytes))
|
||||
except tornado.websocket.WebSocketClosedError:
|
||||
raise exceptions.EngineIOError()
|
||||
|
||||
async def wait(self):
|
||||
msg = await self.tornado_handler.get_next_message()
|
||||
if not isinstance(msg, six.binary_type) and \
|
||||
not isinstance(msg, six.text_type):
|
||||
raise IOError()
|
||||
return msg
|
||||
|
||||
|
||||
_async = {
|
||||
'asyncio': True,
|
||||
'translate_request': translate_request,
|
||||
'make_response': make_response,
|
||||
'websocket': WebSocket,
|
||||
}
|
||||
@@ -1,585 +0,0 @@
|
||||
import asyncio
|
||||
import ssl
|
||||
|
||||
try:
|
||||
import aiohttp
|
||||
except ImportError: # pragma: no cover
|
||||
aiohttp = None
|
||||
import six
|
||||
|
||||
from . import client
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import payload
|
||||
|
||||
|
||||
class AsyncClient(client.Client):
|
||||
"""An Engine.IO client for asyncio.
|
||||
|
||||
This class implements a fully compliant Engine.IO web client with support
|
||||
for websocket and long-polling transports, compatible with the asyncio
|
||||
framework on Python 3.5 or newer.
|
||||
|
||||
:param logger: To enable logging set to ``True`` or pass a logger object to
|
||||
use. To disable logging set to ``False``. The default is
|
||||
``False``.
|
||||
:param json: An alternative json module to use for encoding and decoding
|
||||
packets. Custom json modules must have ``dumps`` and ``loads``
|
||||
functions that are compatible with the standard library
|
||||
versions.
|
||||
:param request_timeout: A timeout in seconds for requests. The default is
|
||||
5 seconds.
|
||||
:param ssl_verify: ``True`` to verify SSL certificates, or ``False`` to
|
||||
skip SSL certificate verification, allowing
|
||||
connections to servers with self signed certificates.
|
||||
The default is ``True``.
|
||||
"""
|
||||
def is_asyncio_based(self):
|
||||
return True
|
||||
|
||||
async def connect(self, url, headers={}, transports=None,
|
||||
engineio_path='engine.io'):
|
||||
"""Connect to an Engine.IO server.
|
||||
|
||||
:param url: The URL of the Engine.IO server. It can include custom
|
||||
query string parameters if required by the server.
|
||||
:param headers: A dictionary with custom headers to send with the
|
||||
connection request.
|
||||
:param transports: The list of allowed transports. Valid transports
|
||||
are ``'polling'`` and ``'websocket'``. If not
|
||||
given, the polling transport is connected first,
|
||||
then an upgrade to websocket is attempted.
|
||||
:param engineio_path: The endpoint where the Engine.IO server is
|
||||
installed. The default value is appropriate for
|
||||
most cases.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
|
||||
Example usage::
|
||||
|
||||
eio = engineio.Client()
|
||||
await eio.connect('http://localhost:5000')
|
||||
"""
|
||||
if self.state != 'disconnected':
|
||||
raise ValueError('Client is not in a disconnected state')
|
||||
valid_transports = ['polling', 'websocket']
|
||||
if transports is not None:
|
||||
if isinstance(transports, six.text_type):
|
||||
transports = [transports]
|
||||
transports = [transport for transport in transports
|
||||
if transport in valid_transports]
|
||||
if not transports:
|
||||
raise ValueError('No valid transports provided')
|
||||
self.transports = transports or valid_transports
|
||||
self.queue = self.create_queue()
|
||||
return await getattr(self, '_connect_' + self.transports[0])(
|
||||
url, headers, engineio_path)
|
||||
|
||||
async def wait(self):
|
||||
"""Wait until the connection with the server ends.
|
||||
|
||||
Client applications can use this function to block the main thread
|
||||
during the life of the connection.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
if self.read_loop_task:
|
||||
await self.read_loop_task
|
||||
|
||||
async def send(self, data, binary=None):
|
||||
"""Send a message to a client.
|
||||
|
||||
:param data: The data to send to the client. Data can be of type
|
||||
``str``, ``bytes``, ``list`` or ``dict``. If a ``list``
|
||||
or ``dict``, the data will be serialized as JSON.
|
||||
:param binary: ``True`` to send packet as binary, ``False`` to send
|
||||
as text. If not given, unicode (Python 2) and str
|
||||
(Python 3) are sent as text, and str (Python 2) and
|
||||
bytes (Python 3) are sent as binary.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
await self._send_packet(packet.Packet(packet.MESSAGE, data=data,
|
||||
binary=binary))
|
||||
|
||||
async def disconnect(self, abort=False):
|
||||
"""Disconnect from the server.
|
||||
|
||||
:param abort: If set to ``True``, do not wait for background tasks
|
||||
associated with the connection to end.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
if self.state == 'connected':
|
||||
await self._send_packet(packet.Packet(packet.CLOSE))
|
||||
await self.queue.put(None)
|
||||
self.state = 'disconnecting'
|
||||
await self._trigger_event('disconnect', run_async=False)
|
||||
if self.current_transport == 'websocket':
|
||||
await self.ws.close()
|
||||
if not abort:
|
||||
await self.read_loop_task
|
||||
self.state = 'disconnected'
|
||||
try:
|
||||
client.connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
|
||||
def start_background_task(self, target, *args, **kwargs):
|
||||
"""Start a background task.
|
||||
|
||||
This is a utility function that applications can use to start a
|
||||
background task.
|
||||
|
||||
:param target: the target function to execute.
|
||||
:param args: arguments to pass to the function.
|
||||
:param kwargs: keyword arguments to pass to the function.
|
||||
|
||||
This function returns an object compatible with the `Thread` class in
|
||||
the Python standard library. The `start()` method on this object is
|
||||
already called by this function.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
return asyncio.ensure_future(target(*args, **kwargs))
|
||||
|
||||
async def sleep(self, seconds=0):
|
||||
"""Sleep for the requested amount of time.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
return await asyncio.sleep(seconds)
|
||||
|
||||
def create_queue(self):
|
||||
"""Create a queue object."""
|
||||
q = asyncio.Queue()
|
||||
q.Empty = asyncio.QueueEmpty
|
||||
return q
|
||||
|
||||
def create_event(self):
|
||||
"""Create an event object."""
|
||||
return asyncio.Event()
|
||||
|
||||
def _reset(self):
|
||||
if self.http: # pragma: no cover
|
||||
asyncio.ensure_future(self.http.close())
|
||||
super()._reset()
|
||||
|
||||
async def _connect_polling(self, url, headers, engineio_path):
|
||||
"""Establish a long-polling connection to the Engine.IO server."""
|
||||
if aiohttp is None: # pragma: no cover
|
||||
self.logger.error('aiohttp not installed -- cannot make HTTP '
|
||||
'requests!')
|
||||
return
|
||||
self.base_url = self._get_engineio_url(url, engineio_path, 'polling')
|
||||
self.logger.info('Attempting polling connection to ' + self.base_url)
|
||||
r = await self._send_request(
|
||||
'GET', self.base_url + self._get_url_timestamp(), headers=headers,
|
||||
timeout=self.request_timeout)
|
||||
if r is None:
|
||||
self._reset()
|
||||
raise exceptions.ConnectionError(
|
||||
'Connection refused by the server')
|
||||
if r.status < 200 or r.status >= 300:
|
||||
raise exceptions.ConnectionError(
|
||||
'Unexpected status code {} in server response'.format(
|
||||
r.status))
|
||||
try:
|
||||
p = payload.Payload(encoded_payload=await r.read())
|
||||
except ValueError:
|
||||
six.raise_from(exceptions.ConnectionError(
|
||||
'Unexpected response from server'), None)
|
||||
open_packet = p.packets[0]
|
||||
if open_packet.packet_type != packet.OPEN:
|
||||
raise exceptions.ConnectionError(
|
||||
'OPEN packet not returned by server')
|
||||
self.logger.info(
|
||||
'Polling connection accepted with ' + str(open_packet.data))
|
||||
self.sid = open_packet.data['sid']
|
||||
self.upgrades = open_packet.data['upgrades']
|
||||
self.ping_interval = open_packet.data['pingInterval'] / 1000.0
|
||||
self.ping_timeout = open_packet.data['pingTimeout'] / 1000.0
|
||||
self.current_transport = 'polling'
|
||||
self.base_url += '&sid=' + self.sid
|
||||
|
||||
self.state = 'connected'
|
||||
client.connected_clients.append(self)
|
||||
await self._trigger_event('connect', run_async=False)
|
||||
|
||||
for pkt in p.packets[1:]:
|
||||
await self._receive_packet(pkt)
|
||||
|
||||
if 'websocket' in self.upgrades and 'websocket' in self.transports:
|
||||
# attempt to upgrade to websocket
|
||||
if await self._connect_websocket(url, headers, engineio_path):
|
||||
# upgrade to websocket succeeded, we're done here
|
||||
return
|
||||
|
||||
self.ping_loop_task = self.start_background_task(self._ping_loop)
|
||||
self.write_loop_task = self.start_background_task(self._write_loop)
|
||||
self.read_loop_task = self.start_background_task(
|
||||
self._read_loop_polling)
|
||||
|
||||
async def _connect_websocket(self, url, headers, engineio_path):
|
||||
"""Establish or upgrade to a WebSocket connection with the server."""
|
||||
if aiohttp is None: # pragma: no cover
|
||||
self.logger.error('aiohttp package not installed')
|
||||
return False
|
||||
websocket_url = self._get_engineio_url(url, engineio_path,
|
||||
'websocket')
|
||||
if self.sid:
|
||||
self.logger.info(
|
||||
'Attempting WebSocket upgrade to ' + websocket_url)
|
||||
upgrade = True
|
||||
websocket_url += '&sid=' + self.sid
|
||||
else:
|
||||
upgrade = False
|
||||
self.base_url = websocket_url
|
||||
self.logger.info(
|
||||
'Attempting WebSocket connection to ' + websocket_url)
|
||||
|
||||
if self.http is None or self.http.closed: # pragma: no cover
|
||||
self.http = aiohttp.ClientSession()
|
||||
|
||||
try:
|
||||
if not self.ssl_verify:
|
||||
ssl_context = ssl.create_default_context()
|
||||
ssl_context.check_hostname = False
|
||||
ssl_context.verify_mode = ssl.CERT_NONE
|
||||
ws = await self.http.ws_connect(
|
||||
websocket_url + self._get_url_timestamp(),
|
||||
headers=headers, ssl=ssl_context)
|
||||
else:
|
||||
ws = await self.http.ws_connect(
|
||||
websocket_url + self._get_url_timestamp(),
|
||||
headers=headers)
|
||||
except (aiohttp.client_exceptions.WSServerHandshakeError,
|
||||
aiohttp.client_exceptions.ServerConnectionError):
|
||||
if upgrade:
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: connection error')
|
||||
return False
|
||||
else:
|
||||
raise exceptions.ConnectionError('Connection error')
|
||||
if upgrade:
|
||||
p = packet.Packet(packet.PING, data='probe').encode(
|
||||
always_bytes=False)
|
||||
try:
|
||||
await ws.send_str(p)
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected send exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
try:
|
||||
p = (await ws.receive()).data
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected recv exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
if pkt.packet_type != packet.PONG or pkt.data != 'probe':
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: no PONG packet')
|
||||
return False
|
||||
p = packet.Packet(packet.UPGRADE).encode(always_bytes=False)
|
||||
try:
|
||||
await ws.send_str(p)
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected send exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
self.current_transport = 'websocket'
|
||||
self.logger.info('WebSocket upgrade was successful')
|
||||
else:
|
||||
try:
|
||||
p = (await ws.receive()).data
|
||||
except Exception as e: # pragma: no cover
|
||||
raise exceptions.ConnectionError(
|
||||
'Unexpected recv exception: ' + str(e))
|
||||
open_packet = packet.Packet(encoded_packet=p)
|
||||
if open_packet.packet_type != packet.OPEN:
|
||||
raise exceptions.ConnectionError('no OPEN packet')
|
||||
self.logger.info(
|
||||
'WebSocket connection accepted with ' + str(open_packet.data))
|
||||
self.sid = open_packet.data['sid']
|
||||
self.upgrades = open_packet.data['upgrades']
|
||||
self.ping_interval = open_packet.data['pingInterval'] / 1000.0
|
||||
self.ping_timeout = open_packet.data['pingTimeout'] / 1000.0
|
||||
self.current_transport = 'websocket'
|
||||
|
||||
self.state = 'connected'
|
||||
client.connected_clients.append(self)
|
||||
await self._trigger_event('connect', run_async=False)
|
||||
|
||||
self.ws = ws
|
||||
self.ping_loop_task = self.start_background_task(self._ping_loop)
|
||||
self.write_loop_task = self.start_background_task(self._write_loop)
|
||||
self.read_loop_task = self.start_background_task(
|
||||
self._read_loop_websocket)
|
||||
return True
|
||||
|
||||
async def _receive_packet(self, pkt):
|
||||
"""Handle incoming packets from the server."""
|
||||
packet_name = packet.packet_names[pkt.packet_type] \
|
||||
if pkt.packet_type < len(packet.packet_names) else 'UNKNOWN'
|
||||
self.logger.info(
|
||||
'Received packet %s data %s', packet_name,
|
||||
pkt.data if not isinstance(pkt.data, bytes) else '<binary>')
|
||||
if pkt.packet_type == packet.MESSAGE:
|
||||
await self._trigger_event('message', pkt.data, run_async=True)
|
||||
elif pkt.packet_type == packet.PONG:
|
||||
self.pong_received = True
|
||||
elif pkt.packet_type == packet.CLOSE:
|
||||
await self.disconnect(abort=True)
|
||||
elif pkt.packet_type == packet.NOOP:
|
||||
pass
|
||||
else:
|
||||
self.logger.error('Received unexpected packet of type %s',
|
||||
pkt.packet_type)
|
||||
|
||||
async def _send_packet(self, pkt):
|
||||
"""Queue a packet to be sent to the server."""
|
||||
if self.state != 'connected':
|
||||
return
|
||||
await self.queue.put(pkt)
|
||||
self.logger.info(
|
||||
'Sending packet %s data %s',
|
||||
packet.packet_names[pkt.packet_type],
|
||||
pkt.data if not isinstance(pkt.data, bytes) else '<binary>')
|
||||
|
||||
async def _send_request(
|
||||
self, method, url, headers=None, body=None,
|
||||
timeout=None): # pragma: no cover
|
||||
if self.http is None or self.http.closed:
|
||||
self.http = aiohttp.ClientSession()
|
||||
http_method = getattr(self.http, method.lower())
|
||||
|
||||
try:
|
||||
if not self.ssl_verify:
|
||||
return await http_method(
|
||||
url, headers=headers, data=body,
|
||||
timeout=aiohttp.ClientTimeout(total=timeout), ssl=False)
|
||||
else:
|
||||
return await http_method(
|
||||
url, headers=headers, data=body,
|
||||
timeout=aiohttp.ClientTimeout(total=timeout))
|
||||
|
||||
except (aiohttp.ClientError, asyncio.TimeoutError) as exc:
|
||||
self.logger.info('HTTP %s request to %s failed with error %s.',
|
||||
method, url, exc)
|
||||
|
||||
async def _trigger_event(self, event, *args, **kwargs):
|
||||
"""Invoke an event handler."""
|
||||
run_async = kwargs.pop('run_async', False)
|
||||
ret = None
|
||||
if event in self.handlers:
|
||||
if asyncio.iscoroutinefunction(self.handlers[event]) is True:
|
||||
if run_async:
|
||||
return self.start_background_task(self.handlers[event],
|
||||
*args)
|
||||
else:
|
||||
try:
|
||||
ret = await self.handlers[event](*args)
|
||||
except asyncio.CancelledError: # pragma: no cover
|
||||
pass
|
||||
except:
|
||||
self.logger.exception(event + ' async handler error')
|
||||
if event == 'connect':
|
||||
# if connect handler raised error we reject the
|
||||
# connection
|
||||
return False
|
||||
else:
|
||||
if run_async:
|
||||
async def async_handler():
|
||||
return self.handlers[event](*args)
|
||||
|
||||
return self.start_background_task(async_handler)
|
||||
else:
|
||||
try:
|
||||
ret = self.handlers[event](*args)
|
||||
except:
|
||||
self.logger.exception(event + ' handler error')
|
||||
if event == 'connect':
|
||||
# if connect handler raised error we reject the
|
||||
# connection
|
||||
return False
|
||||
return ret
|
||||
|
||||
async def _ping_loop(self):
|
||||
"""This background task sends a PING to the server at the requested
|
||||
interval.
|
||||
"""
|
||||
self.pong_received = True
|
||||
if self.ping_loop_event is None:
|
||||
self.ping_loop_event = self.create_event()
|
||||
else:
|
||||
self.ping_loop_event.clear()
|
||||
while self.state == 'connected':
|
||||
if not self.pong_received:
|
||||
self.logger.info(
|
||||
'PONG response has not been received, aborting')
|
||||
if self.ws:
|
||||
await self.ws.close()
|
||||
await self.queue.put(None)
|
||||
break
|
||||
self.pong_received = False
|
||||
await self._send_packet(packet.Packet(packet.PING))
|
||||
try:
|
||||
await asyncio.wait_for(self.ping_loop_event.wait(),
|
||||
self.ping_interval)
|
||||
except (asyncio.TimeoutError,
|
||||
asyncio.CancelledError): # pragma: no cover
|
||||
pass
|
||||
self.logger.info('Exiting ping task')
|
||||
|
||||
async def _read_loop_polling(self):
|
||||
"""Read packets by polling the Engine.IO server."""
|
||||
while self.state == 'connected':
|
||||
self.logger.info(
|
||||
'Sending polling GET request to ' + self.base_url)
|
||||
r = await self._send_request(
|
||||
'GET', self.base_url + self._get_url_timestamp(),
|
||||
timeout=max(self.ping_interval, self.ping_timeout) + 5)
|
||||
if r is None:
|
||||
self.logger.warning(
|
||||
'Connection refused by the server, aborting')
|
||||
await self.queue.put(None)
|
||||
break
|
||||
if r.status < 200 or r.status >= 300:
|
||||
self.logger.warning('Unexpected status code %s in server '
|
||||
'response, aborting', r.status)
|
||||
await self.queue.put(None)
|
||||
break
|
||||
try:
|
||||
p = payload.Payload(encoded_payload=await r.read())
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
'Unexpected packet from server, aborting')
|
||||
await self.queue.put(None)
|
||||
break
|
||||
for pkt in p.packets:
|
||||
await self._receive_packet(pkt)
|
||||
|
||||
self.logger.info('Waiting for write loop task to end')
|
||||
await self.write_loop_task
|
||||
self.logger.info('Waiting for ping loop task to end')
|
||||
if self.ping_loop_event: # pragma: no cover
|
||||
self.ping_loop_event.set()
|
||||
await self.ping_loop_task
|
||||
if self.state == 'connected':
|
||||
await self._trigger_event('disconnect', run_async=False)
|
||||
try:
|
||||
client.connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
self.logger.info('Exiting read loop task')
|
||||
|
||||
async def _read_loop_websocket(self):
|
||||
"""Read packets from the Engine.IO WebSocket connection."""
|
||||
while self.state == 'connected':
|
||||
p = None
|
||||
try:
|
||||
p = (await self.ws.receive()).data
|
||||
if p is None: # pragma: no cover
|
||||
raise RuntimeError('WebSocket read returned None')
|
||||
except aiohttp.client_exceptions.ServerDisconnectedError:
|
||||
self.logger.info(
|
||||
'Read loop: WebSocket connection was closed, aborting')
|
||||
await self.queue.put(None)
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.info(
|
||||
'Unexpected error "%s", aborting', str(e))
|
||||
await self.queue.put(None)
|
||||
break
|
||||
if isinstance(p, six.text_type): # pragma: no cover
|
||||
p = p.encode('utf-8')
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
await self._receive_packet(pkt)
|
||||
|
||||
self.logger.info('Waiting for write loop task to end')
|
||||
await self.write_loop_task
|
||||
self.logger.info('Waiting for ping loop task to end')
|
||||
if self.ping_loop_event: # pragma: no cover
|
||||
self.ping_loop_event.set()
|
||||
await self.ping_loop_task
|
||||
if self.state == 'connected':
|
||||
await self._trigger_event('disconnect', run_async=False)
|
||||
try:
|
||||
client.connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
self.logger.info('Exiting read loop task')
|
||||
|
||||
async def _write_loop(self):
|
||||
"""This background task sends packages to the server as they are
|
||||
pushed to the send queue.
|
||||
"""
|
||||
while self.state == 'connected':
|
||||
# to simplify the timeout handling, use the maximum of the
|
||||
# ping interval and ping timeout as timeout, with an extra 5
|
||||
# seconds grace period
|
||||
timeout = max(self.ping_interval, self.ping_timeout) + 5
|
||||
packets = None
|
||||
try:
|
||||
packets = [await asyncio.wait_for(self.queue.get(), timeout)]
|
||||
except (self.queue.Empty, asyncio.TimeoutError,
|
||||
asyncio.CancelledError):
|
||||
self.logger.error('packet queue is empty, aborting')
|
||||
break
|
||||
if packets == [None]:
|
||||
self.queue.task_done()
|
||||
packets = []
|
||||
else:
|
||||
while True:
|
||||
try:
|
||||
packets.append(self.queue.get_nowait())
|
||||
except self.queue.Empty:
|
||||
break
|
||||
if packets[-1] is None:
|
||||
packets = packets[:-1]
|
||||
self.queue.task_done()
|
||||
break
|
||||
if not packets:
|
||||
# empty packet list returned -> connection closed
|
||||
break
|
||||
if self.current_transport == 'polling':
|
||||
p = payload.Payload(packets=packets)
|
||||
r = await self._send_request(
|
||||
'POST', self.base_url, body=p.encode(),
|
||||
headers={'Content-Type': 'application/octet-stream'},
|
||||
timeout=self.request_timeout)
|
||||
for pkt in packets:
|
||||
self.queue.task_done()
|
||||
if r is None:
|
||||
self.logger.warning(
|
||||
'Connection refused by the server, aborting')
|
||||
break
|
||||
if r.status < 200 or r.status >= 300:
|
||||
self.logger.warning('Unexpected status code %s in server '
|
||||
'response, aborting', r.status)
|
||||
self._reset()
|
||||
break
|
||||
else:
|
||||
# websocket
|
||||
try:
|
||||
for pkt in packets:
|
||||
if pkt.binary:
|
||||
await self.ws.send_bytes(pkt.encode(
|
||||
always_bytes=False))
|
||||
else:
|
||||
await self.ws.send_str(pkt.encode(
|
||||
always_bytes=False))
|
||||
self.queue.task_done()
|
||||
except aiohttp.client_exceptions.ServerDisconnectedError:
|
||||
self.logger.info(
|
||||
'Write loop: WebSocket connection was closed, '
|
||||
'aborting')
|
||||
break
|
||||
self.logger.info('Exiting write loop task')
|
||||
@@ -1,472 +0,0 @@
|
||||
import asyncio
|
||||
|
||||
import six
|
||||
from six.moves import urllib
|
||||
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import server
|
||||
from . import asyncio_socket
|
||||
|
||||
|
||||
class AsyncServer(server.Server):
|
||||
"""An Engine.IO server for asyncio.
|
||||
|
||||
This class implements a fully compliant Engine.IO web server with support
|
||||
for websocket and long-polling transports, compatible with the asyncio
|
||||
framework on Python 3.5 or newer.
|
||||
|
||||
:param async_mode: The asynchronous model to use. See the Deployment
|
||||
section in the documentation for a description of the
|
||||
available options. Valid async modes are "aiohttp",
|
||||
"sanic", "tornado" and "asgi". If this argument is not
|
||||
given, "aiohttp" is tried first, followed by "sanic",
|
||||
"tornado", and finally "asgi". The first async mode that
|
||||
has all its dependencies installed is the one that is
|
||||
chosen.
|
||||
:param ping_timeout: The time in seconds that the client waits for the
|
||||
server to respond before disconnecting.
|
||||
:param ping_interval: The interval in seconds at which the client pings
|
||||
the server. The default is 25 seconds. For advanced
|
||||
control, a two element tuple can be given, where
|
||||
the first number is the ping interval and the second
|
||||
is a grace period added by the server. The default
|
||||
grace period is 5 seconds.
|
||||
:param max_http_buffer_size: The maximum size of a message when using the
|
||||
polling transport.
|
||||
:param allow_upgrades: Whether to allow transport upgrades or not.
|
||||
:param http_compression: Whether to compress packages when using the
|
||||
polling transport.
|
||||
:param compression_threshold: Only compress messages when their byte size
|
||||
is greater than this value.
|
||||
:param cookie: Name of the HTTP cookie that contains the client session
|
||||
id. If set to ``None``, a cookie is not sent to the client.
|
||||
:param cors_allowed_origins: Origin or list of origins that are allowed to
|
||||
connect to this server. Only the same origin
|
||||
is allowed by default. Set this argument to
|
||||
``'*'`` to allow all origins, or to ``[]`` to
|
||||
disable CORS handling.
|
||||
:param cors_credentials: Whether credentials (cookies, authentication) are
|
||||
allowed in requests to this server.
|
||||
:param logger: To enable logging set to ``True`` or pass a logger object to
|
||||
use. To disable logging set to ``False``.
|
||||
:param json: An alternative json module to use for encoding and decoding
|
||||
packets. Custom json modules must have ``dumps`` and ``loads``
|
||||
functions that are compatible with the standard library
|
||||
versions.
|
||||
:param async_handlers: If set to ``True``, run message event handlers in
|
||||
non-blocking threads. To run handlers synchronously,
|
||||
set to ``False``. The default is ``True``.
|
||||
:param kwargs: Reserved for future extensions, any additional parameters
|
||||
given as keyword arguments will be silently ignored.
|
||||
"""
|
||||
def is_asyncio_based(self):
|
||||
return True
|
||||
|
||||
def async_modes(self):
|
||||
return ['aiohttp', 'sanic', 'tornado', 'asgi']
|
||||
|
||||
def attach(self, app, engineio_path='engine.io'):
|
||||
"""Attach the Engine.IO server to an application."""
|
||||
engineio_path = engineio_path.strip('/')
|
||||
self._async['create_route'](app, self, '/{}/'.format(engineio_path))
|
||||
|
||||
async def send(self, sid, data, binary=None):
|
||||
"""Send a message to a client.
|
||||
|
||||
:param sid: The session id of the recipient client.
|
||||
:param data: The data to send to the client. Data can be of type
|
||||
``str``, ``bytes``, ``list`` or ``dict``. If a ``list``
|
||||
or ``dict``, the data will be serialized as JSON.
|
||||
:param binary: ``True`` to send packet as binary, ``False`` to send
|
||||
as text. If not given, unicode (Python 2) and str
|
||||
(Python 3) are sent as text, and str (Python 2) and
|
||||
bytes (Python 3) are sent as binary.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
try:
|
||||
socket = self._get_socket(sid)
|
||||
except KeyError:
|
||||
# the socket is not available
|
||||
self.logger.warning('Cannot send to sid %s', sid)
|
||||
return
|
||||
await socket.send(packet.Packet(packet.MESSAGE, data=data,
|
||||
binary=binary))
|
||||
|
||||
async def get_session(self, sid):
|
||||
"""Return the user session for a client.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
|
||||
The return value is a dictionary. Modifications made to this
|
||||
dictionary are not guaranteed to be preserved. If you want to modify
|
||||
the user session, use the ``session`` context manager instead.
|
||||
"""
|
||||
socket = self._get_socket(sid)
|
||||
return socket.session
|
||||
|
||||
async def save_session(self, sid, session):
|
||||
"""Store the user session for a client.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
:param session: The session dictionary.
|
||||
"""
|
||||
socket = self._get_socket(sid)
|
||||
socket.session = session
|
||||
|
||||
def session(self, sid):
|
||||
"""Return the user session for a client with context manager syntax.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
|
||||
This is a context manager that returns the user session dictionary for
|
||||
the client. Any changes that are made to this dictionary inside the
|
||||
context manager block are saved back to the session. Example usage::
|
||||
|
||||
@eio.on('connect')
|
||||
def on_connect(sid, environ):
|
||||
username = authenticate_user(environ)
|
||||
if not username:
|
||||
return False
|
||||
with eio.session(sid) as session:
|
||||
session['username'] = username
|
||||
|
||||
@eio.on('message')
|
||||
def on_message(sid, msg):
|
||||
async with eio.session(sid) as session:
|
||||
print('received message from ', session['username'])
|
||||
"""
|
||||
class _session_context_manager(object):
|
||||
def __init__(self, server, sid):
|
||||
self.server = server
|
||||
self.sid = sid
|
||||
self.session = None
|
||||
|
||||
async def __aenter__(self):
|
||||
self.session = await self.server.get_session(sid)
|
||||
return self.session
|
||||
|
||||
async def __aexit__(self, *args):
|
||||
await self.server.save_session(sid, self.session)
|
||||
|
||||
return _session_context_manager(self, sid)
|
||||
|
||||
async def disconnect(self, sid=None):
|
||||
"""Disconnect a client.
|
||||
|
||||
:param sid: The session id of the client to close. If this parameter
|
||||
is not given, then all clients are closed.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
if sid is not None:
|
||||
try:
|
||||
socket = self._get_socket(sid)
|
||||
except KeyError: # pragma: no cover
|
||||
# the socket was already closed or gone
|
||||
pass
|
||||
else:
|
||||
await socket.close()
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
del self.sockets[sid]
|
||||
else:
|
||||
await asyncio.wait([client.close()
|
||||
for client in six.itervalues(self.sockets)])
|
||||
self.sockets = {}
|
||||
|
||||
async def handle_request(self, *args, **kwargs):
|
||||
"""Handle an HTTP request from the client.
|
||||
|
||||
This is the entry point of the Engine.IO application. This function
|
||||
returns the HTTP response to deliver to the client.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
translate_request = self._async['translate_request']
|
||||
if asyncio.iscoroutinefunction(translate_request):
|
||||
environ = await translate_request(*args, **kwargs)
|
||||
else:
|
||||
environ = translate_request(*args, **kwargs)
|
||||
|
||||
if self.cors_allowed_origins != []:
|
||||
# Validate the origin header if present
|
||||
# This is important for WebSocket more than for HTTP, since
|
||||
# browsers only apply CORS controls to HTTP.
|
||||
origin = environ.get('HTTP_ORIGIN')
|
||||
if origin:
|
||||
allowed_origins = self._cors_allowed_origins(environ)
|
||||
if allowed_origins is not None and origin not in \
|
||||
allowed_origins:
|
||||
self.logger.info(origin + ' is not an accepted origin.')
|
||||
r = self._bad_request()
|
||||
make_response = self._async['make_response']
|
||||
if asyncio.iscoroutinefunction(make_response):
|
||||
response = await make_response(
|
||||
r['status'], r['headers'], r['response'], environ)
|
||||
else:
|
||||
response = make_response(r['status'], r['headers'],
|
||||
r['response'], environ)
|
||||
return response
|
||||
|
||||
method = environ['REQUEST_METHOD']
|
||||
query = urllib.parse.parse_qs(environ.get('QUERY_STRING', ''))
|
||||
|
||||
sid = query['sid'][0] if 'sid' in query else None
|
||||
b64 = False
|
||||
jsonp = False
|
||||
jsonp_index = None
|
||||
|
||||
if 'b64' in query:
|
||||
if query['b64'][0] == "1" or query['b64'][0].lower() == "true":
|
||||
b64 = True
|
||||
if 'j' in query:
|
||||
jsonp = True
|
||||
try:
|
||||
jsonp_index = int(query['j'][0])
|
||||
except (ValueError, KeyError, IndexError):
|
||||
# Invalid JSONP index number
|
||||
pass
|
||||
|
||||
if jsonp and jsonp_index is None:
|
||||
self.logger.warning('Invalid JSONP index number')
|
||||
r = self._bad_request()
|
||||
elif method == 'GET':
|
||||
if sid is None:
|
||||
transport = query.get('transport', ['polling'])[0]
|
||||
if transport != 'polling' and transport != 'websocket':
|
||||
self.logger.warning('Invalid transport %s', transport)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
r = await self._handle_connect(environ, transport,
|
||||
b64, jsonp_index)
|
||||
else:
|
||||
if sid not in self.sockets:
|
||||
self.logger.warning('Invalid session %s', sid)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
socket = self._get_socket(sid)
|
||||
try:
|
||||
packets = await socket.handle_get_request(environ)
|
||||
if isinstance(packets, list):
|
||||
r = self._ok(packets, b64=b64,
|
||||
jsonp_index=jsonp_index)
|
||||
else:
|
||||
r = packets
|
||||
except exceptions.EngineIOError:
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
await self.disconnect(sid)
|
||||
r = self._bad_request()
|
||||
if sid in self.sockets and self.sockets[sid].closed:
|
||||
del self.sockets[sid]
|
||||
elif method == 'POST':
|
||||
if sid is None or sid not in self.sockets:
|
||||
self.logger.warning('Invalid session %s', sid)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
socket = self._get_socket(sid)
|
||||
try:
|
||||
await socket.handle_post_request(environ)
|
||||
r = self._ok(jsonp_index=jsonp_index)
|
||||
except exceptions.EngineIOError:
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
await self.disconnect(sid)
|
||||
r = self._bad_request()
|
||||
except: # pragma: no cover
|
||||
# for any other unexpected errors, we log the error
|
||||
# and keep going
|
||||
self.logger.exception('post request handler error')
|
||||
r = self._ok(jsonp_index=jsonp_index)
|
||||
elif method == 'OPTIONS':
|
||||
r = self._ok()
|
||||
else:
|
||||
self.logger.warning('Method %s not supported', method)
|
||||
r = self._method_not_found()
|
||||
if not isinstance(r, dict):
|
||||
return r
|
||||
if self.http_compression and \
|
||||
len(r['response']) >= self.compression_threshold:
|
||||
encodings = [e.split(';')[0].strip() for e in
|
||||
environ.get('HTTP_ACCEPT_ENCODING', '').split(',')]
|
||||
for encoding in encodings:
|
||||
if encoding in self.compression_methods:
|
||||
r['response'] = \
|
||||
getattr(self, '_' + encoding)(r['response'])
|
||||
r['headers'] += [('Content-Encoding', encoding)]
|
||||
break
|
||||
cors_headers = self._cors_headers(environ)
|
||||
make_response = self._async['make_response']
|
||||
if asyncio.iscoroutinefunction(make_response):
|
||||
response = await make_response(r['status'],
|
||||
r['headers'] + cors_headers,
|
||||
r['response'], environ)
|
||||
else:
|
||||
response = make_response(r['status'], r['headers'] + cors_headers,
|
||||
r['response'], environ)
|
||||
return response
|
||||
|
||||
def start_background_task(self, target, *args, **kwargs):
|
||||
"""Start a background task using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to start a
|
||||
background task using the method that is compatible with the
|
||||
selected async mode.
|
||||
|
||||
:param target: the target function to execute.
|
||||
:param args: arguments to pass to the function.
|
||||
:param kwargs: keyword arguments to pass to the function.
|
||||
|
||||
The return value is a ``asyncio.Task`` object.
|
||||
"""
|
||||
return asyncio.ensure_future(target(*args, **kwargs))
|
||||
|
||||
async def sleep(self, seconds=0):
|
||||
"""Sleep for the requested amount of time using the appropriate async
|
||||
model.
|
||||
|
||||
This is a utility function that applications can use to put a task to
|
||||
sleep without having to worry about using the correct call for the
|
||||
selected async mode.
|
||||
|
||||
Note: this method is a coroutine.
|
||||
"""
|
||||
return await asyncio.sleep(seconds)
|
||||
|
||||
def create_queue(self, *args, **kwargs):
|
||||
"""Create a queue object using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to create a queue
|
||||
without having to worry about using the correct call for the selected
|
||||
async mode. For asyncio based async modes, this returns an instance of
|
||||
``asyncio.Queue``.
|
||||
"""
|
||||
return asyncio.Queue(*args, **kwargs)
|
||||
|
||||
def get_queue_empty_exception(self):
|
||||
"""Return the queue empty exception for the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to work with a
|
||||
queue without having to worry about using the correct call for the
|
||||
selected async mode. For asyncio based async modes, this returns an
|
||||
instance of ``asyncio.QueueEmpty``.
|
||||
"""
|
||||
return asyncio.QueueEmpty
|
||||
|
||||
def create_event(self, *args, **kwargs):
|
||||
"""Create an event object using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to create an
|
||||
event without having to worry about using the correct call for the
|
||||
selected async mode. For asyncio based async modes, this returns
|
||||
an instance of ``asyncio.Event``.
|
||||
"""
|
||||
return asyncio.Event(*args, **kwargs)
|
||||
|
||||
async def _handle_connect(self, environ, transport, b64=False,
|
||||
jsonp_index=None):
|
||||
"""Handle a client connection request."""
|
||||
if self.start_service_task:
|
||||
# start the service task to monitor connected clients
|
||||
self.start_service_task = False
|
||||
self.start_background_task(self._service_task)
|
||||
|
||||
sid = self._generate_id()
|
||||
s = asyncio_socket.AsyncSocket(self, sid)
|
||||
self.sockets[sid] = s
|
||||
|
||||
pkt = packet.Packet(
|
||||
packet.OPEN, {'sid': sid,
|
||||
'upgrades': self._upgrades(sid, transport),
|
||||
'pingTimeout': int(self.ping_timeout * 1000),
|
||||
'pingInterval': int(self.ping_interval * 1000)})
|
||||
await s.send(pkt)
|
||||
|
||||
ret = await self._trigger_event('connect', sid, environ,
|
||||
run_async=False)
|
||||
if ret is False:
|
||||
del self.sockets[sid]
|
||||
self.logger.warning('Application rejected connection')
|
||||
return self._unauthorized()
|
||||
|
||||
if transport == 'websocket':
|
||||
ret = await s.handle_get_request(environ)
|
||||
if s.closed:
|
||||
# websocket connection ended, so we are done
|
||||
del self.sockets[sid]
|
||||
return ret
|
||||
else:
|
||||
s.connected = True
|
||||
headers = None
|
||||
if self.cookie:
|
||||
headers = [('Set-Cookie', self.cookie + '=' + sid)]
|
||||
try:
|
||||
return self._ok(await s.poll(), headers=headers, b64=b64,
|
||||
jsonp_index=jsonp_index)
|
||||
except exceptions.QueueEmpty:
|
||||
return self._bad_request()
|
||||
|
||||
async def _trigger_event(self, event, *args, **kwargs):
|
||||
"""Invoke an event handler."""
|
||||
run_async = kwargs.pop('run_async', False)
|
||||
ret = None
|
||||
if event in self.handlers:
|
||||
if asyncio.iscoroutinefunction(self.handlers[event]) is True:
|
||||
if run_async:
|
||||
return self.start_background_task(self.handlers[event],
|
||||
*args)
|
||||
else:
|
||||
try:
|
||||
ret = await self.handlers[event](*args)
|
||||
except asyncio.CancelledError: # pragma: no cover
|
||||
pass
|
||||
except:
|
||||
self.logger.exception(event + ' async handler error')
|
||||
if event == 'connect':
|
||||
# if connect handler raised error we reject the
|
||||
# connection
|
||||
return False
|
||||
else:
|
||||
if run_async:
|
||||
async def async_handler():
|
||||
return self.handlers[event](*args)
|
||||
|
||||
return self.start_background_task(async_handler)
|
||||
else:
|
||||
try:
|
||||
ret = self.handlers[event](*args)
|
||||
except:
|
||||
self.logger.exception(event + ' handler error')
|
||||
if event == 'connect':
|
||||
# if connect handler raised error we reject the
|
||||
# connection
|
||||
return False
|
||||
return ret
|
||||
|
||||
async def _service_task(self): # pragma: no cover
|
||||
"""Monitor connected clients and clean up those that time out."""
|
||||
while True:
|
||||
if len(self.sockets) == 0:
|
||||
# nothing to do
|
||||
await self.sleep(self.ping_timeout)
|
||||
continue
|
||||
|
||||
# go through the entire client list in a ping interval cycle
|
||||
sleep_interval = self.ping_timeout / len(self.sockets)
|
||||
|
||||
try:
|
||||
# iterate over the current clients
|
||||
for socket in self.sockets.copy().values():
|
||||
if not socket.closing and not socket.closed:
|
||||
await socket.check_ping_timeout()
|
||||
await self.sleep(sleep_interval)
|
||||
except (SystemExit, KeyboardInterrupt, asyncio.CancelledError):
|
||||
self.logger.info('service task canceled')
|
||||
break
|
||||
except:
|
||||
if asyncio.get_event_loop().is_closed():
|
||||
self.logger.info('event loop is closed, exiting service '
|
||||
'task')
|
||||
break
|
||||
|
||||
# an unexpected exception has occurred, log it and continue
|
||||
self.logger.exception('service task exception')
|
||||
@@ -1,236 +0,0 @@
|
||||
import asyncio
|
||||
import six
|
||||
import sys
|
||||
import time
|
||||
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import payload
|
||||
from . import socket
|
||||
|
||||
|
||||
class AsyncSocket(socket.Socket):
|
||||
async def poll(self):
|
||||
"""Wait for packets to send to the client."""
|
||||
try:
|
||||
packets = [await asyncio.wait_for(self.queue.get(),
|
||||
self.server.ping_timeout)]
|
||||
self.queue.task_done()
|
||||
except (asyncio.TimeoutError, asyncio.CancelledError):
|
||||
raise exceptions.QueueEmpty()
|
||||
if packets == [None]:
|
||||
return []
|
||||
try:
|
||||
packets.append(self.queue.get_nowait())
|
||||
self.queue.task_done()
|
||||
except asyncio.QueueEmpty:
|
||||
pass
|
||||
return packets
|
||||
|
||||
async def receive(self, pkt):
|
||||
"""Receive packet from the client."""
|
||||
self.server.logger.info('%s: Received packet %s data %s',
|
||||
self.sid, packet.packet_names[pkt.packet_type],
|
||||
pkt.data if not isinstance(pkt.data, bytes)
|
||||
else '<binary>')
|
||||
if pkt.packet_type == packet.PING:
|
||||
self.last_ping = time.time()
|
||||
await self.send(packet.Packet(packet.PONG, pkt.data))
|
||||
elif pkt.packet_type == packet.MESSAGE:
|
||||
await self.server._trigger_event(
|
||||
'message', self.sid, pkt.data,
|
||||
run_async=self.server.async_handlers)
|
||||
elif pkt.packet_type == packet.UPGRADE:
|
||||
await self.send(packet.Packet(packet.NOOP))
|
||||
elif pkt.packet_type == packet.CLOSE:
|
||||
await self.close(wait=False, abort=True)
|
||||
else:
|
||||
raise exceptions.UnknownPacketError()
|
||||
|
||||
async def check_ping_timeout(self):
|
||||
"""Make sure the client is still sending pings.
|
||||
|
||||
This helps detect disconnections for long-polling clients.
|
||||
"""
|
||||
if self.closed:
|
||||
raise exceptions.SocketIsClosedError()
|
||||
if time.time() - self.last_ping > self.server.ping_interval + \
|
||||
self.server.ping_interval_grace_period:
|
||||
self.server.logger.info('%s: Client is gone, closing socket',
|
||||
self.sid)
|
||||
# Passing abort=False here will cause close() to write a
|
||||
# CLOSE packet. This has the effect of updating half-open sockets
|
||||
# to their correct state of disconnected
|
||||
await self.close(wait=False, abort=False)
|
||||
return False
|
||||
return True
|
||||
|
||||
async def send(self, pkt):
|
||||
"""Send a packet to the client."""
|
||||
if not await self.check_ping_timeout():
|
||||
return
|
||||
if self.upgrading:
|
||||
self.packet_backlog.append(pkt)
|
||||
else:
|
||||
await self.queue.put(pkt)
|
||||
self.server.logger.info('%s: Sending packet %s data %s',
|
||||
self.sid, packet.packet_names[pkt.packet_type],
|
||||
pkt.data if not isinstance(pkt.data, bytes)
|
||||
else '<binary>')
|
||||
|
||||
async def handle_get_request(self, environ):
|
||||
"""Handle a long-polling GET request from the client."""
|
||||
connections = [
|
||||
s.strip()
|
||||
for s in environ.get('HTTP_CONNECTION', '').lower().split(',')]
|
||||
transport = environ.get('HTTP_UPGRADE', '').lower()
|
||||
if 'upgrade' in connections and transport in self.upgrade_protocols:
|
||||
self.server.logger.info('%s: Received request to upgrade to %s',
|
||||
self.sid, transport)
|
||||
return await getattr(self, '_upgrade_' + transport)(environ)
|
||||
try:
|
||||
packets = await self.poll()
|
||||
except exceptions.QueueEmpty:
|
||||
exc = sys.exc_info()
|
||||
await self.close(wait=False)
|
||||
six.reraise(*exc)
|
||||
return packets
|
||||
|
||||
async def handle_post_request(self, environ):
|
||||
"""Handle a long-polling POST request from the client."""
|
||||
length = int(environ.get('CONTENT_LENGTH', '0'))
|
||||
if length > self.server.max_http_buffer_size:
|
||||
raise exceptions.ContentTooLongError()
|
||||
else:
|
||||
body = await environ['wsgi.input'].read(length)
|
||||
p = payload.Payload(encoded_payload=body)
|
||||
for pkt in p.packets:
|
||||
await self.receive(pkt)
|
||||
|
||||
async def close(self, wait=True, abort=False):
|
||||
"""Close the socket connection."""
|
||||
if not self.closed and not self.closing:
|
||||
self.closing = True
|
||||
await self.server._trigger_event('disconnect', self.sid)
|
||||
if not abort:
|
||||
await self.send(packet.Packet(packet.CLOSE))
|
||||
self.closed = True
|
||||
if wait:
|
||||
await self.queue.join()
|
||||
|
||||
async def _upgrade_websocket(self, environ):
|
||||
"""Upgrade the connection from polling to websocket."""
|
||||
if self.upgraded:
|
||||
raise IOError('Socket has been upgraded already')
|
||||
if self.server._async['websocket'] is None:
|
||||
# the selected async mode does not support websocket
|
||||
return self.server._bad_request()
|
||||
ws = self.server._async['websocket'](self._websocket_handler)
|
||||
return await ws(environ)
|
||||
|
||||
async def _websocket_handler(self, ws):
|
||||
"""Engine.IO handler for websocket transport."""
|
||||
if self.connected:
|
||||
# the socket was already connected, so this is an upgrade
|
||||
self.upgrading = True # hold packet sends during the upgrade
|
||||
|
||||
try:
|
||||
pkt = await ws.wait()
|
||||
except IOError: # pragma: no cover
|
||||
return
|
||||
decoded_pkt = packet.Packet(encoded_packet=pkt)
|
||||
if decoded_pkt.packet_type != packet.PING or \
|
||||
decoded_pkt.data != 'probe':
|
||||
self.server.logger.info(
|
||||
'%s: Failed websocket upgrade, no PING packet', self.sid)
|
||||
return
|
||||
await ws.send(packet.Packet(
|
||||
packet.PONG,
|
||||
data=six.text_type('probe')).encode(always_bytes=False))
|
||||
await self.queue.put(packet.Packet(packet.NOOP)) # end poll
|
||||
|
||||
try:
|
||||
pkt = await ws.wait()
|
||||
except IOError: # pragma: no cover
|
||||
return
|
||||
decoded_pkt = packet.Packet(encoded_packet=pkt)
|
||||
if decoded_pkt.packet_type != packet.UPGRADE:
|
||||
self.upgraded = False
|
||||
self.server.logger.info(
|
||||
('%s: Failed websocket upgrade, expected UPGRADE packet, '
|
||||
'received %s instead.'),
|
||||
self.sid, pkt)
|
||||
return
|
||||
self.upgraded = True
|
||||
|
||||
# flush any packets that were sent during the upgrade
|
||||
for pkt in self.packet_backlog:
|
||||
await self.queue.put(pkt)
|
||||
self.packet_backlog = []
|
||||
self.upgrading = False
|
||||
else:
|
||||
self.connected = True
|
||||
self.upgraded = True
|
||||
|
||||
# start separate writer thread
|
||||
async def writer():
|
||||
while True:
|
||||
packets = None
|
||||
try:
|
||||
packets = await self.poll()
|
||||
except exceptions.QueueEmpty:
|
||||
break
|
||||
if not packets:
|
||||
# empty packet list returned -> connection closed
|
||||
break
|
||||
try:
|
||||
for pkt in packets:
|
||||
await ws.send(pkt.encode(always_bytes=False))
|
||||
except:
|
||||
break
|
||||
writer_task = asyncio.ensure_future(writer())
|
||||
|
||||
self.server.logger.info(
|
||||
'%s: Upgrade to websocket successful', self.sid)
|
||||
|
||||
while True:
|
||||
p = None
|
||||
wait_task = asyncio.ensure_future(ws.wait())
|
||||
try:
|
||||
p = await asyncio.wait_for(wait_task, self.server.ping_timeout)
|
||||
except asyncio.CancelledError: # pragma: no cover
|
||||
# there is a bug (https://bugs.python.org/issue30508) in
|
||||
# asyncio that causes a "Task exception never retrieved" error
|
||||
# to appear when wait_task raises an exception before it gets
|
||||
# cancelled. Calling wait_task.exception() prevents the error
|
||||
# from being issued in Python 3.6, but causes other errors in
|
||||
# other versions, so we run it with all errors suppressed and
|
||||
# hope for the best.
|
||||
try:
|
||||
wait_task.exception()
|
||||
except:
|
||||
pass
|
||||
break
|
||||
except:
|
||||
break
|
||||
if p is None:
|
||||
# connection closed by client
|
||||
break
|
||||
if isinstance(p, six.text_type): # pragma: no cover
|
||||
p = p.encode('utf-8')
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
try:
|
||||
await self.receive(pkt)
|
||||
except exceptions.UnknownPacketError: # pragma: no cover
|
||||
pass
|
||||
except exceptions.SocketIsClosedError: # pragma: no cover
|
||||
self.server.logger.info('Receive error -- socket is closed')
|
||||
break
|
||||
except: # pragma: no cover
|
||||
# if we get an unexpected exception we log the error and exit
|
||||
# the connection properly
|
||||
self.server.logger.exception('Unknown receive error')
|
||||
|
||||
await self.queue.put(None) # unlock the writer task so it can exit
|
||||
await asyncio.wait_for(writer_task, timeout=None)
|
||||
await self.close(wait=False, abort=True)
|
||||
@@ -1,680 +0,0 @@
|
||||
import logging
|
||||
try:
|
||||
import queue
|
||||
except ImportError: # pragma: no cover
|
||||
import Queue as queue
|
||||
import signal
|
||||
import ssl
|
||||
import threading
|
||||
import time
|
||||
|
||||
import six
|
||||
from six.moves import urllib
|
||||
try:
|
||||
import requests
|
||||
except ImportError: # pragma: no cover
|
||||
requests = None
|
||||
try:
|
||||
import websocket
|
||||
except ImportError: # pragma: no cover
|
||||
websocket = None
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import payload
|
||||
|
||||
default_logger = logging.getLogger('engineio.client')
|
||||
connected_clients = []
|
||||
|
||||
if six.PY2: # pragma: no cover
|
||||
ConnectionError = OSError
|
||||
|
||||
|
||||
def signal_handler(sig, frame):
|
||||
"""SIGINT handler.
|
||||
|
||||
Disconnect all active clients and then invoke the original signal handler.
|
||||
"""
|
||||
for client in connected_clients[:]:
|
||||
if client.is_asyncio_based():
|
||||
client.start_background_task(client.disconnect, abort=True)
|
||||
else:
|
||||
client.disconnect(abort=True)
|
||||
if callable(original_signal_handler):
|
||||
return original_signal_handler(sig, frame)
|
||||
else: # pragma: no cover
|
||||
# Handle case where no original SIGINT handler was present.
|
||||
return signal.default_int_handler(sig, frame)
|
||||
|
||||
|
||||
original_signal_handler = None
|
||||
|
||||
|
||||
class Client(object):
|
||||
"""An Engine.IO client.
|
||||
|
||||
This class implements a fully compliant Engine.IO web client with support
|
||||
for websocket and long-polling transports.
|
||||
|
||||
:param logger: To enable logging set to ``True`` or pass a logger object to
|
||||
use. To disable logging set to ``False``. The default is
|
||||
``False``.
|
||||
:param json: An alternative json module to use for encoding and decoding
|
||||
packets. Custom json modules must have ``dumps`` and ``loads``
|
||||
functions that are compatible with the standard library
|
||||
versions.
|
||||
:param request_timeout: A timeout in seconds for requests. The default is
|
||||
5 seconds.
|
||||
:param ssl_verify: ``True`` to verify SSL certificates, or ``False`` to
|
||||
skip SSL certificate verification, allowing
|
||||
connections to servers with self signed certificates.
|
||||
The default is ``True``.
|
||||
"""
|
||||
event_names = ['connect', 'disconnect', 'message']
|
||||
|
||||
def __init__(self,
|
||||
logger=False,
|
||||
json=None,
|
||||
request_timeout=5,
|
||||
ssl_verify=True):
|
||||
global original_signal_handler
|
||||
if original_signal_handler is None:
|
||||
original_signal_handler = signal.signal(signal.SIGINT,
|
||||
signal_handler)
|
||||
self.handlers = {}
|
||||
self.base_url = None
|
||||
self.transports = None
|
||||
self.current_transport = None
|
||||
self.sid = None
|
||||
self.upgrades = None
|
||||
self.ping_interval = None
|
||||
self.ping_timeout = None
|
||||
self.pong_received = True
|
||||
self.http = None
|
||||
self.ws = None
|
||||
self.read_loop_task = None
|
||||
self.write_loop_task = None
|
||||
self.ping_loop_task = None
|
||||
self.ping_loop_event = None
|
||||
self.queue = None
|
||||
self.state = 'disconnected'
|
||||
self.ssl_verify = ssl_verify
|
||||
|
||||
if json is not None:
|
||||
packet.Packet.json = json
|
||||
if not isinstance(logger, bool):
|
||||
self.logger = logger
|
||||
else:
|
||||
self.logger = default_logger
|
||||
if not logging.root.handlers and \
|
||||
self.logger.level == logging.NOTSET:
|
||||
if logger:
|
||||
self.logger.setLevel(logging.INFO)
|
||||
else:
|
||||
self.logger.setLevel(logging.ERROR)
|
||||
self.logger.addHandler(logging.StreamHandler())
|
||||
|
||||
self.request_timeout = request_timeout
|
||||
|
||||
def is_asyncio_based(self):
|
||||
return False
|
||||
|
||||
def on(self, event, handler=None):
|
||||
"""Register an event handler.
|
||||
|
||||
:param event: The event name. Can be ``'connect'``, ``'message'`` or
|
||||
``'disconnect'``.
|
||||
:param handler: The function that should be invoked to handle the
|
||||
event. When this parameter is not given, the method
|
||||
acts as a decorator for the handler function.
|
||||
|
||||
Example usage::
|
||||
|
||||
# as a decorator:
|
||||
@eio.on('connect')
|
||||
def connect_handler():
|
||||
print('Connection request')
|
||||
|
||||
# as a method:
|
||||
def message_handler(msg):
|
||||
print('Received message: ', msg)
|
||||
eio.send('response')
|
||||
eio.on('message', message_handler)
|
||||
"""
|
||||
if event not in self.event_names:
|
||||
raise ValueError('Invalid event')
|
||||
|
||||
def set_handler(handler):
|
||||
self.handlers[event] = handler
|
||||
return handler
|
||||
|
||||
if handler is None:
|
||||
return set_handler
|
||||
set_handler(handler)
|
||||
|
||||
def connect(self, url, headers={}, transports=None,
|
||||
engineio_path='engine.io'):
|
||||
"""Connect to an Engine.IO server.
|
||||
|
||||
:param url: The URL of the Engine.IO server. It can include custom
|
||||
query string parameters if required by the server.
|
||||
:param headers: A dictionary with custom headers to send with the
|
||||
connection request.
|
||||
:param transports: The list of allowed transports. Valid transports
|
||||
are ``'polling'`` and ``'websocket'``. If not
|
||||
given, the polling transport is connected first,
|
||||
then an upgrade to websocket is attempted.
|
||||
:param engineio_path: The endpoint where the Engine.IO server is
|
||||
installed. The default value is appropriate for
|
||||
most cases.
|
||||
|
||||
Example usage::
|
||||
|
||||
eio = engineio.Client()
|
||||
eio.connect('http://localhost:5000')
|
||||
"""
|
||||
if self.state != 'disconnected':
|
||||
raise ValueError('Client is not in a disconnected state')
|
||||
valid_transports = ['polling', 'websocket']
|
||||
if transports is not None:
|
||||
if isinstance(transports, six.string_types):
|
||||
transports = [transports]
|
||||
transports = [transport for transport in transports
|
||||
if transport in valid_transports]
|
||||
if not transports:
|
||||
raise ValueError('No valid transports provided')
|
||||
self.transports = transports or valid_transports
|
||||
self.queue = self.create_queue()
|
||||
return getattr(self, '_connect_' + self.transports[0])(
|
||||
url, headers, engineio_path)
|
||||
|
||||
def wait(self):
|
||||
"""Wait until the connection with the server ends.
|
||||
|
||||
Client applications can use this function to block the main thread
|
||||
during the life of the connection.
|
||||
"""
|
||||
if self.read_loop_task:
|
||||
self.read_loop_task.join()
|
||||
|
||||
def send(self, data, binary=None):
|
||||
"""Send a message to a client.
|
||||
|
||||
:param data: The data to send to the client. Data can be of type
|
||||
``str``, ``bytes``, ``list`` or ``dict``. If a ``list``
|
||||
or ``dict``, the data will be serialized as JSON.
|
||||
:param binary: ``True`` to send packet as binary, ``False`` to send
|
||||
as text. If not given, unicode (Python 2) and str
|
||||
(Python 3) are sent as text, and str (Python 2) and
|
||||
bytes (Python 3) are sent as binary.
|
||||
"""
|
||||
self._send_packet(packet.Packet(packet.MESSAGE, data=data,
|
||||
binary=binary))
|
||||
|
||||
def disconnect(self, abort=False):
|
||||
"""Disconnect from the server.
|
||||
|
||||
:param abort: If set to ``True``, do not wait for background tasks
|
||||
associated with the connection to end.
|
||||
"""
|
||||
if self.state == 'connected':
|
||||
self._send_packet(packet.Packet(packet.CLOSE))
|
||||
self.queue.put(None)
|
||||
self.state = 'disconnecting'
|
||||
self._trigger_event('disconnect', run_async=False)
|
||||
if self.current_transport == 'websocket':
|
||||
self.ws.close()
|
||||
if not abort:
|
||||
self.read_loop_task.join()
|
||||
self.state = 'disconnected'
|
||||
try:
|
||||
connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
|
||||
def transport(self):
|
||||
"""Return the name of the transport currently in use.
|
||||
|
||||
The possible values returned by this function are ``'polling'`` and
|
||||
``'websocket'``.
|
||||
"""
|
||||
return self.current_transport
|
||||
|
||||
def start_background_task(self, target, *args, **kwargs):
|
||||
"""Start a background task.
|
||||
|
||||
This is a utility function that applications can use to start a
|
||||
background task.
|
||||
|
||||
:param target: the target function to execute.
|
||||
:param args: arguments to pass to the function.
|
||||
:param kwargs: keyword arguments to pass to the function.
|
||||
|
||||
This function returns an object compatible with the `Thread` class in
|
||||
the Python standard library. The `start()` method on this object is
|
||||
already called by this function.
|
||||
"""
|
||||
th = threading.Thread(target=target, args=args, kwargs=kwargs)
|
||||
th.start()
|
||||
return th
|
||||
|
||||
def sleep(self, seconds=0):
|
||||
"""Sleep for the requested amount of time."""
|
||||
return time.sleep(seconds)
|
||||
|
||||
def create_queue(self, *args, **kwargs):
|
||||
"""Create a queue object."""
|
||||
q = queue.Queue(*args, **kwargs)
|
||||
q.Empty = queue.Empty
|
||||
return q
|
||||
|
||||
def create_event(self, *args, **kwargs):
|
||||
"""Create an event object."""
|
||||
return threading.Event(*args, **kwargs)
|
||||
|
||||
def _reset(self):
|
||||
self.state = 'disconnected'
|
||||
self.sid = None
|
||||
|
||||
def _connect_polling(self, url, headers, engineio_path):
|
||||
"""Establish a long-polling connection to the Engine.IO server."""
|
||||
if requests is None: # pragma: no cover
|
||||
# not installed
|
||||
self.logger.error('requests package is not installed -- cannot '
|
||||
'send HTTP requests!')
|
||||
return
|
||||
self.base_url = self._get_engineio_url(url, engineio_path, 'polling')
|
||||
self.logger.info('Attempting polling connection to ' + self.base_url)
|
||||
r = self._send_request(
|
||||
'GET', self.base_url + self._get_url_timestamp(), headers=headers,
|
||||
timeout=self.request_timeout)
|
||||
if r is None:
|
||||
self._reset()
|
||||
raise exceptions.ConnectionError(
|
||||
'Connection refused by the server')
|
||||
if r.status_code < 200 or r.status_code >= 300:
|
||||
raise exceptions.ConnectionError(
|
||||
'Unexpected status code {} in server response'.format(
|
||||
r.status_code))
|
||||
try:
|
||||
p = payload.Payload(encoded_payload=r.content)
|
||||
except ValueError:
|
||||
six.raise_from(exceptions.ConnectionError(
|
||||
'Unexpected response from server'), None)
|
||||
open_packet = p.packets[0]
|
||||
if open_packet.packet_type != packet.OPEN:
|
||||
raise exceptions.ConnectionError(
|
||||
'OPEN packet not returned by server')
|
||||
self.logger.info(
|
||||
'Polling connection accepted with ' + str(open_packet.data))
|
||||
self.sid = open_packet.data['sid']
|
||||
self.upgrades = open_packet.data['upgrades']
|
||||
self.ping_interval = open_packet.data['pingInterval'] / 1000.0
|
||||
self.ping_timeout = open_packet.data['pingTimeout'] / 1000.0
|
||||
self.current_transport = 'polling'
|
||||
self.base_url += '&sid=' + self.sid
|
||||
|
||||
self.state = 'connected'
|
||||
connected_clients.append(self)
|
||||
self._trigger_event('connect', run_async=False)
|
||||
|
||||
for pkt in p.packets[1:]:
|
||||
self._receive_packet(pkt)
|
||||
|
||||
if 'websocket' in self.upgrades and 'websocket' in self.transports:
|
||||
# attempt to upgrade to websocket
|
||||
if self._connect_websocket(url, headers, engineio_path):
|
||||
# upgrade to websocket succeeded, we're done here
|
||||
return
|
||||
|
||||
# start background tasks associated with this client
|
||||
self.ping_loop_task = self.start_background_task(self._ping_loop)
|
||||
self.write_loop_task = self.start_background_task(self._write_loop)
|
||||
self.read_loop_task = self.start_background_task(
|
||||
self._read_loop_polling)
|
||||
|
||||
def _connect_websocket(self, url, headers, engineio_path):
|
||||
"""Establish or upgrade to a WebSocket connection with the server."""
|
||||
if websocket is None: # pragma: no cover
|
||||
# not installed
|
||||
self.logger.warning('websocket-client package not installed, only '
|
||||
'polling transport is available')
|
||||
return False
|
||||
websocket_url = self._get_engineio_url(url, engineio_path, 'websocket')
|
||||
if self.sid:
|
||||
self.logger.info(
|
||||
'Attempting WebSocket upgrade to ' + websocket_url)
|
||||
upgrade = True
|
||||
websocket_url += '&sid=' + self.sid
|
||||
else:
|
||||
upgrade = False
|
||||
self.base_url = websocket_url
|
||||
self.logger.info(
|
||||
'Attempting WebSocket connection to ' + websocket_url)
|
||||
|
||||
# get the cookies from the long-polling connection so that they can
|
||||
# also be sent the the WebSocket route
|
||||
cookies = None
|
||||
if self.http:
|
||||
cookies = '; '.join(["{}={}".format(cookie.name, cookie.value)
|
||||
for cookie in self.http.cookies])
|
||||
|
||||
try:
|
||||
if not self.ssl_verify:
|
||||
ws = websocket.create_connection(
|
||||
websocket_url + self._get_url_timestamp(), header=headers,
|
||||
cookie=cookies, sslopt={"cert_reqs": ssl.CERT_NONE})
|
||||
else:
|
||||
ws = websocket.create_connection(
|
||||
websocket_url + self._get_url_timestamp(), header=headers,
|
||||
cookie=cookies)
|
||||
except (ConnectionError, IOError, websocket.WebSocketException):
|
||||
if upgrade:
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: connection error')
|
||||
return False
|
||||
else:
|
||||
raise exceptions.ConnectionError('Connection error')
|
||||
if upgrade:
|
||||
p = packet.Packet(packet.PING,
|
||||
data=six.text_type('probe')).encode()
|
||||
try:
|
||||
ws.send(p)
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected send exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
try:
|
||||
p = ws.recv()
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected recv exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
if pkt.packet_type != packet.PONG or pkt.data != 'probe':
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: no PONG packet')
|
||||
return False
|
||||
p = packet.Packet(packet.UPGRADE).encode()
|
||||
try:
|
||||
ws.send(p)
|
||||
except Exception as e: # pragma: no cover
|
||||
self.logger.warning(
|
||||
'WebSocket upgrade failed: unexpected send exception: %s',
|
||||
str(e))
|
||||
return False
|
||||
self.current_transport = 'websocket'
|
||||
self.logger.info('WebSocket upgrade was successful')
|
||||
else:
|
||||
try:
|
||||
p = ws.recv()
|
||||
except Exception as e: # pragma: no cover
|
||||
raise exceptions.ConnectionError(
|
||||
'Unexpected recv exception: ' + str(e))
|
||||
open_packet = packet.Packet(encoded_packet=p)
|
||||
if open_packet.packet_type != packet.OPEN:
|
||||
raise exceptions.ConnectionError('no OPEN packet')
|
||||
self.logger.info(
|
||||
'WebSocket connection accepted with ' + str(open_packet.data))
|
||||
self.sid = open_packet.data['sid']
|
||||
self.upgrades = open_packet.data['upgrades']
|
||||
self.ping_interval = open_packet.data['pingInterval'] / 1000.0
|
||||
self.ping_timeout = open_packet.data['pingTimeout'] / 1000.0
|
||||
self.current_transport = 'websocket'
|
||||
|
||||
self.state = 'connected'
|
||||
connected_clients.append(self)
|
||||
self._trigger_event('connect', run_async=False)
|
||||
self.ws = ws
|
||||
|
||||
# start background tasks associated with this client
|
||||
self.ping_loop_task = self.start_background_task(self._ping_loop)
|
||||
self.write_loop_task = self.start_background_task(self._write_loop)
|
||||
self.read_loop_task = self.start_background_task(
|
||||
self._read_loop_websocket)
|
||||
return True
|
||||
|
||||
def _receive_packet(self, pkt):
|
||||
"""Handle incoming packets from the server."""
|
||||
packet_name = packet.packet_names[pkt.packet_type] \
|
||||
if pkt.packet_type < len(packet.packet_names) else 'UNKNOWN'
|
||||
self.logger.info(
|
||||
'Received packet %s data %s', packet_name,
|
||||
pkt.data if not isinstance(pkt.data, bytes) else '<binary>')
|
||||
if pkt.packet_type == packet.MESSAGE:
|
||||
self._trigger_event('message', pkt.data, run_async=True)
|
||||
elif pkt.packet_type == packet.PONG:
|
||||
self.pong_received = True
|
||||
elif pkt.packet_type == packet.CLOSE:
|
||||
self.disconnect(abort=True)
|
||||
elif pkt.packet_type == packet.NOOP:
|
||||
pass
|
||||
else:
|
||||
self.logger.error('Received unexpected packet of type %s',
|
||||
pkt.packet_type)
|
||||
|
||||
def _send_packet(self, pkt):
|
||||
"""Queue a packet to be sent to the server."""
|
||||
if self.state != 'connected':
|
||||
return
|
||||
self.queue.put(pkt)
|
||||
self.logger.info(
|
||||
'Sending packet %s data %s',
|
||||
packet.packet_names[pkt.packet_type],
|
||||
pkt.data if not isinstance(pkt.data, bytes) else '<binary>')
|
||||
|
||||
def _send_request(
|
||||
self, method, url, headers=None, body=None,
|
||||
timeout=None): # pragma: no cover
|
||||
if self.http is None:
|
||||
self.http = requests.Session()
|
||||
try:
|
||||
return self.http.request(method, url, headers=headers, data=body,
|
||||
timeout=timeout, verify=self.ssl_verify)
|
||||
except requests.exceptions.RequestException as exc:
|
||||
self.logger.info('HTTP %s request to %s failed with error %s.',
|
||||
method, url, exc)
|
||||
|
||||
def _trigger_event(self, event, *args, **kwargs):
|
||||
"""Invoke an event handler."""
|
||||
run_async = kwargs.pop('run_async', False)
|
||||
if event in self.handlers:
|
||||
if run_async:
|
||||
return self.start_background_task(self.handlers[event], *args)
|
||||
else:
|
||||
try:
|
||||
return self.handlers[event](*args)
|
||||
except:
|
||||
self.logger.exception(event + ' handler error')
|
||||
|
||||
def _get_engineio_url(self, url, engineio_path, transport):
|
||||
"""Generate the Engine.IO connection URL."""
|
||||
engineio_path = engineio_path.strip('/')
|
||||
parsed_url = urllib.parse.urlparse(url)
|
||||
|
||||
if transport == 'polling':
|
||||
scheme = 'http'
|
||||
elif transport == 'websocket':
|
||||
scheme = 'ws'
|
||||
else: # pragma: no cover
|
||||
raise ValueError('invalid transport')
|
||||
if parsed_url.scheme in ['https', 'wss']:
|
||||
scheme += 's'
|
||||
|
||||
return ('{scheme}://{netloc}/{path}/?{query}'
|
||||
'{sep}transport={transport}&EIO=3').format(
|
||||
scheme=scheme, netloc=parsed_url.netloc,
|
||||
path=engineio_path, query=parsed_url.query,
|
||||
sep='&' if parsed_url.query else '',
|
||||
transport=transport)
|
||||
|
||||
def _get_url_timestamp(self):
|
||||
"""Generate the Engine.IO query string timestamp."""
|
||||
return '&t=' + str(time.time())
|
||||
|
||||
def _ping_loop(self):
|
||||
"""This background task sends a PING to the server at the requested
|
||||
interval.
|
||||
"""
|
||||
self.pong_received = True
|
||||
if self.ping_loop_event is None:
|
||||
self.ping_loop_event = self.create_event()
|
||||
else:
|
||||
self.ping_loop_event.clear()
|
||||
while self.state == 'connected':
|
||||
if not self.pong_received:
|
||||
self.logger.info(
|
||||
'PONG response has not been received, aborting')
|
||||
if self.ws:
|
||||
self.ws.close(timeout=0)
|
||||
self.queue.put(None)
|
||||
break
|
||||
self.pong_received = False
|
||||
self._send_packet(packet.Packet(packet.PING))
|
||||
self.ping_loop_event.wait(timeout=self.ping_interval)
|
||||
self.logger.info('Exiting ping task')
|
||||
|
||||
def _read_loop_polling(self):
|
||||
"""Read packets by polling the Engine.IO server."""
|
||||
while self.state == 'connected':
|
||||
self.logger.info(
|
||||
'Sending polling GET request to ' + self.base_url)
|
||||
r = self._send_request(
|
||||
'GET', self.base_url + self._get_url_timestamp(),
|
||||
timeout=max(self.ping_interval, self.ping_timeout) + 5)
|
||||
if r is None:
|
||||
self.logger.warning(
|
||||
'Connection refused by the server, aborting')
|
||||
self.queue.put(None)
|
||||
break
|
||||
if r.status_code < 200 or r.status_code >= 300:
|
||||
self.logger.warning('Unexpected status code %s in server '
|
||||
'response, aborting', r.status_code)
|
||||
self.queue.put(None)
|
||||
break
|
||||
try:
|
||||
p = payload.Payload(encoded_payload=r.content)
|
||||
except ValueError:
|
||||
self.logger.warning(
|
||||
'Unexpected packet from server, aborting')
|
||||
self.queue.put(None)
|
||||
break
|
||||
for pkt in p.packets:
|
||||
self._receive_packet(pkt)
|
||||
|
||||
self.logger.info('Waiting for write loop task to end')
|
||||
self.write_loop_task.join()
|
||||
self.logger.info('Waiting for ping loop task to end')
|
||||
if self.ping_loop_event: # pragma: no cover
|
||||
self.ping_loop_event.set()
|
||||
self.ping_loop_task.join()
|
||||
if self.state == 'connected':
|
||||
self._trigger_event('disconnect', run_async=False)
|
||||
try:
|
||||
connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
self.logger.info('Exiting read loop task')
|
||||
|
||||
def _read_loop_websocket(self):
|
||||
"""Read packets from the Engine.IO WebSocket connection."""
|
||||
while self.state == 'connected':
|
||||
p = None
|
||||
try:
|
||||
p = self.ws.recv()
|
||||
except websocket.WebSocketConnectionClosedException:
|
||||
self.logger.warning(
|
||||
'WebSocket connection was closed, aborting')
|
||||
self.queue.put(None)
|
||||
break
|
||||
except Exception as e:
|
||||
self.logger.info(
|
||||
'Unexpected error "%s", aborting', str(e))
|
||||
self.queue.put(None)
|
||||
break
|
||||
if isinstance(p, six.text_type): # pragma: no cover
|
||||
p = p.encode('utf-8')
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
self._receive_packet(pkt)
|
||||
|
||||
self.logger.info('Waiting for write loop task to end')
|
||||
self.write_loop_task.join()
|
||||
self.logger.info('Waiting for ping loop task to end')
|
||||
if self.ping_loop_event: # pragma: no cover
|
||||
self.ping_loop_event.set()
|
||||
self.ping_loop_task.join()
|
||||
if self.state == 'connected':
|
||||
self._trigger_event('disconnect', run_async=False)
|
||||
try:
|
||||
connected_clients.remove(self)
|
||||
except ValueError: # pragma: no cover
|
||||
pass
|
||||
self._reset()
|
||||
self.logger.info('Exiting read loop task')
|
||||
|
||||
def _write_loop(self):
|
||||
"""This background task sends packages to the server as they are
|
||||
pushed to the send queue.
|
||||
"""
|
||||
while self.state == 'connected':
|
||||
# to simplify the timeout handling, use the maximum of the
|
||||
# ping interval and ping timeout as timeout, with an extra 5
|
||||
# seconds grace period
|
||||
timeout = max(self.ping_interval, self.ping_timeout) + 5
|
||||
packets = None
|
||||
try:
|
||||
packets = [self.queue.get(timeout=timeout)]
|
||||
except self.queue.Empty:
|
||||
self.logger.error('packet queue is empty, aborting')
|
||||
break
|
||||
if packets == [None]:
|
||||
self.queue.task_done()
|
||||
packets = []
|
||||
else:
|
||||
while True:
|
||||
try:
|
||||
packets.append(self.queue.get(block=False))
|
||||
except self.queue.Empty:
|
||||
break
|
||||
if packets[-1] is None:
|
||||
packets = packets[:-1]
|
||||
self.queue.task_done()
|
||||
break
|
||||
if not packets:
|
||||
# empty packet list returned -> connection closed
|
||||
break
|
||||
if self.current_transport == 'polling':
|
||||
p = payload.Payload(packets=packets)
|
||||
r = self._send_request(
|
||||
'POST', self.base_url, body=p.encode(),
|
||||
headers={'Content-Type': 'application/octet-stream'},
|
||||
timeout=self.request_timeout)
|
||||
for pkt in packets:
|
||||
self.queue.task_done()
|
||||
if r is None:
|
||||
self.logger.warning(
|
||||
'Connection refused by the server, aborting')
|
||||
break
|
||||
if r.status_code < 200 or r.status_code >= 300:
|
||||
self.logger.warning('Unexpected status code %s in server '
|
||||
'response, aborting', r.status_code)
|
||||
self._reset()
|
||||
break
|
||||
else:
|
||||
# websocket
|
||||
try:
|
||||
for pkt in packets:
|
||||
encoded_packet = pkt.encode(always_bytes=False)
|
||||
if pkt.binary:
|
||||
self.ws.send_binary(encoded_packet)
|
||||
else:
|
||||
self.ws.send(encoded_packet)
|
||||
self.queue.task_done()
|
||||
except websocket.WebSocketConnectionClosedException:
|
||||
self.logger.warning(
|
||||
'WebSocket connection was closed, aborting')
|
||||
break
|
||||
self.logger.info('Exiting write loop task')
|
||||
@@ -1,22 +0,0 @@
|
||||
class EngineIOError(Exception):
|
||||
pass
|
||||
|
||||
|
||||
class ContentTooLongError(EngineIOError):
|
||||
pass
|
||||
|
||||
|
||||
class UnknownPacketError(EngineIOError):
|
||||
pass
|
||||
|
||||
|
||||
class QueueEmpty(EngineIOError):
|
||||
pass
|
||||
|
||||
|
||||
class SocketIsClosedError(EngineIOError):
|
||||
pass
|
||||
|
||||
|
||||
class ConnectionError(EngineIOError):
|
||||
pass
|
||||
@@ -1,87 +0,0 @@
|
||||
import os
|
||||
from engineio.static_files import get_static_file
|
||||
|
||||
|
||||
class WSGIApp(object):
|
||||
"""WSGI application middleware for Engine.IO.
|
||||
|
||||
This middleware dispatches traffic to an Engine.IO application. It can
|
||||
also serve a list of static files to the client, or forward unrelated
|
||||
HTTP traffic to another WSGI application.
|
||||
|
||||
:param engineio_app: The Engine.IO server. Must be an instance of the
|
||||
``engineio.Server`` class.
|
||||
:param wsgi_app: The WSGI app that receives all other traffic.
|
||||
:param static_files: A dictionary with static file mapping rules. See the
|
||||
documentation for details on this argument.
|
||||
:param engineio_path: The endpoint where the Engine.IO application should
|
||||
be installed. The default value is appropriate for
|
||||
most cases.
|
||||
|
||||
Example usage::
|
||||
|
||||
import engineio
|
||||
import eventlet
|
||||
|
||||
eio = engineio.Server()
|
||||
app = engineio.WSGIApp(eio, static_files={
|
||||
'/': {'content_type': 'text/html', 'filename': 'index.html'},
|
||||
'/index.html': {'content_type': 'text/html',
|
||||
'filename': 'index.html'},
|
||||
})
|
||||
eventlet.wsgi.server(eventlet.listen(('', 8000)), app)
|
||||
"""
|
||||
def __init__(self, engineio_app, wsgi_app=None, static_files=None,
|
||||
engineio_path='engine.io'):
|
||||
self.engineio_app = engineio_app
|
||||
self.wsgi_app = wsgi_app
|
||||
self.engineio_path = engineio_path.strip('/')
|
||||
self.static_files = static_files or {}
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
if 'gunicorn.socket' in environ:
|
||||
# gunicorn saves the socket under environ['gunicorn.socket'], while
|
||||
# eventlet saves it under environ['eventlet.input']. Eventlet also
|
||||
# stores the socket inside a wrapper class, while gunicon writes it
|
||||
# directly into the environment. To give eventlet's WebSocket
|
||||
# module access to this socket when running under gunicorn, here we
|
||||
# copy the socket to the eventlet format.
|
||||
class Input(object):
|
||||
def __init__(self, socket):
|
||||
self.socket = socket
|
||||
|
||||
def get_socket(self):
|
||||
return self.socket
|
||||
|
||||
environ['eventlet.input'] = Input(environ['gunicorn.socket'])
|
||||
path = environ['PATH_INFO']
|
||||
if path is not None and \
|
||||
path.startswith('/{0}/'.format(self.engineio_path)):
|
||||
return self.engineio_app.handle_request(environ, start_response)
|
||||
else:
|
||||
static_file = get_static_file(path, self.static_files) \
|
||||
if self.static_files else None
|
||||
if static_file:
|
||||
if os.path.exists(static_file['filename']):
|
||||
start_response(
|
||||
'200 OK',
|
||||
[('Content-Type', static_file['content_type'])])
|
||||
with open(static_file['filename'], 'rb') as f:
|
||||
return [f.read()]
|
||||
else:
|
||||
return self.not_found(start_response)
|
||||
elif self.wsgi_app is not None:
|
||||
return self.wsgi_app(environ, start_response)
|
||||
return self.not_found(start_response)
|
||||
|
||||
def not_found(self, start_response):
|
||||
start_response("404 Not Found", [('Content-Type', 'text/plain')])
|
||||
return [b'Not Found']
|
||||
|
||||
|
||||
class Middleware(WSGIApp):
|
||||
"""This class has been renamed to ``WSGIApp`` and is now deprecated."""
|
||||
def __init__(self, engineio_app, wsgi_app=None,
|
||||
engineio_path='engine.io'):
|
||||
super(Middleware, self).__init__(engineio_app, wsgi_app,
|
||||
engineio_path=engineio_path)
|
||||
@@ -1,92 +0,0 @@
|
||||
import base64
|
||||
import json as _json
|
||||
|
||||
import six
|
||||
|
||||
(OPEN, CLOSE, PING, PONG, MESSAGE, UPGRADE, NOOP) = (0, 1, 2, 3, 4, 5, 6)
|
||||
packet_names = ['OPEN', 'CLOSE', 'PING', 'PONG', 'MESSAGE', 'UPGRADE', 'NOOP']
|
||||
|
||||
binary_types = (six.binary_type, bytearray)
|
||||
|
||||
|
||||
class Packet(object):
|
||||
"""Engine.IO packet."""
|
||||
|
||||
json = _json
|
||||
|
||||
def __init__(self, packet_type=NOOP, data=None, binary=None,
|
||||
encoded_packet=None):
|
||||
self.packet_type = packet_type
|
||||
self.data = data
|
||||
if binary is not None:
|
||||
self.binary = binary
|
||||
elif isinstance(data, six.text_type):
|
||||
self.binary = False
|
||||
elif isinstance(data, binary_types):
|
||||
self.binary = True
|
||||
else:
|
||||
self.binary = False
|
||||
if encoded_packet:
|
||||
self.decode(encoded_packet)
|
||||
|
||||
def encode(self, b64=False, always_bytes=True):
|
||||
"""Encode the packet for transmission."""
|
||||
if self.binary and not b64:
|
||||
encoded_packet = six.int2byte(self.packet_type)
|
||||
else:
|
||||
encoded_packet = six.text_type(self.packet_type)
|
||||
if self.binary and b64:
|
||||
encoded_packet = 'b' + encoded_packet
|
||||
if self.binary:
|
||||
if b64:
|
||||
encoded_packet += base64.b64encode(self.data).decode('utf-8')
|
||||
else:
|
||||
encoded_packet += self.data
|
||||
elif isinstance(self.data, six.string_types):
|
||||
encoded_packet += self.data
|
||||
elif isinstance(self.data, dict) or isinstance(self.data, list):
|
||||
encoded_packet += self.json.dumps(self.data,
|
||||
separators=(',', ':'))
|
||||
elif self.data is not None:
|
||||
encoded_packet += str(self.data)
|
||||
if always_bytes and not isinstance(encoded_packet, binary_types):
|
||||
encoded_packet = encoded_packet.encode('utf-8')
|
||||
return encoded_packet
|
||||
|
||||
def decode(self, encoded_packet):
|
||||
"""Decode a transmitted package."""
|
||||
b64 = False
|
||||
if not isinstance(encoded_packet, binary_types):
|
||||
encoded_packet = encoded_packet.encode('utf-8')
|
||||
elif not isinstance(encoded_packet, bytes):
|
||||
encoded_packet = bytes(encoded_packet)
|
||||
self.packet_type = six.byte2int(encoded_packet[0:1])
|
||||
if self.packet_type == 98: # 'b' --> binary base64 encoded packet
|
||||
self.binary = True
|
||||
encoded_packet = encoded_packet[1:]
|
||||
self.packet_type = six.byte2int(encoded_packet[0:1])
|
||||
self.packet_type -= 48
|
||||
b64 = True
|
||||
elif self.packet_type >= 48:
|
||||
self.packet_type -= 48
|
||||
self.binary = False
|
||||
else:
|
||||
self.binary = True
|
||||
self.data = None
|
||||
if len(encoded_packet) > 1:
|
||||
if self.binary:
|
||||
if b64:
|
||||
self.data = base64.b64decode(encoded_packet[1:])
|
||||
else:
|
||||
self.data = encoded_packet[1:]
|
||||
else:
|
||||
try:
|
||||
self.data = self.json.loads(
|
||||
encoded_packet[1:].decode('utf-8'))
|
||||
if isinstance(self.data, int):
|
||||
# do not allow integer payloads, see
|
||||
# github.com/miguelgrinberg/python-engineio/issues/75
|
||||
# for background on this decision
|
||||
raise ValueError
|
||||
except ValueError:
|
||||
self.data = encoded_packet[1:].decode('utf-8')
|
||||
@@ -1,81 +0,0 @@
|
||||
import six
|
||||
|
||||
from . import packet
|
||||
|
||||
from six.moves import urllib
|
||||
|
||||
|
||||
class Payload(object):
|
||||
"""Engine.IO payload."""
|
||||
max_decode_packets = 16
|
||||
|
||||
def __init__(self, packets=None, encoded_payload=None):
|
||||
self.packets = packets or []
|
||||
if encoded_payload is not None:
|
||||
self.decode(encoded_payload)
|
||||
|
||||
def encode(self, b64=False, jsonp_index=None):
|
||||
"""Encode the payload for transmission."""
|
||||
encoded_payload = b''
|
||||
for pkt in self.packets:
|
||||
encoded_packet = pkt.encode(b64=b64)
|
||||
packet_len = len(encoded_packet)
|
||||
if b64:
|
||||
encoded_payload += str(packet_len).encode('utf-8') + b':' + \
|
||||
encoded_packet
|
||||
else:
|
||||
binary_len = b''
|
||||
while packet_len != 0:
|
||||
binary_len = six.int2byte(packet_len % 10) + binary_len
|
||||
packet_len = int(packet_len / 10)
|
||||
if not pkt.binary:
|
||||
encoded_payload += b'\0'
|
||||
else:
|
||||
encoded_payload += b'\1'
|
||||
encoded_payload += binary_len + b'\xff' + encoded_packet
|
||||
if jsonp_index is not None:
|
||||
encoded_payload = b'___eio[' + \
|
||||
str(jsonp_index).encode() + \
|
||||
b']("' + \
|
||||
encoded_payload.replace(b'"', b'\\"') + \
|
||||
b'");'
|
||||
return encoded_payload
|
||||
|
||||
def decode(self, encoded_payload):
|
||||
"""Decode a transmitted payload."""
|
||||
self.packets = []
|
||||
|
||||
if len(encoded_payload) == 0:
|
||||
return
|
||||
|
||||
# JSONP POST payload starts with 'd='
|
||||
if encoded_payload.startswith(b'd='):
|
||||
encoded_payload = urllib.parse.parse_qs(
|
||||
encoded_payload)[b'd'][0]
|
||||
|
||||
i = 0
|
||||
if six.byte2int(encoded_payload[0:1]) <= 1:
|
||||
# binary encoding
|
||||
while i < len(encoded_payload):
|
||||
if len(self.packets) >= self.max_decode_packets:
|
||||
raise ValueError('Too many packets in payload')
|
||||
packet_len = 0
|
||||
i += 1
|
||||
while six.byte2int(encoded_payload[i:i + 1]) != 255:
|
||||
packet_len = packet_len * 10 + six.byte2int(
|
||||
encoded_payload[i:i + 1])
|
||||
i += 1
|
||||
self.packets.append(packet.Packet(
|
||||
encoded_packet=encoded_payload[i + 1:i + 1 + packet_len]))
|
||||
i += packet_len + 1
|
||||
else:
|
||||
# assume text encoding
|
||||
encoded_payload = encoded_payload.decode('utf-8')
|
||||
while i < len(encoded_payload):
|
||||
if len(self.packets) >= self.max_decode_packets:
|
||||
raise ValueError('Too many packets in payload')
|
||||
j = encoded_payload.find(':', i)
|
||||
packet_len = int(encoded_payload[i:j])
|
||||
pkt = encoded_payload[j + 1:j + 1 + packet_len]
|
||||
self.packets.append(packet.Packet(encoded_packet=pkt))
|
||||
i = j + 1 + packet_len
|
||||
@@ -1,675 +0,0 @@
|
||||
import gzip
|
||||
import importlib
|
||||
import logging
|
||||
import uuid
|
||||
import zlib
|
||||
|
||||
import six
|
||||
from six.moves import urllib
|
||||
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import payload
|
||||
from . import socket
|
||||
|
||||
default_logger = logging.getLogger('engineio.server')
|
||||
|
||||
|
||||
class Server(object):
|
||||
"""An Engine.IO server.
|
||||
|
||||
This class implements a fully compliant Engine.IO web server with support
|
||||
for websocket and long-polling transports.
|
||||
|
||||
:param async_mode: The asynchronous model to use. See the Deployment
|
||||
section in the documentation for a description of the
|
||||
available options. Valid async modes are "threading",
|
||||
"eventlet", "gevent" and "gevent_uwsgi". If this
|
||||
argument is not given, "eventlet" is tried first, then
|
||||
"gevent_uwsgi", then "gevent", and finally "threading".
|
||||
The first async mode that has all its dependencies
|
||||
installed is the one that is chosen.
|
||||
:param ping_timeout: The time in seconds that the client waits for the
|
||||
server to respond before disconnecting. The default
|
||||
is 60 seconds.
|
||||
:param ping_interval: The interval in seconds at which the client pings
|
||||
the server. The default is 25 seconds. For advanced
|
||||
control, a two element tuple can be given, where
|
||||
the first number is the ping interval and the second
|
||||
is a grace period added by the server. The default
|
||||
grace period is 5 seconds.
|
||||
:param max_http_buffer_size: The maximum size of a message when using the
|
||||
polling transport. The default is 100,000,000
|
||||
bytes.
|
||||
:param allow_upgrades: Whether to allow transport upgrades or not. The
|
||||
default is ``True``.
|
||||
:param http_compression: Whether to compress packages when using the
|
||||
polling transport. The default is ``True``.
|
||||
:param compression_threshold: Only compress messages when their byte size
|
||||
is greater than this value. The default is
|
||||
1024 bytes.
|
||||
:param cookie: Name of the HTTP cookie that contains the client session
|
||||
id. If set to ``None``, a cookie is not sent to the client.
|
||||
The default is ``'io'``.
|
||||
:param cors_allowed_origins: Origin or list of origins that are allowed to
|
||||
connect to this server. Only the same origin
|
||||
is allowed by default. Set this argument to
|
||||
``'*'`` to allow all origins, or to ``[]`` to
|
||||
disable CORS handling.
|
||||
:param cors_credentials: Whether credentials (cookies, authentication) are
|
||||
allowed in requests to this server. The default
|
||||
is ``True``.
|
||||
:param logger: To enable logging set to ``True`` or pass a logger object to
|
||||
use. To disable logging set to ``False``. The default is
|
||||
``False``.
|
||||
:param json: An alternative json module to use for encoding and decoding
|
||||
packets. Custom json modules must have ``dumps`` and ``loads``
|
||||
functions that are compatible with the standard library
|
||||
versions.
|
||||
:param async_handlers: If set to ``True``, run message event handlers in
|
||||
non-blocking threads. To run handlers synchronously,
|
||||
set to ``False``. The default is ``True``.
|
||||
:param monitor_clients: If set to ``True``, a background task will ensure
|
||||
inactive clients are closed. Set to ``False`` to
|
||||
disable the monitoring task (not recommended). The
|
||||
default is ``True``.
|
||||
:param kwargs: Reserved for future extensions, any additional parameters
|
||||
given as keyword arguments will be silently ignored.
|
||||
"""
|
||||
compression_methods = ['gzip', 'deflate']
|
||||
event_names = ['connect', 'disconnect', 'message']
|
||||
_default_monitor_clients = True
|
||||
|
||||
def __init__(self, async_mode=None, ping_timeout=60, ping_interval=25,
|
||||
max_http_buffer_size=100000000, allow_upgrades=True,
|
||||
http_compression=True, compression_threshold=1024,
|
||||
cookie='io', cors_allowed_origins=None,
|
||||
cors_credentials=True, logger=False, json=None,
|
||||
async_handlers=True, monitor_clients=None, **kwargs):
|
||||
self.ping_timeout = ping_timeout
|
||||
if isinstance(ping_interval, tuple):
|
||||
self.ping_interval = ping_interval[0]
|
||||
self.ping_interval_grace_period = ping_interval[1]
|
||||
else:
|
||||
self.ping_interval = ping_interval
|
||||
self.ping_interval_grace_period = 5
|
||||
self.max_http_buffer_size = max_http_buffer_size
|
||||
self.allow_upgrades = allow_upgrades
|
||||
self.http_compression = http_compression
|
||||
self.compression_threshold = compression_threshold
|
||||
self.cookie = cookie
|
||||
self.cors_allowed_origins = cors_allowed_origins
|
||||
self.cors_credentials = cors_credentials
|
||||
self.async_handlers = async_handlers
|
||||
self.sockets = {}
|
||||
self.handlers = {}
|
||||
self.start_service_task = monitor_clients \
|
||||
if monitor_clients is not None else self._default_monitor_clients
|
||||
if json is not None:
|
||||
packet.Packet.json = json
|
||||
if not isinstance(logger, bool):
|
||||
self.logger = logger
|
||||
else:
|
||||
self.logger = default_logger
|
||||
if not logging.root.handlers and \
|
||||
self.logger.level == logging.NOTSET:
|
||||
if logger:
|
||||
self.logger.setLevel(logging.INFO)
|
||||
else:
|
||||
self.logger.setLevel(logging.ERROR)
|
||||
self.logger.addHandler(logging.StreamHandler())
|
||||
modes = self.async_modes()
|
||||
if async_mode is not None:
|
||||
modes = [async_mode] if async_mode in modes else []
|
||||
self._async = None
|
||||
self.async_mode = None
|
||||
for mode in modes:
|
||||
try:
|
||||
self._async = importlib.import_module(
|
||||
'engineio.async_drivers.' + mode)._async
|
||||
asyncio_based = self._async['asyncio'] \
|
||||
if 'asyncio' in self._async else False
|
||||
if asyncio_based != self.is_asyncio_based():
|
||||
continue # pragma: no cover
|
||||
self.async_mode = mode
|
||||
break
|
||||
except ImportError:
|
||||
pass
|
||||
if self.async_mode is None:
|
||||
raise ValueError('Invalid async_mode specified')
|
||||
if self.is_asyncio_based() and \
|
||||
('asyncio' not in self._async or not
|
||||
self._async['asyncio']): # pragma: no cover
|
||||
raise ValueError('The selected async_mode is not asyncio '
|
||||
'compatible')
|
||||
if not self.is_asyncio_based() and 'asyncio' in self._async and \
|
||||
self._async['asyncio']: # pragma: no cover
|
||||
raise ValueError('The selected async_mode requires asyncio and '
|
||||
'must use the AsyncServer class')
|
||||
self.logger.info('Server initialized for %s.', self.async_mode)
|
||||
|
||||
def is_asyncio_based(self):
|
||||
return False
|
||||
|
||||
def async_modes(self):
|
||||
return ['eventlet', 'gevent_uwsgi', 'gevent', 'threading']
|
||||
|
||||
def on(self, event, handler=None):
|
||||
"""Register an event handler.
|
||||
|
||||
:param event: The event name. Can be ``'connect'``, ``'message'`` or
|
||||
``'disconnect'``.
|
||||
:param handler: The function that should be invoked to handle the
|
||||
event. When this parameter is not given, the method
|
||||
acts as a decorator for the handler function.
|
||||
|
||||
Example usage::
|
||||
|
||||
# as a decorator:
|
||||
@eio.on('connect')
|
||||
def connect_handler(sid, environ):
|
||||
print('Connection request')
|
||||
if environ['REMOTE_ADDR'] in blacklisted:
|
||||
return False # reject
|
||||
|
||||
# as a method:
|
||||
def message_handler(sid, msg):
|
||||
print('Received message: ', msg)
|
||||
eio.send(sid, 'response')
|
||||
eio.on('message', message_handler)
|
||||
|
||||
The handler function receives the ``sid`` (session ID) for the
|
||||
client as first argument. The ``'connect'`` event handler receives the
|
||||
WSGI environment as a second argument, and can return ``False`` to
|
||||
reject the connection. The ``'message'`` handler receives the message
|
||||
payload as a second argument. The ``'disconnect'`` handler does not
|
||||
take a second argument.
|
||||
"""
|
||||
if event not in self.event_names:
|
||||
raise ValueError('Invalid event')
|
||||
|
||||
def set_handler(handler):
|
||||
self.handlers[event] = handler
|
||||
return handler
|
||||
|
||||
if handler is None:
|
||||
return set_handler
|
||||
set_handler(handler)
|
||||
|
||||
def send(self, sid, data, binary=None):
|
||||
"""Send a message to a client.
|
||||
|
||||
:param sid: The session id of the recipient client.
|
||||
:param data: The data to send to the client. Data can be of type
|
||||
``str``, ``bytes``, ``list`` or ``dict``. If a ``list``
|
||||
or ``dict``, the data will be serialized as JSON.
|
||||
:param binary: ``True`` to send packet as binary, ``False`` to send
|
||||
as text. If not given, unicode (Python 2) and str
|
||||
(Python 3) are sent as text, and str (Python 2) and
|
||||
bytes (Python 3) are sent as binary.
|
||||
"""
|
||||
try:
|
||||
socket = self._get_socket(sid)
|
||||
except KeyError:
|
||||
# the socket is not available
|
||||
self.logger.warning('Cannot send to sid %s', sid)
|
||||
return
|
||||
socket.send(packet.Packet(packet.MESSAGE, data=data, binary=binary))
|
||||
|
||||
def get_session(self, sid):
|
||||
"""Return the user session for a client.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
|
||||
The return value is a dictionary. Modifications made to this
|
||||
dictionary are not guaranteed to be preserved unless
|
||||
``save_session()`` is called, or when the ``session`` context manager
|
||||
is used.
|
||||
"""
|
||||
socket = self._get_socket(sid)
|
||||
return socket.session
|
||||
|
||||
def save_session(self, sid, session):
|
||||
"""Store the user session for a client.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
:param session: The session dictionary.
|
||||
"""
|
||||
socket = self._get_socket(sid)
|
||||
socket.session = session
|
||||
|
||||
def session(self, sid):
|
||||
"""Return the user session for a client with context manager syntax.
|
||||
|
||||
:param sid: The session id of the client.
|
||||
|
||||
This is a context manager that returns the user session dictionary for
|
||||
the client. Any changes that are made to this dictionary inside the
|
||||
context manager block are saved back to the session. Example usage::
|
||||
|
||||
@eio.on('connect')
|
||||
def on_connect(sid, environ):
|
||||
username = authenticate_user(environ)
|
||||
if not username:
|
||||
return False
|
||||
with eio.session(sid) as session:
|
||||
session['username'] = username
|
||||
|
||||
@eio.on('message')
|
||||
def on_message(sid, msg):
|
||||
with eio.session(sid) as session:
|
||||
print('received message from ', session['username'])
|
||||
"""
|
||||
class _session_context_manager(object):
|
||||
def __init__(self, server, sid):
|
||||
self.server = server
|
||||
self.sid = sid
|
||||
self.session = None
|
||||
|
||||
def __enter__(self):
|
||||
self.session = self.server.get_session(sid)
|
||||
return self.session
|
||||
|
||||
def __exit__(self, *args):
|
||||
self.server.save_session(sid, self.session)
|
||||
|
||||
return _session_context_manager(self, sid)
|
||||
|
||||
def disconnect(self, sid=None):
|
||||
"""Disconnect a client.
|
||||
|
||||
:param sid: The session id of the client to close. If this parameter
|
||||
is not given, then all clients are closed.
|
||||
"""
|
||||
if sid is not None:
|
||||
try:
|
||||
socket = self._get_socket(sid)
|
||||
except KeyError: # pragma: no cover
|
||||
# the socket was already closed or gone
|
||||
pass
|
||||
else:
|
||||
socket.close()
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
del self.sockets[sid]
|
||||
else:
|
||||
for client in six.itervalues(self.sockets):
|
||||
client.close()
|
||||
self.sockets = {}
|
||||
|
||||
def transport(self, sid):
|
||||
"""Return the name of the transport used by the client.
|
||||
|
||||
The two possible values returned by this function are ``'polling'``
|
||||
and ``'websocket'``.
|
||||
|
||||
:param sid: The session of the client.
|
||||
"""
|
||||
return 'websocket' if self._get_socket(sid).upgraded else 'polling'
|
||||
|
||||
def handle_request(self, environ, start_response):
|
||||
"""Handle an HTTP request from the client.
|
||||
|
||||
This is the entry point of the Engine.IO application, using the same
|
||||
interface as a WSGI application. For the typical usage, this function
|
||||
is invoked by the :class:`Middleware` instance, but it can be invoked
|
||||
directly when the middleware is not used.
|
||||
|
||||
:param environ: The WSGI environment.
|
||||
:param start_response: The WSGI ``start_response`` function.
|
||||
|
||||
This function returns the HTTP response body to deliver to the client
|
||||
as a byte sequence.
|
||||
"""
|
||||
if self.cors_allowed_origins != []:
|
||||
# Validate the origin header if present
|
||||
# This is important for WebSocket more than for HTTP, since
|
||||
# browsers only apply CORS controls to HTTP.
|
||||
origin = environ.get('HTTP_ORIGIN')
|
||||
if origin:
|
||||
allowed_origins = self._cors_allowed_origins(environ)
|
||||
if allowed_origins is not None and origin not in \
|
||||
allowed_origins:
|
||||
self.logger.info(origin + ' is not an accepted origin.')
|
||||
r = self._bad_request()
|
||||
start_response(r['status'], r['headers'])
|
||||
return [r['response']]
|
||||
|
||||
method = environ['REQUEST_METHOD']
|
||||
query = urllib.parse.parse_qs(environ.get('QUERY_STRING', ''))
|
||||
|
||||
sid = query['sid'][0] if 'sid' in query else None
|
||||
b64 = False
|
||||
jsonp = False
|
||||
jsonp_index = None
|
||||
|
||||
if 'b64' in query:
|
||||
if query['b64'][0] == "1" or query['b64'][0].lower() == "true":
|
||||
b64 = True
|
||||
if 'j' in query:
|
||||
jsonp = True
|
||||
try:
|
||||
jsonp_index = int(query['j'][0])
|
||||
except (ValueError, KeyError, IndexError):
|
||||
# Invalid JSONP index number
|
||||
pass
|
||||
|
||||
if jsonp and jsonp_index is None:
|
||||
self.logger.warning('Invalid JSONP index number')
|
||||
r = self._bad_request()
|
||||
elif method == 'GET':
|
||||
if sid is None:
|
||||
transport = query.get('transport', ['polling'])[0]
|
||||
if transport != 'polling' and transport != 'websocket':
|
||||
self.logger.warning('Invalid transport %s', transport)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
r = self._handle_connect(environ, start_response,
|
||||
transport, b64, jsonp_index)
|
||||
else:
|
||||
if sid not in self.sockets:
|
||||
self.logger.warning('Invalid session %s', sid)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
socket = self._get_socket(sid)
|
||||
try:
|
||||
packets = socket.handle_get_request(
|
||||
environ, start_response)
|
||||
if isinstance(packets, list):
|
||||
r = self._ok(packets, b64=b64,
|
||||
jsonp_index=jsonp_index)
|
||||
else:
|
||||
r = packets
|
||||
except exceptions.EngineIOError:
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
self.disconnect(sid)
|
||||
r = self._bad_request()
|
||||
if sid in self.sockets and self.sockets[sid].closed:
|
||||
del self.sockets[sid]
|
||||
elif method == 'POST':
|
||||
if sid is None or sid not in self.sockets:
|
||||
self.logger.warning('Invalid session %s', sid)
|
||||
r = self._bad_request()
|
||||
else:
|
||||
socket = self._get_socket(sid)
|
||||
try:
|
||||
socket.handle_post_request(environ)
|
||||
r = self._ok(jsonp_index=jsonp_index)
|
||||
except exceptions.EngineIOError:
|
||||
if sid in self.sockets: # pragma: no cover
|
||||
self.disconnect(sid)
|
||||
r = self._bad_request()
|
||||
except: # pragma: no cover
|
||||
# for any other unexpected errors, we log the error
|
||||
# and keep going
|
||||
self.logger.exception('post request handler error')
|
||||
r = self._ok(jsonp_index=jsonp_index)
|
||||
elif method == 'OPTIONS':
|
||||
r = self._ok()
|
||||
else:
|
||||
self.logger.warning('Method %s not supported', method)
|
||||
r = self._method_not_found()
|
||||
|
||||
if not isinstance(r, dict):
|
||||
return r or []
|
||||
if self.http_compression and \
|
||||
len(r['response']) >= self.compression_threshold:
|
||||
encodings = [e.split(';')[0].strip() for e in
|
||||
environ.get('HTTP_ACCEPT_ENCODING', '').split(',')]
|
||||
for encoding in encodings:
|
||||
if encoding in self.compression_methods:
|
||||
r['response'] = \
|
||||
getattr(self, '_' + encoding)(r['response'])
|
||||
r['headers'] += [('Content-Encoding', encoding)]
|
||||
break
|
||||
cors_headers = self._cors_headers(environ)
|
||||
start_response(r['status'], r['headers'] + cors_headers)
|
||||
return [r['response']]
|
||||
|
||||
def start_background_task(self, target, *args, **kwargs):
|
||||
"""Start a background task using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to start a
|
||||
background task using the method that is compatible with the
|
||||
selected async mode.
|
||||
|
||||
:param target: the target function to execute.
|
||||
:param args: arguments to pass to the function.
|
||||
:param kwargs: keyword arguments to pass to the function.
|
||||
|
||||
This function returns an object compatible with the `Thread` class in
|
||||
the Python standard library. The `start()` method on this object is
|
||||
already called by this function.
|
||||
"""
|
||||
th = self._async['thread'](target=target, args=args, kwargs=kwargs)
|
||||
th.start()
|
||||
return th # pragma: no cover
|
||||
|
||||
def sleep(self, seconds=0):
|
||||
"""Sleep for the requested amount of time using the appropriate async
|
||||
model.
|
||||
|
||||
This is a utility function that applications can use to put a task to
|
||||
sleep without having to worry about using the correct call for the
|
||||
selected async mode.
|
||||
"""
|
||||
return self._async['sleep'](seconds)
|
||||
|
||||
def create_queue(self, *args, **kwargs):
|
||||
"""Create a queue object using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to create a queue
|
||||
without having to worry about using the correct call for the selected
|
||||
async mode.
|
||||
"""
|
||||
return self._async['queue'](*args, **kwargs)
|
||||
|
||||
def get_queue_empty_exception(self):
|
||||
"""Return the queue empty exception for the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to work with a
|
||||
queue without having to worry about using the correct call for the
|
||||
selected async mode.
|
||||
"""
|
||||
return self._async['queue_empty']
|
||||
|
||||
def create_event(self, *args, **kwargs):
|
||||
"""Create an event object using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to create an
|
||||
event without having to worry about using the correct call for the
|
||||
selected async mode.
|
||||
"""
|
||||
return self._async['event'](*args, **kwargs)
|
||||
|
||||
def _generate_id(self):
|
||||
"""Generate a unique session id."""
|
||||
return uuid.uuid4().hex
|
||||
|
||||
def _handle_connect(self, environ, start_response, transport, b64=False,
|
||||
jsonp_index=None):
|
||||
"""Handle a client connection request."""
|
||||
if self.start_service_task:
|
||||
# start the service task to monitor connected clients
|
||||
self.start_service_task = False
|
||||
self.start_background_task(self._service_task)
|
||||
|
||||
sid = self._generate_id()
|
||||
s = socket.Socket(self, sid)
|
||||
self.sockets[sid] = s
|
||||
|
||||
pkt = packet.Packet(
|
||||
packet.OPEN, {'sid': sid,
|
||||
'upgrades': self._upgrades(sid, transport),
|
||||
'pingTimeout': int(self.ping_timeout * 1000),
|
||||
'pingInterval': int(self.ping_interval * 1000)})
|
||||
s.send(pkt)
|
||||
|
||||
ret = self._trigger_event('connect', sid, environ, run_async=False)
|
||||
if ret is False:
|
||||
del self.sockets[sid]
|
||||
self.logger.warning('Application rejected connection')
|
||||
return self._unauthorized()
|
||||
|
||||
if transport == 'websocket':
|
||||
ret = s.handle_get_request(environ, start_response)
|
||||
if s.closed:
|
||||
# websocket connection ended, so we are done
|
||||
del self.sockets[sid]
|
||||
return ret
|
||||
else:
|
||||
s.connected = True
|
||||
headers = None
|
||||
if self.cookie:
|
||||
headers = [('Set-Cookie', self.cookie + '=' + sid)]
|
||||
try:
|
||||
return self._ok(s.poll(), headers=headers, b64=b64,
|
||||
jsonp_index=jsonp_index)
|
||||
except exceptions.QueueEmpty:
|
||||
return self._bad_request()
|
||||
|
||||
def _upgrades(self, sid, transport):
|
||||
"""Return the list of possible upgrades for a client connection."""
|
||||
if not self.allow_upgrades or self._get_socket(sid).upgraded or \
|
||||
self._async['websocket'] is None or transport == 'websocket':
|
||||
return []
|
||||
return ['websocket']
|
||||
|
||||
def _trigger_event(self, event, *args, **kwargs):
|
||||
"""Invoke an event handler."""
|
||||
run_async = kwargs.pop('run_async', False)
|
||||
if event in self.handlers:
|
||||
if run_async:
|
||||
return self.start_background_task(self.handlers[event], *args)
|
||||
else:
|
||||
try:
|
||||
return self.handlers[event](*args)
|
||||
except:
|
||||
self.logger.exception(event + ' handler error')
|
||||
if event == 'connect':
|
||||
# if connect handler raised error we reject the
|
||||
# connection
|
||||
return False
|
||||
|
||||
def _get_socket(self, sid):
|
||||
"""Return the socket object for a given session."""
|
||||
try:
|
||||
s = self.sockets[sid]
|
||||
except KeyError:
|
||||
raise KeyError('Session not found')
|
||||
if s.closed:
|
||||
del self.sockets[sid]
|
||||
raise KeyError('Session is disconnected')
|
||||
return s
|
||||
|
||||
def _ok(self, packets=None, headers=None, b64=False, jsonp_index=None):
|
||||
"""Generate a successful HTTP response."""
|
||||
if packets is not None:
|
||||
if headers is None:
|
||||
headers = []
|
||||
if b64:
|
||||
headers += [('Content-Type', 'text/plain; charset=UTF-8')]
|
||||
else:
|
||||
headers += [('Content-Type', 'application/octet-stream')]
|
||||
return {'status': '200 OK',
|
||||
'headers': headers,
|
||||
'response': payload.Payload(packets=packets).encode(
|
||||
b64=b64, jsonp_index=jsonp_index)}
|
||||
else:
|
||||
return {'status': '200 OK',
|
||||
'headers': [('Content-Type', 'text/plain')],
|
||||
'response': b'OK'}
|
||||
|
||||
def _bad_request(self):
|
||||
"""Generate a bad request HTTP error response."""
|
||||
return {'status': '400 BAD REQUEST',
|
||||
'headers': [('Content-Type', 'text/plain')],
|
||||
'response': b'Bad Request'}
|
||||
|
||||
def _method_not_found(self):
|
||||
"""Generate a method not found HTTP error response."""
|
||||
return {'status': '405 METHOD NOT FOUND',
|
||||
'headers': [('Content-Type', 'text/plain')],
|
||||
'response': b'Method Not Found'}
|
||||
|
||||
def _unauthorized(self):
|
||||
"""Generate a unauthorized HTTP error response."""
|
||||
return {'status': '401 UNAUTHORIZED',
|
||||
'headers': [('Content-Type', 'text/plain')],
|
||||
'response': b'Unauthorized'}
|
||||
|
||||
def _cors_allowed_origins(self, environ):
|
||||
default_origins = []
|
||||
if 'wsgi.url_scheme' in environ and 'HTTP_HOST' in environ:
|
||||
default_origins.append('{scheme}://{host}'.format(
|
||||
scheme=environ['wsgi.url_scheme'], host=environ['HTTP_HOST']))
|
||||
if 'HTTP_X_FORWARDED_HOST' in environ:
|
||||
scheme = environ.get(
|
||||
'HTTP_X_FORWARDED_PROTO',
|
||||
environ['wsgi.url_scheme']).split(',')[0].strip()
|
||||
default_origins.append('{scheme}://{host}'.format(
|
||||
scheme=scheme, host=environ['HTTP_X_FORWARDED_HOST'].split(
|
||||
',')[0].strip()))
|
||||
if self.cors_allowed_origins is None:
|
||||
allowed_origins = default_origins
|
||||
elif self.cors_allowed_origins == '*':
|
||||
allowed_origins = None
|
||||
elif isinstance(self.cors_allowed_origins, six.string_types):
|
||||
allowed_origins = [self.cors_allowed_origins]
|
||||
else:
|
||||
allowed_origins = self.cors_allowed_origins
|
||||
return allowed_origins
|
||||
|
||||
def _cors_headers(self, environ):
|
||||
"""Return the cross-origin-resource-sharing headers."""
|
||||
if self.cors_allowed_origins == []:
|
||||
# special case, CORS handling is completely disabled
|
||||
return []
|
||||
headers = []
|
||||
allowed_origins = self._cors_allowed_origins(environ)
|
||||
if 'HTTP_ORIGIN' in environ and \
|
||||
(allowed_origins is None or environ['HTTP_ORIGIN'] in
|
||||
allowed_origins):
|
||||
headers = [('Access-Control-Allow-Origin', environ['HTTP_ORIGIN'])]
|
||||
if environ['REQUEST_METHOD'] == 'OPTIONS':
|
||||
headers += [('Access-Control-Allow-Methods', 'OPTIONS, GET, POST')]
|
||||
if 'HTTP_ACCESS_CONTROL_REQUEST_HEADERS' in environ:
|
||||
headers += [('Access-Control-Allow-Headers',
|
||||
environ['HTTP_ACCESS_CONTROL_REQUEST_HEADERS'])]
|
||||
if self.cors_credentials:
|
||||
headers += [('Access-Control-Allow-Credentials', 'true')]
|
||||
return headers
|
||||
|
||||
def _gzip(self, response):
|
||||
"""Apply gzip compression to a response."""
|
||||
bytesio = six.BytesIO()
|
||||
with gzip.GzipFile(fileobj=bytesio, mode='w') as gz:
|
||||
gz.write(response)
|
||||
return bytesio.getvalue()
|
||||
|
||||
def _deflate(self, response):
|
||||
"""Apply deflate compression to a response."""
|
||||
return zlib.compress(response)
|
||||
|
||||
def _service_task(self): # pragma: no cover
|
||||
"""Monitor connected clients and clean up those that time out."""
|
||||
while True:
|
||||
if len(self.sockets) == 0:
|
||||
# nothing to do
|
||||
self.sleep(self.ping_timeout)
|
||||
continue
|
||||
|
||||
# go through the entire client list in a ping interval cycle
|
||||
sleep_interval = float(self.ping_timeout) / len(self.sockets)
|
||||
|
||||
try:
|
||||
# iterate over the current clients
|
||||
for s in self.sockets.copy().values():
|
||||
if not s.closing and not s.closed:
|
||||
s.check_ping_timeout()
|
||||
self.sleep(sleep_interval)
|
||||
except (SystemExit, KeyboardInterrupt):
|
||||
self.logger.info('service task canceled')
|
||||
break
|
||||
except:
|
||||
# an unexpected exception has occurred, log it and continue
|
||||
self.logger.exception('service task exception')
|
||||
@@ -1,248 +0,0 @@
|
||||
import six
|
||||
import sys
|
||||
import time
|
||||
|
||||
from . import exceptions
|
||||
from . import packet
|
||||
from . import payload
|
||||
|
||||
|
||||
class Socket(object):
|
||||
"""An Engine.IO socket."""
|
||||
upgrade_protocols = ['websocket']
|
||||
|
||||
def __init__(self, server, sid):
|
||||
self.server = server
|
||||
self.sid = sid
|
||||
self.queue = self.server.create_queue()
|
||||
self.last_ping = time.time()
|
||||
self.connected = False
|
||||
self.upgrading = False
|
||||
self.upgraded = False
|
||||
self.packet_backlog = []
|
||||
self.closing = False
|
||||
self.closed = False
|
||||
self.session = {}
|
||||
|
||||
def poll(self):
|
||||
"""Wait for packets to send to the client."""
|
||||
queue_empty = self.server.get_queue_empty_exception()
|
||||
try:
|
||||
packets = [self.queue.get(timeout=self.server.ping_timeout)]
|
||||
self.queue.task_done()
|
||||
except queue_empty:
|
||||
raise exceptions.QueueEmpty()
|
||||
if packets == [None]:
|
||||
return []
|
||||
while True:
|
||||
try:
|
||||
packets.append(self.queue.get(block=False))
|
||||
self.queue.task_done()
|
||||
except queue_empty:
|
||||
break
|
||||
return packets
|
||||
|
||||
def receive(self, pkt):
|
||||
"""Receive packet from the client."""
|
||||
packet_name = packet.packet_names[pkt.packet_type] \
|
||||
if pkt.packet_type < len(packet.packet_names) else 'UNKNOWN'
|
||||
self.server.logger.info('%s: Received packet %s data %s',
|
||||
self.sid, packet_name,
|
||||
pkt.data if not isinstance(pkt.data, bytes)
|
||||
else '<binary>')
|
||||
if pkt.packet_type == packet.PING:
|
||||
self.last_ping = time.time()
|
||||
self.send(packet.Packet(packet.PONG, pkt.data))
|
||||
elif pkt.packet_type == packet.MESSAGE:
|
||||
self.server._trigger_event('message', self.sid, pkt.data,
|
||||
run_async=self.server.async_handlers)
|
||||
elif pkt.packet_type == packet.UPGRADE:
|
||||
self.send(packet.Packet(packet.NOOP))
|
||||
elif pkt.packet_type == packet.CLOSE:
|
||||
self.close(wait=False, abort=True)
|
||||
else:
|
||||
raise exceptions.UnknownPacketError()
|
||||
|
||||
def check_ping_timeout(self):
|
||||
"""Make sure the client is still sending pings.
|
||||
|
||||
This helps detect disconnections for long-polling clients.
|
||||
"""
|
||||
if self.closed:
|
||||
raise exceptions.SocketIsClosedError()
|
||||
if time.time() - self.last_ping > self.server.ping_interval + \
|
||||
self.server.ping_interval_grace_period:
|
||||
self.server.logger.info('%s: Client is gone, closing socket',
|
||||
self.sid)
|
||||
# Passing abort=False here will cause close() to write a
|
||||
# CLOSE packet. This has the effect of updating half-open sockets
|
||||
# to their correct state of disconnected
|
||||
self.close(wait=False, abort=False)
|
||||
return False
|
||||
return True
|
||||
|
||||
def send(self, pkt):
|
||||
"""Send a packet to the client."""
|
||||
if not self.check_ping_timeout():
|
||||
return
|
||||
if self.upgrading:
|
||||
self.packet_backlog.append(pkt)
|
||||
else:
|
||||
self.queue.put(pkt)
|
||||
self.server.logger.info('%s: Sending packet %s data %s',
|
||||
self.sid, packet.packet_names[pkt.packet_type],
|
||||
pkt.data if not isinstance(pkt.data, bytes)
|
||||
else '<binary>')
|
||||
|
||||
def handle_get_request(self, environ, start_response):
|
||||
"""Handle a long-polling GET request from the client."""
|
||||
connections = [
|
||||
s.strip()
|
||||
for s in environ.get('HTTP_CONNECTION', '').lower().split(',')]
|
||||
transport = environ.get('HTTP_UPGRADE', '').lower()
|
||||
if 'upgrade' in connections and transport in self.upgrade_protocols:
|
||||
self.server.logger.info('%s: Received request to upgrade to %s',
|
||||
self.sid, transport)
|
||||
return getattr(self, '_upgrade_' + transport)(environ,
|
||||
start_response)
|
||||
try:
|
||||
packets = self.poll()
|
||||
except exceptions.QueueEmpty:
|
||||
exc = sys.exc_info()
|
||||
self.close(wait=False)
|
||||
six.reraise(*exc)
|
||||
return packets
|
||||
|
||||
def handle_post_request(self, environ):
|
||||
"""Handle a long-polling POST request from the client."""
|
||||
length = int(environ.get('CONTENT_LENGTH', '0'))
|
||||
if length > self.server.max_http_buffer_size:
|
||||
raise exceptions.ContentTooLongError()
|
||||
else:
|
||||
body = environ['wsgi.input'].read(length)
|
||||
p = payload.Payload(encoded_payload=body)
|
||||
for pkt in p.packets:
|
||||
self.receive(pkt)
|
||||
|
||||
def close(self, wait=True, abort=False):
|
||||
"""Close the socket connection."""
|
||||
if not self.closed and not self.closing:
|
||||
self.closing = True
|
||||
self.server._trigger_event('disconnect', self.sid, run_async=False)
|
||||
if not abort:
|
||||
self.send(packet.Packet(packet.CLOSE))
|
||||
self.closed = True
|
||||
self.queue.put(None)
|
||||
if wait:
|
||||
self.queue.join()
|
||||
|
||||
def _upgrade_websocket(self, environ, start_response):
|
||||
"""Upgrade the connection from polling to websocket."""
|
||||
if self.upgraded:
|
||||
raise IOError('Socket has been upgraded already')
|
||||
if self.server._async['websocket'] is None:
|
||||
# the selected async mode does not support websocket
|
||||
return self.server._bad_request()
|
||||
ws = self.server._async['websocket'](self._websocket_handler)
|
||||
return ws(environ, start_response)
|
||||
|
||||
def _websocket_handler(self, ws):
|
||||
"""Engine.IO handler for websocket transport."""
|
||||
# try to set a socket timeout matching the configured ping interval
|
||||
for attr in ['_sock', 'socket']: # pragma: no cover
|
||||
if hasattr(ws, attr) and hasattr(getattr(ws, attr), 'settimeout'):
|
||||
getattr(ws, attr).settimeout(self.server.ping_timeout)
|
||||
|
||||
if self.connected:
|
||||
# the socket was already connected, so this is an upgrade
|
||||
self.upgrading = True # hold packet sends during the upgrade
|
||||
|
||||
pkt = ws.wait()
|
||||
decoded_pkt = packet.Packet(encoded_packet=pkt)
|
||||
if decoded_pkt.packet_type != packet.PING or \
|
||||
decoded_pkt.data != 'probe':
|
||||
self.server.logger.info(
|
||||
'%s: Failed websocket upgrade, no PING packet', self.sid)
|
||||
return []
|
||||
ws.send(packet.Packet(
|
||||
packet.PONG,
|
||||
data=six.text_type('probe')).encode(always_bytes=False))
|
||||
self.queue.put(packet.Packet(packet.NOOP)) # end poll
|
||||
|
||||
pkt = ws.wait()
|
||||
decoded_pkt = packet.Packet(encoded_packet=pkt)
|
||||
if decoded_pkt.packet_type != packet.UPGRADE:
|
||||
self.upgraded = False
|
||||
self.server.logger.info(
|
||||
('%s: Failed websocket upgrade, expected UPGRADE packet, '
|
||||
'received %s instead.'),
|
||||
self.sid, pkt)
|
||||
return []
|
||||
self.upgraded = True
|
||||
|
||||
# flush any packets that were sent during the upgrade
|
||||
for pkt in self.packet_backlog:
|
||||
self.queue.put(pkt)
|
||||
self.packet_backlog = []
|
||||
self.upgrading = False
|
||||
else:
|
||||
self.connected = True
|
||||
self.upgraded = True
|
||||
|
||||
# start separate writer thread
|
||||
def writer():
|
||||
while True:
|
||||
packets = None
|
||||
try:
|
||||
packets = self.poll()
|
||||
except exceptions.QueueEmpty:
|
||||
break
|
||||
if not packets:
|
||||
# empty packet list returned -> connection closed
|
||||
break
|
||||
try:
|
||||
for pkt in packets:
|
||||
ws.send(pkt.encode(always_bytes=False))
|
||||
except:
|
||||
break
|
||||
writer_task = self.server.start_background_task(writer)
|
||||
|
||||
self.server.logger.info(
|
||||
'%s: Upgrade to websocket successful', self.sid)
|
||||
|
||||
while True:
|
||||
p = None
|
||||
try:
|
||||
p = ws.wait()
|
||||
except Exception as e:
|
||||
# if the socket is already closed, we can assume this is a
|
||||
# downstream error of that
|
||||
if not self.closed: # pragma: no cover
|
||||
self.server.logger.info(
|
||||
'%s: Unexpected error "%s", closing connection',
|
||||
self.sid, str(e))
|
||||
break
|
||||
if p is None:
|
||||
# connection closed by client
|
||||
break
|
||||
if isinstance(p, six.text_type): # pragma: no cover
|
||||
p = p.encode('utf-8')
|
||||
pkt = packet.Packet(encoded_packet=p)
|
||||
try:
|
||||
self.receive(pkt)
|
||||
except exceptions.UnknownPacketError: # pragma: no cover
|
||||
pass
|
||||
except exceptions.SocketIsClosedError: # pragma: no cover
|
||||
self.server.logger.info('Receive error -- socket is closed')
|
||||
break
|
||||
except: # pragma: no cover
|
||||
# if we get an unexpected exception we log the error and exit
|
||||
# the connection properly
|
||||
self.server.logger.exception('Unknown receive error')
|
||||
break
|
||||
|
||||
self.queue.put(None) # unlock the writer task so that it can exit
|
||||
writer_task.join()
|
||||
self.close(wait=False, abort=True)
|
||||
|
||||
return []
|
||||
@@ -1,55 +0,0 @@
|
||||
content_types = {
|
||||
'css': 'text/css',
|
||||
'gif': 'image/gif',
|
||||
'html': 'text/html',
|
||||
'jpg': 'image/jpeg',
|
||||
'js': 'application/javascript',
|
||||
'json': 'application/json',
|
||||
'png': 'image/png',
|
||||
'txt': 'text/plain',
|
||||
}
|
||||
|
||||
|
||||
def get_static_file(path, static_files):
|
||||
"""Return the local filename and content type for the requested static
|
||||
file URL.
|
||||
|
||||
:param path: the path portion of the requested URL.
|
||||
:param static_files: a static file configuration dictionary.
|
||||
|
||||
This function returns a dictionary with two keys, "filename" and
|
||||
"content_type". If the requested URL does not match any static file, the
|
||||
return value is None.
|
||||
"""
|
||||
if path in static_files:
|
||||
f = static_files[path]
|
||||
else:
|
||||
f = None
|
||||
rest = ''
|
||||
while path != '':
|
||||
path, last = path.rsplit('/', 1)
|
||||
rest = '/' + last + rest
|
||||
if path in static_files:
|
||||
f = static_files[path] + rest
|
||||
break
|
||||
elif path + '/' in static_files:
|
||||
f = static_files[path + '/'] + rest[1:]
|
||||
break
|
||||
if f:
|
||||
if isinstance(f, str):
|
||||
f = {'filename': f}
|
||||
if f['filename'].endswith('/'):
|
||||
if '' in static_files:
|
||||
if isinstance(static_files[''], str):
|
||||
f['filename'] += static_files['']
|
||||
else:
|
||||
f['filename'] += static_files['']['filename']
|
||||
if 'content_type' in static_files['']:
|
||||
f['content_type'] = static_files['']['content_type']
|
||||
else:
|
||||
f['filename'] += 'index.html'
|
||||
if 'content_type' not in f:
|
||||
ext = f['filename'].rsplit('.')[-1]
|
||||
f['content_type'] = content_types.get(
|
||||
ext, 'application/octet-stream')
|
||||
return f
|
||||
@@ -1,922 +0,0 @@
|
||||
from functools import wraps
|
||||
import os
|
||||
import sys
|
||||
|
||||
# make sure gevent-socketio is not installed, as it conflicts with
|
||||
# python-socketio
|
||||
gevent_socketio_found = True
|
||||
try:
|
||||
from socketio import socketio_manage
|
||||
except ImportError:
|
||||
gevent_socketio_found = False
|
||||
if gevent_socketio_found:
|
||||
print('The gevent-socketio package is incompatible with this version of '
|
||||
'the Flask-SocketIO extension. Please uninstall it, and then '
|
||||
'install the latest version of python-socketio in its place.')
|
||||
sys.exit(1)
|
||||
|
||||
import flask
|
||||
from flask import _request_ctx_stack, json as flask_json
|
||||
from flask.sessions import SessionMixin
|
||||
import socketio
|
||||
from socketio.exceptions import ConnectionRefusedError
|
||||
from werkzeug.debug import DebuggedApplication
|
||||
from werkzeug.serving import run_with_reloader
|
||||
|
||||
from .namespace import Namespace
|
||||
from .test_client import SocketIOTestClient
|
||||
|
||||
__version__ = '4.2.1'
|
||||
|
||||
|
||||
class _SocketIOMiddleware(socketio.WSGIApp):
|
||||
"""This WSGI middleware simply exposes the Flask application in the WSGI
|
||||
environment before executing the request.
|
||||
"""
|
||||
def __init__(self, socketio_app, flask_app, socketio_path='socket.io'):
|
||||
self.flask_app = flask_app
|
||||
super(_SocketIOMiddleware, self).__init__(socketio_app,
|
||||
flask_app.wsgi_app,
|
||||
socketio_path=socketio_path)
|
||||
|
||||
def __call__(self, environ, start_response):
|
||||
environ = environ.copy()
|
||||
environ['flask.app'] = self.flask_app
|
||||
return super(_SocketIOMiddleware, self).__call__(environ,
|
||||
start_response)
|
||||
|
||||
|
||||
class _ManagedSession(dict, SessionMixin):
|
||||
"""This class is used for user sessions that are managed by
|
||||
Flask-SocketIO. It is simple dict, expanded with the Flask session
|
||||
attributes."""
|
||||
pass
|
||||
|
||||
|
||||
class SocketIO(object):
|
||||
"""Create a Flask-SocketIO server.
|
||||
|
||||
:param app: The flask application instance. If the application instance
|
||||
isn't known at the time this class is instantiated, then call
|
||||
``socketio.init_app(app)`` once the application instance is
|
||||
available.
|
||||
:param manage_session: If set to ``True``, this extension manages the user
|
||||
session for Socket.IO events. If set to ``False``,
|
||||
Flask's own session management is used. When using
|
||||
Flask's cookie based sessions it is recommended that
|
||||
you leave this set to the default of ``True``. When
|
||||
using server-side sessions, a ``False`` setting
|
||||
enables sharing the user session between HTTP routes
|
||||
and Socket.IO events.
|
||||
:param message_queue: A connection URL for a message queue service the
|
||||
server can use for multi-process communication. A
|
||||
message queue is not required when using a single
|
||||
server process.
|
||||
:param channel: The channel name, when using a message queue. If a channel
|
||||
isn't specified, a default channel will be used. If
|
||||
multiple clusters of SocketIO processes need to use the
|
||||
same message queue without interfering with each other, then
|
||||
each cluster should use a different channel.
|
||||
:param path: The path where the Socket.IO server is exposed. Defaults to
|
||||
``'socket.io'``. Leave this as is unless you know what you are
|
||||
doing.
|
||||
:param resource: Alias to ``path``.
|
||||
:param kwargs: Socket.IO and Engine.IO server options.
|
||||
|
||||
The Socket.IO server options are detailed below:
|
||||
|
||||
:param client_manager: The client manager instance that will manage the
|
||||
client list. When this is omitted, the client list
|
||||
is stored in an in-memory structure, so the use of
|
||||
multiple connected servers is not possible. In most
|
||||
cases, this argument does not need to be set
|
||||
explicitly.
|
||||
:param logger: To enable logging set to ``True`` or pass a logger object to
|
||||
use. To disable logging set to ``False``. The default is
|
||||
``False``.
|
||||
:param binary: ``True`` to support binary payloads, ``False`` to treat all
|
||||
payloads as text. On Python 2, if this is set to ``True``,
|
||||
``unicode`` values are treated as text, and ``str`` and
|
||||
``bytes`` values are treated as binary. This option has no
|
||||
effect on Python 3, where text and binary payloads are
|
||||
always automatically discovered.
|
||||
:param json: An alternative json module to use for encoding and decoding
|
||||
packets. Custom json modules must have ``dumps`` and ``loads``
|
||||
functions that are compatible with the standard library
|
||||
versions. To use the same json encoder and decoder as a Flask
|
||||
application, use ``flask.json``.
|
||||
:param async_handlers: If set to ``True``, event handlers for a client are
|
||||
executed in separate threads. To run handlers for a
|
||||
client synchronously, set to ``False``. The default
|
||||
is ``True``.
|
||||
:param always_connect: When set to ``False``, new connections are
|
||||
provisory until the connect handler returns
|
||||
something other than ``False``, at which point they
|
||||
are accepted. When set to ``True``, connections are
|
||||
immediately accepted, and then if the connect
|
||||
handler returns ``False`` a disconnect is issued.
|
||||
Set to ``True`` if you need to emit events from the
|
||||
connect handler and your client is confused when it
|
||||
receives events before the connection acceptance.
|
||||
In any other case use the default of ``False``.
|
||||
|
||||
The Engine.IO server configuration supports the following settings:
|
||||
|
||||
:param async_mode: The asynchronous model to use. See the Deployment
|
||||
section in the documentation for a description of the
|
||||
available options. Valid async modes are
|
||||
``threading``, ``eventlet``, ``gevent`` and
|
||||
``gevent_uwsgi``. If this argument is not given,
|
||||
``eventlet`` is tried first, then ``gevent_uwsgi``,
|
||||
then ``gevent``, and finally ``threading``. The
|
||||
first async mode that has all its dependencies installed
|
||||
is then one that is chosen.
|
||||
:param ping_timeout: The time in seconds that the client waits for the
|
||||
server to respond before disconnecting. The default is
|
||||
60 seconds.
|
||||
:param ping_interval: The interval in seconds at which the client pings
|
||||
the server. The default is 25 seconds.
|
||||
:param max_http_buffer_size: The maximum size of a message when using the
|
||||
polling transport. The default is 100,000,000
|
||||
bytes.
|
||||
:param allow_upgrades: Whether to allow transport upgrades or not. The
|
||||
default is ``True``.
|
||||
:param http_compression: Whether to compress packages when using the
|
||||
polling transport. The default is ``True``.
|
||||
:param compression_threshold: Only compress messages when their byte size
|
||||
is greater than this value. The default is
|
||||
1024 bytes.
|
||||
:param cookie: Name of the HTTP cookie that contains the client session
|
||||
id. If set to ``None``, a cookie is not sent to the client.
|
||||
The default is ``'io'``.
|
||||
:param cors_allowed_origins: Origin or list of origins that are allowed to
|
||||
connect to this server. Only the same origin
|
||||
is allowed by default. Set this argument to
|
||||
``'*'`` to allow all origins, or to ``[]`` to
|
||||
disable CORS handling.
|
||||
:param cors_credentials: Whether credentials (cookies, authentication) are
|
||||
allowed in requests to this server. The default is
|
||||
``True``.
|
||||
:param monitor_clients: If set to ``True``, a background task will ensure
|
||||
inactive clients are closed. Set to ``False`` to
|
||||
disable the monitoring task (not recommended). The
|
||||
default is ``True``.
|
||||
:param engineio_logger: To enable Engine.IO logging set to ``True`` or pass
|
||||
a logger object to use. To disable logging set to
|
||||
``False``. The default is ``False``.
|
||||
"""
|
||||
|
||||
def __init__(self, app=None, **kwargs):
|
||||
self.server = None
|
||||
self.server_options = {}
|
||||
self.wsgi_server = None
|
||||
self.handlers = []
|
||||
self.namespace_handlers = []
|
||||
self.exception_handlers = {}
|
||||
self.default_exception_handler = None
|
||||
self.manage_session = True
|
||||
# We can call init_app when:
|
||||
# - we were given the Flask app instance (standard initialization)
|
||||
# - we were not given the app, but we were given a message_queue
|
||||
# (standard initialization for auxiliary process)
|
||||
# In all other cases we collect the arguments and assume the client
|
||||
# will call init_app from an app factory function.
|
||||
if app is not None or 'message_queue' in kwargs:
|
||||
self.init_app(app, **kwargs)
|
||||
else:
|
||||
self.server_options.update(kwargs)
|
||||
|
||||
def init_app(self, app, **kwargs):
|
||||
if app is not None:
|
||||
if not hasattr(app, 'extensions'):
|
||||
app.extensions = {} # pragma: no cover
|
||||
app.extensions['socketio'] = self
|
||||
self.server_options.update(kwargs)
|
||||
self.manage_session = self.server_options.pop('manage_session',
|
||||
self.manage_session)
|
||||
|
||||
if 'client_manager' not in self.server_options:
|
||||
url = self.server_options.pop('message_queue', None)
|
||||
channel = self.server_options.pop('channel', 'flask-socketio')
|
||||
write_only = app is None
|
||||
if url:
|
||||
if url.startswith(('redis://', "rediss://")):
|
||||
queue_class = socketio.RedisManager
|
||||
elif url.startswith(('kafka://')):
|
||||
queue_class = socketio.KafkaManager
|
||||
elif url.startswith('zmq'):
|
||||
queue_class = socketio.ZmqManager
|
||||
else:
|
||||
queue_class = socketio.KombuManager
|
||||
queue = queue_class(url, channel=channel,
|
||||
write_only=write_only)
|
||||
self.server_options['client_manager'] = queue
|
||||
|
||||
if 'json' in self.server_options and \
|
||||
self.server_options['json'] == flask_json:
|
||||
# flask's json module is tricky to use because its output
|
||||
# changes when it is invoked inside or outside the app context
|
||||
# so here to prevent any ambiguities we replace it with wrappers
|
||||
# that ensure that the app context is always present
|
||||
class FlaskSafeJSON(object):
|
||||
@staticmethod
|
||||
def dumps(*args, **kwargs):
|
||||
with app.app_context():
|
||||
return flask_json.dumps(*args, **kwargs)
|
||||
|
||||
@staticmethod
|
||||
def loads(*args, **kwargs):
|
||||
with app.app_context():
|
||||
return flask_json.loads(*args, **kwargs)
|
||||
|
||||
self.server_options['json'] = FlaskSafeJSON
|
||||
|
||||
resource = self.server_options.pop('path', None) or \
|
||||
self.server_options.pop('resource', None) or 'socket.io'
|
||||
if resource.startswith('/'):
|
||||
resource = resource[1:]
|
||||
if os.environ.get('FLASK_RUN_FROM_CLI'):
|
||||
if self.server_options.get('async_mode') is None:
|
||||
if app is not None:
|
||||
app.logger.warning(
|
||||
'Flask-SocketIO is Running under Werkzeug, WebSocket '
|
||||
'is not available.')
|
||||
self.server_options['async_mode'] = 'threading'
|
||||
self.server = socketio.Server(**self.server_options)
|
||||
self.async_mode = self.server.async_mode
|
||||
for handler in self.handlers:
|
||||
self.server.on(handler[0], handler[1], namespace=handler[2])
|
||||
for namespace_handler in self.namespace_handlers:
|
||||
self.server.register_namespace(namespace_handler)
|
||||
|
||||
if app is not None:
|
||||
# here we attach the SocketIO middlware to the SocketIO object so it
|
||||
# can be referenced later if debug middleware needs to be inserted
|
||||
self.sockio_mw = _SocketIOMiddleware(self.server, app,
|
||||
socketio_path=resource)
|
||||
app.wsgi_app = self.sockio_mw
|
||||
|
||||
def on(self, message, namespace=None):
|
||||
"""Decorator to register a SocketIO event handler.
|
||||
|
||||
This decorator must be applied to SocketIO event handlers. Example::
|
||||
|
||||
@socketio.on('my event', namespace='/chat')
|
||||
def handle_my_custom_event(json):
|
||||
print('received json: ' + str(json))
|
||||
|
||||
:param message: The name of the event. This is normally a user defined
|
||||
string, but a few event names are already defined. Use
|
||||
``'message'`` to define a handler that takes a string
|
||||
payload, ``'json'`` to define a handler that takes a
|
||||
JSON blob payload, ``'connect'`` or ``'disconnect'``
|
||||
to create handlers for connection and disconnection
|
||||
events.
|
||||
:param namespace: The namespace on which the handler is to be
|
||||
registered. Defaults to the global namespace.
|
||||
"""
|
||||
namespace = namespace or '/'
|
||||
|
||||
def decorator(handler):
|
||||
@wraps(handler)
|
||||
def _handler(sid, *args):
|
||||
return self._handle_event(handler, message, namespace, sid,
|
||||
*args)
|
||||
|
||||
if self.server:
|
||||
self.server.on(message, _handler, namespace=namespace)
|
||||
else:
|
||||
self.handlers.append((message, _handler, namespace))
|
||||
return handler
|
||||
return decorator
|
||||
|
||||
def on_error(self, namespace=None):
|
||||
"""Decorator to define a custom error handler for SocketIO events.
|
||||
|
||||
This decorator can be applied to a function that acts as an error
|
||||
handler for a namespace. This handler will be invoked when a SocketIO
|
||||
event handler raises an exception. The handler function must accept one
|
||||
argument, which is the exception raised. Example::
|
||||
|
||||
@socketio.on_error(namespace='/chat')
|
||||
def chat_error_handler(e):
|
||||
print('An error has occurred: ' + str(e))
|
||||
|
||||
:param namespace: The namespace for which to register the error
|
||||
handler. Defaults to the global namespace.
|
||||
"""
|
||||
namespace = namespace or '/'
|
||||
|
||||
def decorator(exception_handler):
|
||||
if not callable(exception_handler):
|
||||
raise ValueError('exception_handler must be callable')
|
||||
self.exception_handlers[namespace] = exception_handler
|
||||
return exception_handler
|
||||
return decorator
|
||||
|
||||
def on_error_default(self, exception_handler):
|
||||
"""Decorator to define a default error handler for SocketIO events.
|
||||
|
||||
This decorator can be applied to a function that acts as a default
|
||||
error handler for any namespaces that do not have a specific handler.
|
||||
Example::
|
||||
|
||||
@socketio.on_error_default
|
||||
def error_handler(e):
|
||||
print('An error has occurred: ' + str(e))
|
||||
"""
|
||||
if not callable(exception_handler):
|
||||
raise ValueError('exception_handler must be callable')
|
||||
self.default_exception_handler = exception_handler
|
||||
return exception_handler
|
||||
|
||||
def on_event(self, message, handler, namespace=None):
|
||||
"""Register a SocketIO event handler.
|
||||
|
||||
``on_event`` is the non-decorator version of ``'on'``.
|
||||
|
||||
Example::
|
||||
|
||||
def on_foo_event(json):
|
||||
print('received json: ' + str(json))
|
||||
|
||||
socketio.on_event('my event', on_foo_event, namespace='/chat')
|
||||
|
||||
:param message: The name of the event. This is normally a user defined
|
||||
string, but a few event names are already defined. Use
|
||||
``'message'`` to define a handler that takes a string
|
||||
payload, ``'json'`` to define a handler that takes a
|
||||
JSON blob payload, ``'connect'`` or ``'disconnect'``
|
||||
to create handlers for connection and disconnection
|
||||
events.
|
||||
:param handler: The function that handles the event.
|
||||
:param namespace: The namespace on which the handler is to be
|
||||
registered. Defaults to the global namespace.
|
||||
"""
|
||||
self.on(message, namespace=namespace)(handler)
|
||||
|
||||
def on_namespace(self, namespace_handler):
|
||||
if not isinstance(namespace_handler, Namespace):
|
||||
raise ValueError('Not a namespace instance.')
|
||||
namespace_handler._set_socketio(self)
|
||||
if self.server:
|
||||
self.server.register_namespace(namespace_handler)
|
||||
else:
|
||||
self.namespace_handlers.append(namespace_handler)
|
||||
|
||||
def emit(self, event, *args, **kwargs):
|
||||
"""Emit a server generated SocketIO event.
|
||||
|
||||
This function emits a SocketIO event to one or more connected clients.
|
||||
A JSON blob can be attached to the event as payload. This function can
|
||||
be used outside of a SocketIO event context, so it is appropriate to
|
||||
use when the server is the originator of an event, outside of any
|
||||
client context, such as in a regular HTTP request handler or a
|
||||
background task. Example::
|
||||
|
||||
@app.route('/ping')
|
||||
def ping():
|
||||
socketio.emit('ping event', {'data': 42}, namespace='/chat')
|
||||
|
||||
:param event: The name of the user event to emit.
|
||||
:param args: A dictionary with the JSON data to send as payload.
|
||||
:param namespace: The namespace under which the message is to be sent.
|
||||
Defaults to the global namespace.
|
||||
:param room: Send the message to all the users in the given room. If
|
||||
this parameter is not included, the event is sent to
|
||||
all connected users.
|
||||
:param skip_sid: The session id of a client to ignore when broadcasting
|
||||
or addressing a room. This is typically set to the
|
||||
originator of the message, so that everyone except
|
||||
that client receive the message. To skip multiple sids
|
||||
pass a list.
|
||||
:param callback: If given, this function will be called to acknowledge
|
||||
that the client has received the message. The
|
||||
arguments that will be passed to the function are
|
||||
those provided by the client. Callback functions can
|
||||
only be used when addressing an individual client.
|
||||
"""
|
||||
namespace = kwargs.pop('namespace', '/')
|
||||
room = kwargs.pop('room', None)
|
||||
include_self = kwargs.pop('include_self', True)
|
||||
skip_sid = kwargs.pop('skip_sid', None)
|
||||
if not include_self and not skip_sid:
|
||||
skip_sid = flask.request.sid
|
||||
callback = kwargs.pop('callback', None)
|
||||
if callback:
|
||||
# wrap the callback so that it sets app app and request contexts
|
||||
sid = flask.request.sid
|
||||
original_callback = callback
|
||||
|
||||
def _callback_wrapper(*args):
|
||||
return self._handle_event(original_callback, None, namespace,
|
||||
sid, *args)
|
||||
|
||||
callback = _callback_wrapper
|
||||
self.server.emit(event, *args, namespace=namespace, room=room,
|
||||
skip_sid=skip_sid, callback=callback, **kwargs)
|
||||
|
||||
def send(self, data, json=False, namespace=None, room=None,
|
||||
callback=None, include_self=True, skip_sid=None, **kwargs):
|
||||
"""Send a server-generated SocketIO message.
|
||||
|
||||
This function sends a simple SocketIO message to one or more connected
|
||||
clients. The message can be a string or a JSON blob. This is a simpler
|
||||
version of ``emit()``, which should be preferred. This function can be
|
||||
used outside of a SocketIO event context, so it is appropriate to use
|
||||
when the server is the originator of an event.
|
||||
|
||||
:param data: The message to send, either a string or a JSON blob.
|
||||
:param json: ``True`` if ``message`` is a JSON blob, ``False``
|
||||
otherwise.
|
||||
:param namespace: The namespace under which the message is to be sent.
|
||||
Defaults to the global namespace.
|
||||
:param room: Send the message only to the users in the given room. If
|
||||
this parameter is not included, the message is sent to
|
||||
all connected users.
|
||||
:param skip_sid: The session id of a client to ignore when broadcasting
|
||||
or addressing a room. This is typically set to the
|
||||
originator of the message, so that everyone except
|
||||
that client receive the message. To skip multiple sids
|
||||
pass a list.
|
||||
:param callback: If given, this function will be called to acknowledge
|
||||
that the client has received the message. The
|
||||
arguments that will be passed to the function are
|
||||
those provided by the client. Callback functions can
|
||||
only be used when addressing an individual client.
|
||||
"""
|
||||
skip_sid = flask.request.sid if not include_self else skip_sid
|
||||
if json:
|
||||
self.emit('json', data, namespace=namespace, room=room,
|
||||
skip_sid=skip_sid, callback=callback, **kwargs)
|
||||
else:
|
||||
self.emit('message', data, namespace=namespace, room=room,
|
||||
skip_sid=skip_sid, callback=callback, **kwargs)
|
||||
|
||||
def close_room(self, room, namespace=None):
|
||||
"""Close a room.
|
||||
|
||||
This function removes any users that are in the given room and then
|
||||
deletes the room from the server. This function can be used outside
|
||||
of a SocketIO event context.
|
||||
|
||||
:param room: The name of the room to close.
|
||||
:param namespace: The namespace under which the room exists. Defaults
|
||||
to the global namespace.
|
||||
"""
|
||||
self.server.close_room(room, namespace)
|
||||
|
||||
def run(self, app, host=None, port=None, **kwargs):
|
||||
"""Run the SocketIO web server.
|
||||
|
||||
:param app: The Flask application instance.
|
||||
:param host: The hostname or IP address for the server to listen on.
|
||||
Defaults to 127.0.0.1.
|
||||
:param port: The port number for the server to listen on. Defaults to
|
||||
5000.
|
||||
:param debug: ``True`` to start the server in debug mode, ``False`` to
|
||||
start in normal mode.
|
||||
:param use_reloader: ``True`` to enable the Flask reloader, ``False``
|
||||
to disable it.
|
||||
:param extra_files: A list of additional files that the Flask
|
||||
reloader should watch. Defaults to ``None``
|
||||
:param log_output: If ``True``, the server logs all incomming
|
||||
connections. If ``False`` logging is disabled.
|
||||
Defaults to ``True`` in debug mode, ``False``
|
||||
in normal mode. Unused when the threading async
|
||||
mode is used.
|
||||
:param kwargs: Additional web server options. The web server options
|
||||
are specific to the server used in each of the supported
|
||||
async modes. Note that options provided here will
|
||||
not be seen when using an external web server such
|
||||
as gunicorn, since this method is not called in that
|
||||
case.
|
||||
"""
|
||||
if host is None:
|
||||
host = '127.0.0.1'
|
||||
if port is None:
|
||||
server_name = app.config['SERVER_NAME']
|
||||
if server_name and ':' in server_name:
|
||||
port = int(server_name.rsplit(':', 1)[1])
|
||||
else:
|
||||
port = 5000
|
||||
|
||||
debug = kwargs.pop('debug', app.debug)
|
||||
log_output = kwargs.pop('log_output', debug)
|
||||
use_reloader = kwargs.pop('use_reloader', debug)
|
||||
extra_files = kwargs.pop('extra_files', None)
|
||||
|
||||
app.debug = debug
|
||||
if app.debug and self.server.eio.async_mode != 'threading':
|
||||
# put the debug middleware between the SocketIO middleware
|
||||
# and the Flask application instance
|
||||
#
|
||||
# mw1 mw2 mw3 Flask app
|
||||
# o ---- o ---- o ---- o
|
||||
# /
|
||||
# o Flask-SocketIO
|
||||
# \ middleware
|
||||
# o
|
||||
# Flask-SocketIO WebSocket handler
|
||||
#
|
||||
# BECOMES
|
||||
#
|
||||
# dbg-mw mw1 mw2 mw3 Flask app
|
||||
# o ---- o ---- o ---- o ---- o
|
||||
# /
|
||||
# o Flask-SocketIO
|
||||
# \ middleware
|
||||
# o
|
||||
# Flask-SocketIO WebSocket handler
|
||||
#
|
||||
self.sockio_mw.wsgi_app = DebuggedApplication(self.sockio_mw.wsgi_app,
|
||||
evalex=True)
|
||||
|
||||
if self.server.eio.async_mode == 'threading':
|
||||
from werkzeug._internal import _log
|
||||
_log('warning', 'WebSocket transport not available. Install '
|
||||
'eventlet or gevent and gevent-websocket for '
|
||||
'improved performance.')
|
||||
app.run(host=host, port=port, threaded=True,
|
||||
use_reloader=use_reloader, **kwargs)
|
||||
elif self.server.eio.async_mode == 'eventlet':
|
||||
def run_server():
|
||||
import eventlet
|
||||
import eventlet.wsgi
|
||||
import eventlet.green
|
||||
addresses = eventlet.green.socket.getaddrinfo(host, port)
|
||||
if not addresses:
|
||||
raise RuntimeError('Could not resolve host to a valid address')
|
||||
eventlet_socket = eventlet.listen(addresses[0][4], addresses[0][0])
|
||||
|
||||
# If provided an SSL argument, use an SSL socket
|
||||
ssl_args = ['keyfile', 'certfile', 'server_side', 'cert_reqs',
|
||||
'ssl_version', 'ca_certs',
|
||||
'do_handshake_on_connect', 'suppress_ragged_eofs',
|
||||
'ciphers']
|
||||
ssl_params = {k: kwargs[k] for k in kwargs if k in ssl_args}
|
||||
if len(ssl_params) > 0:
|
||||
for k in ssl_params:
|
||||
kwargs.pop(k)
|
||||
ssl_params['server_side'] = True # Listening requires true
|
||||
eventlet_socket = eventlet.wrap_ssl(eventlet_socket,
|
||||
**ssl_params)
|
||||
|
||||
eventlet.wsgi.server(eventlet_socket, app,
|
||||
log_output=log_output, **kwargs)
|
||||
|
||||
if use_reloader:
|
||||
run_with_reloader(run_server, extra_files=extra_files)
|
||||
else:
|
||||
run_server()
|
||||
elif self.server.eio.async_mode == 'gevent':
|
||||
from gevent import pywsgi
|
||||
try:
|
||||
from geventwebsocket.handler import WebSocketHandler
|
||||
websocket = True
|
||||
except ImportError:
|
||||
websocket = False
|
||||
|
||||
log = 'default'
|
||||
if not log_output:
|
||||
log = None
|
||||
if websocket:
|
||||
self.wsgi_server = pywsgi.WSGIServer(
|
||||
(host, port), app, handler_class=WebSocketHandler,
|
||||
log=log, **kwargs)
|
||||
else:
|
||||
self.wsgi_server = pywsgi.WSGIServer((host, port), app,
|
||||
log=log, **kwargs)
|
||||
|
||||
if use_reloader:
|
||||
# monkey patching is required by the reloader
|
||||
from gevent import monkey
|
||||
monkey.patch_thread()
|
||||
monkey.patch_time()
|
||||
|
||||
def run_server():
|
||||
self.wsgi_server.serve_forever()
|
||||
|
||||
run_with_reloader(run_server, extra_files=extra_files)
|
||||
else:
|
||||
self.wsgi_server.serve_forever()
|
||||
|
||||
def stop(self):
|
||||
"""Stop a running SocketIO web server.
|
||||
|
||||
This method must be called from a HTTP or SocketIO handler function.
|
||||
"""
|
||||
if self.server.eio.async_mode == 'threading':
|
||||
func = flask.request.environ.get('werkzeug.server.shutdown')
|
||||
if func:
|
||||
func()
|
||||
else:
|
||||
raise RuntimeError('Cannot stop unknown web server')
|
||||
elif self.server.eio.async_mode == 'eventlet':
|
||||
raise SystemExit
|
||||
elif self.server.eio.async_mode == 'gevent':
|
||||
self.wsgi_server.stop()
|
||||
|
||||
def start_background_task(self, target, *args, **kwargs):
|
||||
"""Start a background task using the appropriate async model.
|
||||
|
||||
This is a utility function that applications can use to start a
|
||||
background task using the method that is compatible with the
|
||||
selected async mode.
|
||||
|
||||
:param target: the target function to execute.
|
||||
:param args: arguments to pass to the function.
|
||||
:param kwargs: keyword arguments to pass to the function.
|
||||
|
||||
This function returns an object compatible with the `Thread` class in
|
||||
the Python standard library. The `start()` method on this object is
|
||||
already called by this function.
|
||||
"""
|
||||
return self.server.start_background_task(target, *args, **kwargs)
|
||||
|
||||
def sleep(self, seconds=0):
|
||||
"""Sleep for the requested amount of time using the appropriate async
|
||||
model.
|
||||
|
||||
This is a utility function that applications can use to put a task to
|
||||
sleep without having to worry about using the correct call for the
|
||||
selected async mode.
|
||||
"""
|
||||
return self.server.sleep(seconds)
|
||||
|
||||
def test_client(self, app, namespace=None, query_string=None,
|
||||
headers=None, flask_test_client=None):
|
||||
"""The Socket.IO test client is useful for testing a Flask-SocketIO
|
||||
server. It works in a similar way to the Flask Test Client, but
|
||||
adapted to the Socket.IO server.
|
||||
|
||||
:param app: The Flask application instance.
|
||||
:param namespace: The namespace for the client. If not provided, the
|
||||
client connects to the server on the global
|
||||
namespace.
|
||||
:param query_string: A string with custom query string arguments.
|
||||
:param headers: A dictionary with custom HTTP headers.
|
||||
:param flask_test_client: The instance of the Flask test client
|
||||
currently in use. Passing the Flask test
|
||||
client is optional, but is necessary if you
|
||||
want the Flask user session and any other
|
||||
cookies set in HTTP routes accessible from
|
||||
Socket.IO events.
|
||||
"""
|
||||
return SocketIOTestClient(app, self, namespace=namespace,
|
||||
query_string=query_string, headers=headers,
|
||||
flask_test_client=flask_test_client)
|
||||
|
||||
def _handle_event(self, handler, message, namespace, sid, *args):
|
||||
if sid not in self.server.environ:
|
||||
# we don't have record of this client, ignore this event
|
||||
return '', 400
|
||||
app = self.server.environ[sid]['flask.app']
|
||||
with app.request_context(self.server.environ[sid]):
|
||||
if self.manage_session:
|
||||
# manage a separate session for this client's Socket.IO events
|
||||
# created as a copy of the regular user session
|
||||
if 'saved_session' not in self.server.environ[sid]:
|
||||
self.server.environ[sid]['saved_session'] = \
|
||||
_ManagedSession(flask.session)
|
||||
session_obj = self.server.environ[sid]['saved_session']
|
||||
else:
|
||||
# let Flask handle the user session
|
||||
# for cookie based sessions, this effectively freezes the
|
||||
# session to its state at connection time
|
||||
# for server-side sessions, this allows HTTP and Socket.IO to
|
||||
# share the session, with both having read/write access to it
|
||||
session_obj = flask.session._get_current_object()
|
||||
_request_ctx_stack.top.session = session_obj
|
||||
flask.request.sid = sid
|
||||
flask.request.namespace = namespace
|
||||
flask.request.event = {'message': message, 'args': args}
|
||||
try:
|
||||
if message == 'connect':
|
||||
ret = handler()
|
||||
else:
|
||||
ret = handler(*args)
|
||||
except:
|
||||
err_handler = self.exception_handlers.get(
|
||||
namespace, self.default_exception_handler)
|
||||
if err_handler is None:
|
||||
raise
|
||||
type, value, traceback = sys.exc_info()
|
||||
return err_handler(value)
|
||||
if not self.manage_session:
|
||||
# when Flask is managing the user session, it needs to save it
|
||||
if not hasattr(session_obj, 'modified') or session_obj.modified:
|
||||
resp = app.response_class()
|
||||
app.session_interface.save_session(app, session_obj, resp)
|
||||
return ret
|
||||
|
||||
|
||||
def emit(event, *args, **kwargs):
|
||||
"""Emit a SocketIO event.
|
||||
|
||||
This function emits a SocketIO event to one or more connected clients. A
|
||||
JSON blob can be attached to the event as payload. This is a function that
|
||||
can only be called from a SocketIO event handler, as in obtains some
|
||||
information from the current client context. Example::
|
||||
|
||||
@socketio.on('my event')
|
||||
def handle_my_custom_event(json):
|
||||
emit('my response', {'data': 42})
|
||||
|
||||
:param event: The name of the user event to emit.
|
||||
:param args: A dictionary with the JSON data to send as payload.
|
||||
:param namespace: The namespace under which the message is to be sent.
|
||||
Defaults to the namespace used by the originating event.
|
||||
A ``'/'`` can be used to explicitly specify the global
|
||||
namespace.
|
||||
:param callback: Callback function to invoke with the client's
|
||||
acknowledgement.
|
||||
:param broadcast: ``True`` to send the message to all clients, or ``False``
|
||||
to only reply to the sender of the originating event.
|
||||
:param room: Send the message to all the users in the given room. If this
|
||||
argument is set, then broadcast is implied to be ``True``.
|
||||
:param include_self: ``True`` to include the sender when broadcasting or
|
||||
addressing a room, or ``False`` to send to everyone
|
||||
but the sender.
|
||||
:param ignore_queue: Only used when a message queue is configured. If
|
||||
set to ``True``, the event is emitted to the
|
||||
clients directly, without going through the queue.
|
||||
This is more efficient, but only works when a
|
||||
single server process is used, or when there is a
|
||||
single addresee. It is recommended to always leave
|
||||
this parameter with its default value of ``False``.
|
||||
"""
|
||||
if 'namespace' in kwargs:
|
||||
namespace = kwargs['namespace']
|
||||
else:
|
||||
namespace = flask.request.namespace
|
||||
callback = kwargs.get('callback')
|
||||
broadcast = kwargs.get('broadcast')
|
||||
room = kwargs.get('room')
|
||||
if room is None and not broadcast:
|
||||
room = flask.request.sid
|
||||
include_self = kwargs.get('include_self', True)
|
||||
ignore_queue = kwargs.get('ignore_queue', False)
|
||||
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
return socketio.emit(event, *args, namespace=namespace, room=room,
|
||||
include_self=include_self, callback=callback,
|
||||
ignore_queue=ignore_queue)
|
||||
|
||||
|
||||
def send(message, **kwargs):
|
||||
"""Send a SocketIO message.
|
||||
|
||||
This function sends a simple SocketIO message to one or more connected
|
||||
clients. The message can be a string or a JSON blob. This is a simpler
|
||||
version of ``emit()``, which should be preferred. This is a function that
|
||||
can only be called from a SocketIO event handler.
|
||||
|
||||
:param message: The message to send, either a string or a JSON blob.
|
||||
:param json: ``True`` if ``message`` is a JSON blob, ``False``
|
||||
otherwise.
|
||||
:param namespace: The namespace under which the message is to be sent.
|
||||
Defaults to the namespace used by the originating event.
|
||||
An empty string can be used to use the global namespace.
|
||||
:param callback: Callback function to invoke with the client's
|
||||
acknowledgement.
|
||||
:param broadcast: ``True`` to send the message to all connected clients, or
|
||||
``False`` to only reply to the sender of the originating
|
||||
event.
|
||||
:param room: Send the message to all the users in the given room.
|
||||
:param include_self: ``True`` to include the sender when broadcasting or
|
||||
addressing a room, or ``False`` to send to everyone
|
||||
but the sender.
|
||||
:param ignore_queue: Only used when a message queue is configured. If
|
||||
set to ``True``, the event is emitted to the
|
||||
clients directly, without going through the queue.
|
||||
This is more efficient, but only works when a
|
||||
single server process is used, or when there is a
|
||||
single addresee. It is recommended to always leave
|
||||
this parameter with its default value of ``False``.
|
||||
"""
|
||||
json = kwargs.get('json', False)
|
||||
if 'namespace' in kwargs:
|
||||
namespace = kwargs['namespace']
|
||||
else:
|
||||
namespace = flask.request.namespace
|
||||
callback = kwargs.get('callback')
|
||||
broadcast = kwargs.get('broadcast')
|
||||
room = kwargs.get('room')
|
||||
if room is None and not broadcast:
|
||||
room = flask.request.sid
|
||||
include_self = kwargs.get('include_self', True)
|
||||
ignore_queue = kwargs.get('ignore_queue', False)
|
||||
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
return socketio.send(message, json=json, namespace=namespace, room=room,
|
||||
include_self=include_self, callback=callback,
|
||||
ignore_queue=ignore_queue)
|
||||
|
||||
|
||||
def join_room(room, sid=None, namespace=None):
|
||||
"""Join a room.
|
||||
|
||||
This function puts the user in a room, under the current namespace. The
|
||||
user and the namespace are obtained from the event context. This is a
|
||||
function that can only be called from a SocketIO event handler. Example::
|
||||
|
||||
@socketio.on('join')
|
||||
def on_join(data):
|
||||
username = session['username']
|
||||
room = data['room']
|
||||
join_room(room)
|
||||
send(username + ' has entered the room.', room=room)
|
||||
|
||||
:param room: The name of the room to join.
|
||||
:param sid: The session id of the client. If not provided, the client is
|
||||
obtained from the request context.
|
||||
:param namespace: The namespace for the room. If not provided, the
|
||||
namespace is obtained from the request context.
|
||||
"""
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
sid = sid or flask.request.sid
|
||||
namespace = namespace or flask.request.namespace
|
||||
socketio.server.enter_room(sid, room, namespace=namespace)
|
||||
|
||||
|
||||
def leave_room(room, sid=None, namespace=None):
|
||||
"""Leave a room.
|
||||
|
||||
This function removes the user from a room, under the current namespace.
|
||||
The user and the namespace are obtained from the event context. Example::
|
||||
|
||||
@socketio.on('leave')
|
||||
def on_leave(data):
|
||||
username = session['username']
|
||||
room = data['room']
|
||||
leave_room(room)
|
||||
send(username + ' has left the room.', room=room)
|
||||
|
||||
:param room: The name of the room to leave.
|
||||
:param sid: The session id of the client. If not provided, the client is
|
||||
obtained from the request context.
|
||||
:param namespace: The namespace for the room. If not provided, the
|
||||
namespace is obtained from the request context.
|
||||
"""
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
sid = sid or flask.request.sid
|
||||
namespace = namespace or flask.request.namespace
|
||||
socketio.server.leave_room(sid, room, namespace=namespace)
|
||||
|
||||
|
||||
def close_room(room, namespace=None):
|
||||
"""Close a room.
|
||||
|
||||
This function removes any users that are in the given room and then deletes
|
||||
the room from the server.
|
||||
|
||||
:param room: The name of the room to close.
|
||||
:param namespace: The namespace for the room. If not provided, the
|
||||
namespace is obtained from the request context.
|
||||
"""
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
namespace = namespace or flask.request.namespace
|
||||
socketio.server.close_room(room, namespace=namespace)
|
||||
|
||||
|
||||
def rooms(sid=None, namespace=None):
|
||||
"""Return a list of the rooms the client is in.
|
||||
|
||||
This function returns all the rooms the client has entered, including its
|
||||
own room, assigned by the Socket.IO server.
|
||||
|
||||
:param sid: The session id of the client. If not provided, the client is
|
||||
obtained from the request context.
|
||||
:param namespace: The namespace for the room. If not provided, the
|
||||
namespace is obtained from the request context.
|
||||
"""
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
sid = sid or flask.request.sid
|
||||
namespace = namespace or flask.request.namespace
|
||||
return socketio.server.rooms(sid, namespace=namespace)
|
||||
|
||||
|
||||
def disconnect(sid=None, namespace=None, silent=False):
|
||||
"""Disconnect the client.
|
||||
|
||||
This function terminates the connection with the client. As a result of
|
||||
this call the client will receive a disconnect event. Example::
|
||||
|
||||
@socketio.on('message')
|
||||
def receive_message(msg):
|
||||
if is_banned(session['username']):
|
||||
disconnect()
|
||||
else:
|
||||
# ...
|
||||
|
||||
:param sid: The session id of the client. If not provided, the client is
|
||||
obtained from the request context.
|
||||
:param namespace: The namespace for the room. If not provided, the
|
||||
namespace is obtained from the request context.
|
||||
:param silent: this option is deprecated.
|
||||
"""
|
||||
socketio = flask.current_app.extensions['socketio']
|
||||
sid = sid or flask.request.sid
|
||||
namespace = namespace or flask.request.namespace
|
||||
return socketio.server.disconnect(sid, namespace=namespace)
|
||||
@@ -1,47 +0,0 @@
|
||||
from socketio import Namespace as _Namespace
|
||||
|
||||
|
||||
class Namespace(_Namespace):
|
||||
def __init__(self, namespace=None):
|
||||
super(Namespace, self).__init__(namespace)
|
||||
self.socketio = None
|
||||
|
||||
def _set_socketio(self, socketio):
|
||||
self.socketio = socketio
|
||||
|
||||
def trigger_event(self, event, *args):
|
||||
"""Dispatch an event to the proper handler method.
|
||||
|
||||
In the most common usage, this method is not overloaded by subclasses,
|
||||
as it performs the routing of events to methods. However, this
|
||||
method can be overriden if special dispatching rules are needed, or if
|
||||
having a single method that catches all events is desired.
|
||||
"""
|
||||
handler_name = 'on_' + event
|
||||
if not hasattr(self, handler_name):
|
||||
# there is no handler for this event, so we ignore it
|
||||
return
|
||||
handler = getattr(self, handler_name)
|
||||
return self.socketio._handle_event(handler, event, self.namespace,
|
||||
*args)
|
||||
|
||||
def emit(self, event, data=None, room=None, include_self=True,
|
||||
namespace=None, callback=None):
|
||||
"""Emit a custom event to one or more connected clients."""
|
||||
return self.socketio.emit(event, data, room=room,
|
||||
include_self=include_self,
|
||||
namespace=namespace or self.namespace,
|
||||
callback=callback)
|
||||
|
||||
def send(self, data, room=None, include_self=True, namespace=None,
|
||||
callback=None):
|
||||
"""Send a message to one or more connected clients."""
|
||||
return self.socketio.send(data, room=room, include_self=include_self,
|
||||
namespace=namespace or self.namespace,
|
||||
callback=callback)
|
||||
|
||||
def close_room(self, room, namespace=None):
|
||||
"""Close a room."""
|
||||
return self.socketio.close_room(room=room,
|
||||
namespace=namespace or self.namespace)
|
||||
|
||||
@@ -1,205 +0,0 @@
|
||||
import uuid
|
||||
|
||||
from socketio import packet
|
||||
from socketio.pubsub_manager import PubSubManager
|
||||
from werkzeug.test import EnvironBuilder
|
||||
|
||||
|
||||
class SocketIOTestClient(object):
|
||||
"""
|
||||
This class is useful for testing a Flask-SocketIO server. It works in a
|
||||
similar way to the Flask Test Client, but adapted to the Socket.IO server.
|
||||
|
||||
:param app: The Flask application instance.
|
||||
:param socketio: The application's ``SocketIO`` instance.
|
||||
:param namespace: The namespace for the client. If not provided, the client
|
||||
connects to the server on the global namespace.
|
||||
:param query_string: A string with custom query string arguments.
|
||||
:param headers: A dictionary with custom HTTP headers.
|
||||
:param flask_test_client: The instance of the Flask test client
|
||||
currently in use. Passing the Flask test
|
||||
client is optional, but is necessary if you
|
||||
want the Flask user session and any other
|
||||
cookies set in HTTP routes accessible from
|
||||
Socket.IO events.
|
||||
"""
|
||||
queue = {}
|
||||
acks = {}
|
||||
|
||||
def __init__(self, app, socketio, namespace=None, query_string=None,
|
||||
headers=None, flask_test_client=None):
|
||||
def _mock_send_packet(sid, pkt):
|
||||
if pkt.packet_type == packet.EVENT or \
|
||||
pkt.packet_type == packet.BINARY_EVENT:
|
||||
if sid not in self.queue:
|
||||
self.queue[sid] = []
|
||||
if pkt.data[0] == 'message' or pkt.data[0] == 'json':
|
||||
self.queue[sid].append({'name': pkt.data[0],
|
||||
'args': pkt.data[1],
|
||||
'namespace': pkt.namespace or '/'})
|
||||
else:
|
||||
self.queue[sid].append({'name': pkt.data[0],
|
||||
'args': pkt.data[1:],
|
||||
'namespace': pkt.namespace or '/'})
|
||||
elif pkt.packet_type == packet.ACK or \
|
||||
pkt.packet_type == packet.BINARY_ACK:
|
||||
self.acks[sid] = {'args': pkt.data,
|
||||
'namespace': pkt.namespace or '/'}
|
||||
elif pkt.packet_type == packet.DISCONNECT:
|
||||
self.connected[pkt.namespace or '/'] = False
|
||||
|
||||
self.app = app
|
||||
self.flask_test_client = flask_test_client
|
||||
self.sid = uuid.uuid4().hex
|
||||
self.queue[self.sid] = []
|
||||
self.acks[self.sid] = None
|
||||
self.callback_counter = 0
|
||||
self.socketio = socketio
|
||||
self.connected = {}
|
||||
socketio.server._send_packet = _mock_send_packet
|
||||
socketio.server.environ[self.sid] = {}
|
||||
socketio.server.async_handlers = False # easier to test when
|
||||
socketio.server.eio.async_handlers = False # events are sync
|
||||
if isinstance(socketio.server.manager, PubSubManager):
|
||||
raise RuntimeError('Test client cannot be used with a message '
|
||||
'queue. Disable the queue on your test '
|
||||
'configuration.')
|
||||
socketio.server.manager.initialize()
|
||||
self.connect(namespace=namespace, query_string=query_string,
|
||||
headers=headers)
|
||||
|
||||
def is_connected(self, namespace=None):
|
||||
"""Check if a namespace is connected.
|
||||
|
||||
:param namespace: The namespace to check. The global namespace is
|
||||
assumed if this argument is not provided.
|
||||
"""
|
||||
return self.connected.get(namespace or '/', False)
|
||||
|
||||
def connect(self, namespace=None, query_string=None, headers=None):
|
||||
"""Connect the client.
|
||||
|
||||
:param namespace: The namespace for the client. If not provided, the
|
||||
client connects to the server on the global
|
||||
namespace.
|
||||
:param query_string: A string with custom query string arguments.
|
||||
:param headers: A dictionary with custom HTTP headers.
|
||||
|
||||
Note that it is usually not necessary to explicitly call this method,
|
||||
since a connection is automatically established when an instance of
|
||||
this class is created. An example where it this method would be useful
|
||||
is when the application accepts multiple namespace connections.
|
||||
"""
|
||||
url = '/socket.io'
|
||||
if query_string:
|
||||
if query_string[0] != '?':
|
||||
query_string = '?' + query_string
|
||||
url += query_string
|
||||
environ = EnvironBuilder(url, headers=headers).get_environ()
|
||||
environ['flask.app'] = self.app
|
||||
if self.flask_test_client:
|
||||
# inject cookies from Flask
|
||||
self.flask_test_client.cookie_jar.inject_wsgi(environ)
|
||||
self.connected['/'] = True
|
||||
if self.socketio.server._handle_eio_connect(
|
||||
self.sid, environ) is False:
|
||||
del self.connected['/']
|
||||
if namespace is not None and namespace != '/':
|
||||
self.connected[namespace] = True
|
||||
pkt = packet.Packet(packet.CONNECT, namespace=namespace)
|
||||
with self.app.app_context():
|
||||
if self.socketio.server._handle_eio_message(
|
||||
self.sid, pkt.encode()) is False:
|
||||
del self.connected[namespace]
|
||||
|
||||
def disconnect(self, namespace=None):
|
||||
"""Disconnect the client.
|
||||
|
||||
:param namespace: The namespace to disconnect. The global namespace is
|
||||
assumed if this argument is not provided.
|
||||
"""
|
||||
if not self.is_connected(namespace):
|
||||
raise RuntimeError('not connected')
|
||||
pkt = packet.Packet(packet.DISCONNECT, namespace=namespace)
|
||||
with self.app.app_context():
|
||||
self.socketio.server._handle_eio_message(self.sid, pkt.encode())
|
||||
del self.connected[namespace or '/']
|
||||
|
||||
def emit(self, event, *args, **kwargs):
|
||||
"""Emit an event to the server.
|
||||
|
||||
:param event: The event name.
|
||||
:param *args: The event arguments.
|
||||
:param callback: ``True`` if the client requests a callback, ``False``
|
||||
if not. Note that client-side callbacks are not
|
||||
implemented, a callback request will just tell the
|
||||
server to provide the arguments to invoke the
|
||||
callback, but no callback is invoked. Instead, the
|
||||
arguments that the server provided for the callback
|
||||
are returned by this function.
|
||||
:param namespace: The namespace of the event. The global namespace is
|
||||
assumed if this argument is not provided.
|
||||
"""
|
||||
namespace = kwargs.pop('namespace', None)
|
||||
if not self.is_connected(namespace):
|
||||
raise RuntimeError('not connected')
|
||||
callback = kwargs.pop('callback', False)
|
||||
id = None
|
||||
if callback:
|
||||
self.callback_counter += 1
|
||||
id = self.callback_counter
|
||||
pkt = packet.Packet(packet.EVENT, data=[event] + list(args),
|
||||
namespace=namespace, id=id)
|
||||
with self.app.app_context():
|
||||
encoded_pkt = pkt.encode()
|
||||
if isinstance(encoded_pkt, list):
|
||||
for epkt in encoded_pkt:
|
||||
self.socketio.server._handle_eio_message(self.sid, epkt)
|
||||
else:
|
||||
self.socketio.server._handle_eio_message(self.sid, encoded_pkt)
|
||||
ack = self.acks.pop(self.sid, None)
|
||||
if ack is not None:
|
||||
return ack['args'][0] if len(ack['args']) == 1 \
|
||||
else ack['args']
|
||||
|
||||
def send(self, data, json=False, callback=False, namespace=None):
|
||||
"""Send a text or JSON message to the server.
|
||||
|
||||
:param data: A string, dictionary or list to send to the server.
|
||||
:param json: ``True`` to send a JSON message, ``False`` to send a text
|
||||
message.
|
||||
:param callback: ``True`` if the client requests a callback, ``False``
|
||||
if not. Note that client-side callbacks are not
|
||||
implemented, a callback request will just tell the
|
||||
server to provide the arguments to invoke the
|
||||
callback, but no callback is invoked. Instead, the
|
||||
arguments that the server provided for the callback
|
||||
are returned by this function.
|
||||
:param namespace: The namespace of the event. The global namespace is
|
||||
assumed if this argument is not provided.
|
||||
"""
|
||||
if json:
|
||||
msg = 'json'
|
||||
else:
|
||||
msg = 'message'
|
||||
return self.emit(msg, data, callback=callback, namespace=namespace)
|
||||
|
||||
def get_received(self, namespace=None):
|
||||
"""Return the list of messages received from the server.
|
||||
|
||||
Since this is not a real client, any time the server emits an event,
|
||||
the event is simply stored. The test code can invoke this method to
|
||||
obtain the list of events that were received since the last call.
|
||||
|
||||
:param namespace: The namespace to get events from. The global
|
||||
namespace is assumed if this argument is not
|
||||
provided.
|
||||
"""
|
||||
if not self.is_connected(namespace):
|
||||
raise RuntimeError('not connected')
|
||||
namespace = namespace or '/'
|
||||
r = [pkt for pkt in self.queue[self.sid]
|
||||
if pkt['namespace'] == namespace]
|
||||
self.queue[self.sid] = [pkt for pkt in self.queue[self.sid]
|
||||
if pkt not in r]
|
||||
return r
|
||||
@@ -1,38 +0,0 @@
|
||||
import sys
|
||||
|
||||
from .client import Client
|
||||
from .base_manager import BaseManager
|
||||
from .pubsub_manager import PubSubManager
|
||||
from .kombu_manager import KombuManager
|
||||
from .redis_manager import RedisManager
|
||||
from .kafka_manager import KafkaManager
|
||||
from .zmq_manager import ZmqManager
|
||||
from .server import Server
|
||||
from .namespace import Namespace, ClientNamespace
|
||||
from .middleware import WSGIApp, Middleware
|
||||
from .tornado import get_tornado_handler
|
||||
if sys.version_info >= (3, 5): # pragma: no cover
|
||||
from .asyncio_client import AsyncClient
|
||||
from .asyncio_server import AsyncServer
|
||||
from .asyncio_manager import AsyncManager
|
||||
from .asyncio_namespace import AsyncNamespace, AsyncClientNamespace
|
||||
from .asyncio_redis_manager import AsyncRedisManager
|
||||
from .asyncio_aiopika_manager import AsyncAioPikaManager
|
||||
from .asgi import ASGIApp
|
||||
else: # pragma: no cover
|
||||
AsyncClient = None
|
||||
AsyncServer = None
|
||||
AsyncManager = None
|
||||
AsyncNamespace = None
|
||||
AsyncRedisManager = None
|
||||
AsyncAioPikaManager = None
|
||||
|
||||
__version__ = '4.4.0'
|
||||
|
||||
__all__ = ['__version__', 'Client', 'Server', 'BaseManager', 'PubSubManager',
|
||||
'KombuManager', 'RedisManager', 'ZmqManager', 'KafkaManager',
|
||||
'Namespace', 'ClientNamespace', 'WSGIApp', 'Middleware']
|
||||
if AsyncServer is not None: # pragma: no cover
|
||||
__all__ += ['AsyncClient', 'AsyncServer', 'AsyncNamespace',
|
||||
'AsyncClientNamespace', 'AsyncManager', 'AsyncRedisManager',
|
||||
'ASGIApp', 'get_tornado_handler', 'AsyncAioPikaManager']
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user