From 95b8aadb239bdce8d7f7a03e4ab995e56bf4e820 Mon Sep 17 00:00:00 2001
From: Louis Vézina <5130500+morpheus65535@users.noreply.github.com>
Date: Wed, 29 Jan 2020 06:53:29 -0500
Subject: [PATCH] WIP

---
 bazarr/config.py | 6 +-
 bazarr/get_subtitle.py | 19 +-
 bazarr/libs.py | 5 -
 bazarr/list_subtitles.py | 10 +-
 bazarr/logger.py | 3 +-
 bazarr/main.py | 7 +-
 libs2/bs4/AUTHORS.txt | 43 -
 libs2/bs4/COPYING.txt | 27 -
 libs2/bs4/NEWS.txt | 1190 ----------
 libs2/bs4/README.txt | 63 -
 libs2/bs4/TODO.txt | 31 -
 libs2/bs4/__init__.py | 529 -----
 libs2/bs4/builder/__init__.py | 333 ---
 libs2/bs4/builder/_html5lib.py | 426 ----
 libs2/bs4/builder/_htmlparser.py | 314 ---
 libs2/bs4/builder/_lxml.py | 258 --
 libs2/bs4/dammit.py | 842 -------
 libs2/bs4/diagnose.py | 219 --
 libs2/bs4/element.py | 1808 --------
 libs2/bs4/formatter.py | 99 -
 libs2/bs4/testing.py | 770 ------
 libs2/bs4/tests/__init__.py | 1 -
 libs2/bs4/tests/test_builder_registry.py | 147 --
 libs2/bs4/tests/test_docs.py | 36 -
 libs2/bs4/tests/test_html5lib.py | 130 -
 libs2/bs4/tests/test_htmlparser.py | 34 -
 libs2/bs4/tests/test_lxml.py | 76 -
 libs2/bs4/tests/test_soup.py | 501 ----
 libs2/bs4/tests/test_tree.py | 2050 ----------
 libs2/concurrent/__init__.py | 3 -
 libs2/concurrent/futures/__init__.py | 23 -
 libs2/concurrent/futures/_base.py | 607 -----
 libs2/concurrent/futures/process.py | 359 ---
 libs2/concurrent/futures/thread.py | 134 --
 libs2/enum/LICENSE | 32 -
 libs2/enum/README | 3 -
 libs2/enum/__init__.py | 837 -------
 libs2/enum/doc/enum.pdf | 2237 ------
 libs2/enum/doc/enum.rst | 735 ------
 libs2/enum/test.py | 1820 --------
 libs2/yaml/__init__.py | 406 ----
 libs2/yaml/composer.py | 139 --
 libs2/yaml/constructor.py | 709 ------
 libs2/yaml/cyaml.py | 101 -
 libs2/yaml/dumper.py | 62 -
 libs2/yaml/emitter.py | 1144 ---------
 libs2/yaml/error.py | 75 -
 libs2/yaml/events.py | 86 -
 libs2/yaml/loader.py | 63 -
 libs2/yaml/nodes.py | 49 -
 libs2/yaml/parser.py | 589 -----
 libs2/yaml/reader.py | 188 --
 libs2/yaml/representer.py | 488 ----
 libs2/yaml/resolver.py | 227 --
 libs2/yaml/scanner.py | 1444 ----------
 libs2/yaml/serializer.py | 111 -
 libs2/yaml/tokens.py | 104 -
 libs3/bs4/__init__.py | 616 -----
 libs3/bs4/builder/__init__.py | 367 ---
 libs3/bs4/builder/_html5lib.py | 426 ----
 libs3/bs4/builder/_htmlparser.py | 350 ---
 libs3/bs4/builder/_lxml.py | 296 ---
 libs3/bs4/dammit.py | 850 -------
 libs3/bs4/diagnose.py | 224 --
 libs3/bs4/element.py | 1579 ------------
 libs3/bs4/formatter.py | 99 -
 libs3/bs4/testing.py | 992 --------
 libs3/bs4/tests/__init__.py | 1 -
 libs3/bs4/tests/test_builder_registry.py | 147 --
 libs3/bs4/tests/test_docs.py | 36 -
 libs3/bs4/tests/test_html5lib.py | 170 --
 libs3/bs4/tests/test_htmlparser.py | 47 -
 libs3/bs4/tests/test_lxml.py | 100 -
 libs3/bs4/tests/test_soup.py | 567 -----
 libs3/bs4/tests/test_tree.py | 2205 -----------------
 libs3/engineio/__init__.py | 25 -
 libs3/engineio/async_drivers/__init__.py | 0
 libs3/engineio/async_drivers/aiohttp.py | 128 -
 libs3/engineio/async_drivers/asgi.py | 214 --
 libs3/engineio/async_drivers/eventlet.py | 30 -
 libs3/engineio/async_drivers/gevent.py | 63 -
 libs3/engineio/async_drivers/gevent_uwsgi.py | 156 --
 libs3/engineio/async_drivers/sanic.py | 144 --
 libs3/engineio/async_drivers/threading.py | 17 -
 libs3/engineio/async_drivers/tornado.py | 184 --
 libs3/engineio/asyncio_client.py | 585 -----
 libs3/engineio/asyncio_server.py | 472 ----
 libs3/engineio/asyncio_socket.py | 236 --
 libs3/engineio/client.py | 680 ------
 libs3/engineio/exceptions.py | 22 -
 libs3/engineio/middleware.py | 87 -
 libs3/engineio/packet.py | 92 -
 libs3/engineio/payload.py | 81 -
 libs3/engineio/server.py | 675 ------
 libs3/engineio/socket.py | 248 --
 libs3/engineio/static_files.py | 55 -
 libs3/flask_socketio/__init__.py | 922 --------
 libs3/flask_socketio/namespace.py | 47 -
 libs3/flask_socketio/test_client.py | 205 --
 libs3/socketio/__init__.py | 38 -
 libs3/socketio/asgi.py | 36 -
 libs3/socketio/asyncio_aiopika_manager.py | 105 -
 libs3/socketio/asyncio_client.py | 475 ----
 libs3/socketio/asyncio_manager.py | 58 -
 libs3/socketio/asyncio_namespace.py | 204 --
 libs3/socketio/asyncio_pubsub_manager.py | 163 --
 libs3/socketio/asyncio_redis_manager.py | 107 -
 libs3/socketio/asyncio_server.py | 526 ----
 libs3/socketio/base_manager.py | 178 --
 libs3/socketio/client.py | 620 -----
 libs3/socketio/exceptions.py | 30 -
 libs3/socketio/kafka_manager.py | 63 -
 libs3/socketio/kombu_manager.py | 122 -
 libs3/socketio/middleware.py | 42 -
 libs3/socketio/namespace.py | 191 --
 libs3/socketio/packet.py | 179 --
 libs3/socketio/pubsub_manager.py | 154 --
 libs3/socketio/redis_manager.py | 115 -
 libs3/socketio/server.py | 730 ------
 libs3/socketio/tornado.py | 11 -
 libs3/socketio/zmq_manager.py | 111 -
 libs3/yaml/__init__.py | 402 ----
 libs3/yaml/composer.py | 139 --
 libs3/yaml/constructor.py | 720 ------
 libs3/yaml/cyaml.py | 101 -
 libs3/yaml/dumper.py | 62 -
 libs3/yaml/emitter.py | 1137 ---------
 libs3/yaml/error.py | 75 -
 libs3/yaml/events.py | 86 -
 libs3/yaml/loader.py | 63 -
 libs3/yaml/nodes.py | 49 -
 libs3/yaml/parser.py | 589 -----
 libs3/yaml/reader.py | 185 --
 libs3/yaml/representer.py | 389 ---
 libs3/yaml/resolver.py | 227 --
 libs3/yaml/scanner.py | 1435 -----------
 libs3/yaml/serializer.py | 111 -
 libs3/yaml/tokens.py | 104 -
 views/episodes.html | 2 +-
 139 files changed, 12 insertions(+), 47314 deletions(-)
 delete mode 100644 libs2/bs4/AUTHORS.txt
 delete mode 100644 libs2/bs4/COPYING.txt
 delete mode 100644 libs2/bs4/NEWS.txt
 delete mode 100644 libs2/bs4/README.txt
 delete mode 100644 libs2/bs4/TODO.txt
 delete mode 100644 libs2/bs4/__init__.py
 delete mode 100644 libs2/bs4/builder/__init__.py
 delete mode 100644 libs2/bs4/builder/_html5lib.py
 delete mode 100644 libs2/bs4/builder/_htmlparser.py
 delete mode 100644 libs2/bs4/builder/_lxml.py
 delete mode 100644 libs2/bs4/dammit.py
 delete mode 100644 libs2/bs4/diagnose.py
 delete mode 100644 libs2/bs4/element.py
 delete mode 100644 libs2/bs4/formatter.py
 delete mode 100644 libs2/bs4/testing.py
 delete mode 100644 libs2/bs4/tests/__init__.py
 delete mode 100644 libs2/bs4/tests/test_builder_registry.py
 delete mode 100644 libs2/bs4/tests/test_docs.py
 delete mode 100644 libs2/bs4/tests/test_html5lib.py
 delete mode 100644 libs2/bs4/tests/test_htmlparser.py
 delete mode 100644 libs2/bs4/tests/test_lxml.py
 delete mode 100644 libs2/bs4/tests/test_soup.py
 delete mode 100644 libs2/bs4/tests/test_tree.py
 delete mode 100644 libs2/concurrent/__init__.py
 delete mode 100644 libs2/concurrent/futures/__init__.py
 delete mode 100644 libs2/concurrent/futures/_base.py
 delete mode 100644 libs2/concurrent/futures/process.py
 delete mode 100644 libs2/concurrent/futures/thread.py
 delete mode 100644 libs2/enum/LICENSE
 delete mode 100644 libs2/enum/README
 delete mode 100644 libs2/enum/__init__.py
 delete mode 100644 libs2/enum/doc/enum.pdf
 delete mode 100644 libs2/enum/doc/enum.rst
 delete mode 100644 libs2/enum/test.py
 delete mode 100644 libs2/yaml/__init__.py
 delete mode 100644 libs2/yaml/composer.py
 delete mode 100644 libs2/yaml/constructor.py
 delete mode 100644 libs2/yaml/cyaml.py
 delete mode 100644 libs2/yaml/dumper.py
 delete mode 100644 libs2/yaml/emitter.py
 delete mode 100644 libs2/yaml/error.py
 delete mode 100644 libs2/yaml/events.py
 delete mode 100644 libs2/yaml/loader.py
 delete mode 100644 libs2/yaml/nodes.py
 delete mode 100644 libs2/yaml/parser.py
 delete mode 100644 libs2/yaml/reader.py
 delete mode 100644 libs2/yaml/representer.py
 delete mode 100644 libs2/yaml/resolver.py
 delete mode 100644 libs2/yaml/scanner.py
 delete mode 100644 libs2/yaml/serializer.py
 delete mode 100644 libs2/yaml/tokens.py
 delete mode 100644 libs3/bs4/__init__.py
 delete mode 100644 libs3/bs4/builder/__init__.py
 delete mode 100644 libs3/bs4/builder/_html5lib.py
 delete mode 100644 libs3/bs4/builder/_htmlparser.py
 delete mode 100644 libs3/bs4/builder/_lxml.py
 delete mode 100644 libs3/bs4/dammit.py
 delete mode 100644 libs3/bs4/diagnose.py
 delete mode 100644 libs3/bs4/element.py
 delete mode 100644 libs3/bs4/formatter.py
 delete mode 100644 libs3/bs4/testing.py
 delete mode 100644 libs3/bs4/tests/__init__.py
 delete mode 100644 libs3/bs4/tests/test_builder_registry.py
 delete mode 100644 libs3/bs4/tests/test_docs.py
 delete mode 100644 libs3/bs4/tests/test_html5lib.py
 delete mode 100644 libs3/bs4/tests/test_htmlparser.py
 delete mode 100644 libs3/bs4/tests/test_lxml.py
 delete mode 100644 libs3/bs4/tests/test_soup.py
 delete mode 100644 libs3/bs4/tests/test_tree.py
 delete mode 100644 libs3/engineio/__init__.py
 delete mode 100644 libs3/engineio/async_drivers/__init__.py
 delete mode 100644 libs3/engineio/async_drivers/aiohttp.py
 delete mode 100644 libs3/engineio/async_drivers/asgi.py
 delete mode 100644 libs3/engineio/async_drivers/eventlet.py
 delete mode 100644 libs3/engineio/async_drivers/gevent.py
 delete mode 100644 libs3/engineio/async_drivers/gevent_uwsgi.py
 delete mode 100644 libs3/engineio/async_drivers/sanic.py
 delete mode 100644 libs3/engineio/async_drivers/threading.py
 delete mode 100644 libs3/engineio/async_drivers/tornado.py
 delete mode 100644 libs3/engineio/asyncio_client.py
 delete mode 100644 libs3/engineio/asyncio_server.py
 delete mode 100644 libs3/engineio/asyncio_socket.py
 delete mode 100644 libs3/engineio/client.py
 delete mode 100644 libs3/engineio/exceptions.py
 delete mode 100644 libs3/engineio/middleware.py
 delete mode 100644 libs3/engineio/packet.py
 delete mode 100644 libs3/engineio/payload.py
 delete mode 100644 libs3/engineio/server.py
 delete mode 100644 libs3/engineio/socket.py
 delete mode 100644 libs3/engineio/static_files.py
 delete mode 100644 libs3/flask_socketio/__init__.py
 delete mode 100644 libs3/flask_socketio/namespace.py
 delete mode 100644 libs3/flask_socketio/test_client.py
 delete mode 100644 libs3/socketio/__init__.py
 delete mode 100644 libs3/socketio/asgi.py
 delete mode 100644 libs3/socketio/asyncio_aiopika_manager.py
 delete mode 100644 libs3/socketio/asyncio_client.py
 delete mode 100644 libs3/socketio/asyncio_manager.py
 delete mode 100644 libs3/socketio/asyncio_namespace.py
 delete mode 100644 libs3/socketio/asyncio_pubsub_manager.py
 delete mode 100644 libs3/socketio/asyncio_redis_manager.py
 delete mode 100644 libs3/socketio/asyncio_server.py
 delete mode 100644 libs3/socketio/base_manager.py
 delete mode 100644 libs3/socketio/client.py
 delete mode 100644 libs3/socketio/exceptions.py
 delete mode 100644 libs3/socketio/kafka_manager.py
 delete mode 100644 libs3/socketio/kombu_manager.py
 delete mode 100644 libs3/socketio/middleware.py
 delete mode 100644 libs3/socketio/namespace.py
 delete mode 100644 libs3/socketio/packet.py
 delete mode 100644 libs3/socketio/pubsub_manager.py
 delete mode 100644 libs3/socketio/redis_manager.py
 delete mode 100644 libs3/socketio/server.py
 delete mode 100644 libs3/socketio/tornado.py
 delete mode 100644 libs3/socketio/zmq_manager.py
 delete mode 100644 libs3/yaml/__init__.py
 delete mode 100644 libs3/yaml/composer.py
 delete mode 100644 libs3/yaml/constructor.py
 delete mode 100644 libs3/yaml/cyaml.py
 delete mode 100644 libs3/yaml/dumper.py
 delete mode 100644 libs3/yaml/emitter.py
 delete mode 100644 libs3/yaml/error.py
 delete mode 100644 libs3/yaml/events.py
 delete mode 100644 libs3/yaml/loader.py
 delete mode 100644 libs3/yaml/nodes.py
 delete mode 100644 libs3/yaml/parser.py
 delete mode 100644 libs3/yaml/reader.py
 delete mode 100644 libs3/yaml/representer.py
 delete mode 100644 libs3/yaml/resolver.py
 delete mode 100644 libs3/yaml/scanner.py
 delete mode 100644 libs3/yaml/serializer.py
 delete mode 100644 libs3/yaml/tokens.py

diff --git a/bazarr/config.py b/bazarr/config.py
index 6028bc56c..5eaacc92d 100644
--- a/bazarr/config.py
+++ b/bazarr/config.py
@@ -5,7 +5,6 @@ import os
 from simpleconfigparser import simpleconfigparser
 
 from get_args import args
-from six import PY3
 
 defaults = {
     'general': {
@@ -140,10 +139,7 @@ defaults = {
     }
 }
 
-if PY3:
-    settings = simpleconfigparser(defaults=defaults, interpolation=None)
-else:
-    settings = simpleconfigparser(defaults=defaults)
+settings = simpleconfigparser(defaults=defaults, interpolation=None)
 
 settings.read(os.path.join(args.config_dir, 'config', 'config.ini'))
 
 base_url = settings.general.base_url
diff --git a/bazarr/get_subtitle.py b/bazarr/get_subtitle.py
index 698afd853..ea5e31495 100644
--- a/bazarr/get_subtitle.py
+++ b/bazarr/get_subtitle.py
@@ -1126,28 +1126,17 @@ def postprocessing(command, path):
     try:
         encoding = getpreferredencoding()
         if os.name == 'nt':
-            if six.PY3:
-                codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
-                                            stderr=subprocess.PIPE, encoding=getpreferredencoding())
-            else:
-                codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
-                                            stderr=subprocess.PIPE)
+            codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
+                                        stderr=subprocess.PIPE, encoding=getpreferredencoding())
             # wait for the process to terminate
             out_codepage, err_codepage = codepage.communicate()
             encoding = out_codepage.split(':')[-1].strip()
 
-        if six.PY3:
-            process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE, encoding=encoding)
-        else:
-            process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
-                                       stderr=subprocess.PIPE)
+        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
+                                   stderr=subprocess.PIPE, encoding=encoding)
         # wait for the process to terminate
         out, err = process.communicate()
 
-        if six.PY2:
-            out = out.decode(encoding)
-
         out = out.replace('\n', ' ').replace('\r', ' ')
 
     except Exception as e:
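
A note on the pattern this hunk settles on: on Python 3, passing encoding= to subprocess.Popen makes communicate() return str directly, which is what lets both six branches and the manual decode() disappear. The sketch below shows that flow in isolation; run_postprocessing is a hypothetical name for illustration, not a function in this patch.

    import os
    import subprocess
    from locale import getpreferredencoding

    def run_postprocessing(command):
        # Hypothetical helper mirroring the flow of postprocessing() above.
        encoding = getpreferredencoding()
        if os.name == 'nt':
            # The Windows console code page (reported by chcp as, e.g.,
            # "Active code page: 850") can differ from the locale encoding.
            codepage = subprocess.Popen("chcp", shell=True, stdout=subprocess.PIPE,
                                        stderr=subprocess.PIPE,
                                        encoding=getpreferredencoding())
            out_codepage, _ = codepage.communicate()
            encoding = out_codepage.split(':')[-1].strip()
        # With encoding= set, communicate() returns str, not bytes.
        process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE,
                                   stderr=subprocess.PIPE, encoding=encoding)
        out, err = process.communicate()
        return out.replace('\n', ' ').replace('\r', ' '), err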
diff --git a/bazarr/libs.py b/bazarr/libs.py
index 078fbec3e..a3450e315 100644
--- a/bazarr/libs.py
+++ b/bazarr/libs.py
@@ -18,11 +18,6 @@ def clean_libs():
 
 def set_libs():
     sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs/'))
-    from six import PY3
-    if PY3:
-        sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs3/'))
-    else:
-        sys.path.insert(0, os.path.join(os.path.dirname(__file__), '../libs2/'))
 
 
 clean_libs()
diff --git a/bazarr/list_subtitles.py b/bazarr/list_subtitles.py
index 34641d1e0..f7cf3b0e6 100644
--- a/bazarr/list_subtitles.py
+++ b/bazarr/list_subtitles.py
@@ -379,17 +379,11 @@ def guess_external_subtitles(dest_folder, subtitles):
                 continue
             detected_language = None
 
-            if six.PY3:
-                with open(subtitle_path, 'r', errors='ignore') as f:
-                    text = f.read()
-            else:
-                with open(subtitle_path, 'r') as f:
-                    text = f.read()
+            with open(subtitle_path, 'r', errors='ignore') as f:
+                text = f.read()
 
             try:
                 encoding = UnicodeDammit(text)
-                if six.PY2:
-                    text = text.decode(encoding.original_encoding)
                 detected_language = langdetect.detect(text)
             except Exception as e:
                 logging.exception('BAZARR Error trying to detect language for this subtitles file: ' +
diff --git a/bazarr/logger.py b/bazarr/logger.py
index 1bf8a6476..49af82261 100644
--- a/bazarr/logger.py
+++ b/bazarr/logger.py
@@ -42,8 +42,7 @@ class NoExceptionFormatter(logging.Formatter):
 
 
 def configure_logging(debug=False):
-    if six.PY3:
-        warnings.simplefilter('ignore', category=ResourceWarning)
+    warnings.simplefilter('ignore', category=ResourceWarning)
 
     if not debug:
         log_level = "INFO"
diff --git a/bazarr/main.py b/bazarr/main.py
index f4ab85977..e850eaff4 100644
--- a/bazarr/main.py
+++ b/bazarr/main.py
@@ -1,6 +1,6 @@
 # coding=utf-8
 
-bazarr_version = '0.8.4.1'
+bazarr_version = '0.9'
 
 import os
 os.environ["SZ_USER_AGENT"] = "Bazarr/1"
@@ -41,7 +41,7 @@ from notifier import update_notifier
 from cherrypy.wsgiserver import CherryPyWSGIServer
 from io import BytesIO
-from six import text_type, PY2
+from six import text_type
 from datetime import timedelta
 from get_languages import load_language_in_db, language_from_alpha3, language_from_alpha2, alpha2_from_alpha3
 from flask import make_response, request, redirect, abort, render_template, Response, session, flash, url_for, \
@@ -1647,8 +1647,7 @@ def movie_history(no):
 
 # Mute DeprecationWarning
 warnings.simplefilter("ignore", DeprecationWarning)
-if six.PY3:
-    warnings.simplefilter("ignore", BrokenPipeError)
+warnings.simplefilter("ignore", BrokenPipeError)
 if args.dev:
     server = app.run(
         host=str(settings.general.ip), port=(int(args.port) if args.port else int(settings.general.port)))
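
The list_subtitles.py hunk above settles on a single read path for external subtitles: open the file as text with errors='ignore' and hand the result to langdetect (the UnicodeDammit(text) call is kept by the patch, though its Python-2 decode step is gone). A minimal sketch of that flow under those assumptions; guess_language is a hypothetical wrapper, not part of the patch.

    import logging

    import langdetect

    def guess_language(subtitle_path):
        # Undecodable bytes are dropped up front instead of branching
        # on six.PY3 the way the pre-patch code did.
        with open(subtitle_path, 'r', errors='ignore') as f:
            text = f.read()
        try:
            return langdetect.detect(text)
        except Exception:
            # langdetect raises on empty or unrecognizable input.
            logging.exception('Error detecting language for %s', subtitle_path)
            return None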
' - soup = self.soup(markup) - space1, space2 = soup.find_all(string=' ') - tbody1, tbody2 = soup.find_all('tbody') - assert space1.next_element is tbody1 - assert tbody2.next_element is space2 - - def test_reparented_markup_containing_children(self): - markup = '' - soup = self.soup(markup) - noscript = soup.noscript - self.assertEqual("target", noscript.next_element) - target = soup.find(string='target') - - # The 'aftermath' string was duplicated; we want the second one. - final_aftermath = soup.find_all(string='aftermath')[-1] - - # The