mirror of
https://github.com/sabnzbd/sabnzbd.git
synced 2026-01-03 04:59:50 -05:00
71 lines
3.1 KiB
Python
71 lines
3.1 KiB
Python
#!/usr/bin/python3 -OO
|
||
# Copyright 2007-2024 by The SABnzbd-Team (sabnzbd.org)
|
||
#
|
||
# This program is free software; you can redistribute it and/or
|
||
# modify it under the terms of the GNU General Public License
|
||
# as published by the Free Software Foundation; either version 2
|
||
# of the License, or (at your option) any later version.
|
||
#
|
||
# This program is distributed in the hope that it will be useful,
|
||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
# GNU General Public License for more details.
|
||
#
|
||
# You should have received a copy of the GNU General Public License
|
||
# along with this program; if not, write to the Free Software
|
||
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
|
||
|
||
"""
|
||
tests.test_encoding - Testing functions in encoding.py
|
||
"""
|
||
|
||
import sabnzbd.encoding as enc
|
||
|
||
|
||
class TestEncoding:
    """Checks for the string-repair helpers called through ``sabnzbd.encoding``."""

    @staticmethod
    def _bytes_as_chars(data):
        """Return a str with one character per byte of ``data`` (code point == byte value)."""
        return "".join(map(chr, data))

    def test_correct_unknown_encoding(self):
        """Windows-encoded names, passed as bytes or as str, come back as readable text."""
        # Input given as bytes in a Windows encoding
        repaired_from_bytes = enc.correct_unknown_encoding(b"fr\xe8nch_german_dem\xf6")
        assert "frènch_german_demö" == repaired_from_bytes

        # Input given as a str that still carries the Windows-encoded character (lone surrogate)
        repaired_from_str = enc.correct_unknown_encoding("demot\udcf6wers")
        assert "demotöwers" == repaired_from_str

    def test_correct_cherrypy_encoding(self):
        """UTF-8 bytes smeared over separate characters are merged back; invalid input is kept."""
        as_chars = self._bytes_as_chars

        # Hand-built inputs, paired with the text expected after correction
        manual_cases = (
            # plain text that is already correct
            ("aaazzz", "aaazzz"),
            # 2-byte UTF8: Ω (capital omega) is 0xCE 0xA9
            ("aaa" + as_chars(b"\xce\xa9") + "zzz", "aaaΩzzz"),
            # 3-byte UTF8: ∇ (nabla) is 0xE2 0x88 0x87
            ("aaa" + as_chars(b"\xe2\x88\x87") + "zzz", "aaa∇zzz"),
            # 4-byte UTF8: the "rocket" is 0xF0 0x9F 0x9A 0x80
            ("aaa" + as_chars(b"\xf0\x9f\x9a\x80") + "zzz", "aaa🚀zzz"),
        )
        for mangled, expected in manual_cases:
            assert enc.correct_cherrypy_encoding(mangled) == expected

        # Generated input: take a correct string and spread its UTF8 bytes over single chars
        intact = "aaa你好🚀🤔zzzαβγ"
        smeared = as_chars(intact.encode("utf-8"))
        assert smeared != intact
        assert enc.correct_cherrypy_encoding(smeared) == intact

        # Only the first two bytes of a 4-byte sequence: not valid UTF8, so nothing can be
        # repaired and the input has to come back unchanged
        truncated = "aaa" + as_chars(b"\xf0\x9f") + "zzz"
        assert enc.correct_cherrypy_encoding(truncated) == truncated
|