WIP on release313

2025-12-24 08:08:37 -05:00 · 2024-10-16 10:37:53 +02:00
7 changed files with 37 additions and 23 deletions
--- a/requirements.txt
+++ b/requirements.txt
@@ -23,6 +23,7 @@ pytz==2024.2
 sgmllib3k==1.0.0
 portend==3.2.0
 chardet==5.2.0
+pyunormalize==16.0.0
 PySocks==1.7.1
 puremagic==1.28
 guessit==3.8.0
--- a/sabnzbd/deobfuscate_filenames.py
+++ b/sabnzbd/deobfuscate_filenames.py
@@ -33,7 +33,7 @@ import os
 import re

 import sabnzbd
-from sabnzbd.filesystem import get_unique_filename, renamer, get_ext, get_basename
+from sabnzbd.filesystem import get_unique_filename, renamer, get_ext, get_basename, listdir_normalized
 from sabnzbd.par2file import is_parfile, parse_par2_file
 import sabnzbd.utils.file_extension as file_extension
 from sabnzbd.misc import match_str
@@ -59,7 +59,7 @@ def decode_par2(parfile: str) -> List[str]:
    # Parse all files in the folder
    dirname = os.path.dirname(parfile)
    new_files = []  # list of new files generated
-    for fn in os.listdir(dirname):
+    for fn in listdir_normalized(dirname):
        filepath = os.path.join(dirname, fn)
        # Only check files
        if os.path.isfile(filepath):
--- a/sabnzbd/encoding.py
+++ b/sabnzbd/encoding.py
@@ -20,6 +20,7 @@ sabnzbd.encoding - Unicode/byte translation functions
 """

 import locale
+import pyunormalize
 import chardet
 from xml.sax.saxutils import escape
 from typing import AnyStr
@@ -27,6 +28,11 @@ from typing import AnyStr
 CODEPAGE = locale.getpreferredencoding()


+def normalize_utf8(inputstring: str) -> str:
+    """Return the Unicode NFC-normalized form of the input string"""
+    return pyunormalize.NFC(inputstring)
+
+
 def utob(str_in: AnyStr) -> bytes:
    """Shorthand for converting UTF-8 string to bytes"""
    if isinstance(str_in, bytes):
@@ -37,22 +43,19 @@ def utob(str_in: AnyStr) -> bytes:
 def ubtou(str_in: AnyStr) -> str:
    """Shorthand for converting unicode bytes to UTF-8 string"""
    if isinstance(str_in, str):
-        return str_in
-    return str_in.decode("utf-8")
+        return normalize_utf8(str_in)
+    return normalize_utf8(str_in.decode("utf-8"))


 def platform_btou(str_in: AnyStr) -> str:
-    """Return Unicode string, if not already Unicode, decode with locale encoding.
-    NOTE: Used for POpen because universal_newlines/text parameter doesn't
-    always work! We cannot use encoding-parameter because it's Python 3.7+
-    """
+    """Return an NFC-normalized string; bytes are decoded as UTF-8, falling back to the locale encoding"""
    if isinstance(str_in, bytes):
        try:
            return ubtou(str_in)
        except UnicodeDecodeError:
-            return str_in.decode(CODEPAGE, errors="replace").replace("?", "!")
+            return normalize_utf8(str_in.decode(CODEPAGE, errors="replace").replace("?", "!"))
    else:
-        return str_in
+        return normalize_utf8(str_in)


 def correct_unknown_encoding(str_or_bytes_in: AnyStr) -> str:
@@ -71,10 +74,10 @@ def correct_unknown_encoding(str_or_bytes_in: AnyStr) -> str:
    except UnicodeDecodeError:
        try:
            # Try using 8-bit ASCII, if came from Windows
-            return str_or_bytes_in.decode("ISO-8859-1")
+            return normalize_utf8(str_or_bytes_in.decode("ISO-8859-1"))
        except ValueError:
            # Last resort we use the slow chardet package
-            return str_or_bytes_in.decode(chardet.detect(str_or_bytes_in)["encoding"])
+            return normalize_utf8(str_or_bytes_in.decode(chardet.detect(str_or_bytes_in)["encoding"]))


 def correct_cherrypy_encoding(inputstring: str) -> str:
--- a/sabnzbd/filesystem.py
+++ b/sabnzbd/filesystem.py
@@ -46,7 +46,7 @@ except ImportError:
 import sabnzbd
 from sabnzbd.decorators import synchronized, cache_maintainer
 from sabnzbd.constants import FUTURE_Q_FOLDER, JOB_ADMIN, GIGI, DEF_FILE_MAX, IGNORED_FILES_AND_FOLDERS, DEF_LOG_FILE
-from sabnzbd.encoding import correct_unknown_encoding, utob, ubtou
+from sabnzbd.encoding import correct_unknown_encoding, utob, ubtou, normalize_utf8
 from sabnzbd.utils import rarfile


@@ -561,7 +561,7 @@ def globber(path: str, pattern: str = "*") -> List[str]:
    """Return matching base file/folder names in folder `path`"""
    # Cannot use glob.glob() because it doesn't support Windows long name notation
    if os.path.exists(path):
-        return [f for f in os.listdir(path) if safe_fnmatch(f, pattern)]
+        return [f for f in listdir_normalized(path) if safe_fnmatch(f, pattern)]
    return []


@@ -569,7 +569,8 @@ def globber_full(path: str, pattern: str = "*") -> List[str]:
    """Return matching full file/folder names in folder `path`"""
    # Cannot use glob.glob() because it doesn't support Windows long name notation
    if os.path.exists(path):
-        return [os.path.join(path, f) for f in os.listdir(path) if safe_fnmatch(f, pattern)]
+        path = normalize_utf8(path)
+        return [os.path.join(path, f) for f in listdir_normalized(path) if safe_fnmatch(f, pattern)]
    return []


@@ -581,7 +582,7 @@ def fix_unix_encoding(folder: str):
    if not sabnzbd.WIN32 and not sabnzbd.MACOS:
        for root, dirs, files in os.walk(folder):
            for name in files:
-                new_name = correct_unknown_encoding(name)
+                new_name = normalize_utf8(correct_unknown_encoding(name))
                if name != new_name:
                    try:
                        renamer(os.path.join(root, name), os.path.join(root, new_name))
@@ -804,6 +805,12 @@ def get_unique_filename(path: str) -> str:
    return path


+def listdir_normalized(input_dir: str) -> List[str]:
+    """Return the entry names of `input_dir`, each passed through normalize_utf8.
+    On macOS the OS returns un-normalized names, so normalize the same way on all platforms"""
+    return [normalize_utf8(path) for path in os.listdir(input_dir)]
+
+
@synchronized(DIR_LOCK)
 def listdir_full(input_dir: str, recursive: bool = True) -> List[str]:
    """List all files in dirs and sub-dirs"""
@@ -812,7 +819,7 @@ def listdir_full(input_dir: str, recursive: bool = True) -> List[str]:
        for file in files:
            # Ignore special folders and resources files created by macOS
            if not sabnzbd.misc.match_str(root, IGNORED_FILES_AND_FOLDERS) and not file.startswith("._"):
-                filelist.append(os.path.join(root, file))
+                filelist.append(normalize_utf8(os.path.join(root, file)))
        if not recursive:
            break
    return filelist
@@ -1386,7 +1393,7 @@ def pathbrowser(path: str, show_hidden: bool = False, show_files: bool = False)

    # List all files and folders
    file_list = []
-    for filename in os.listdir(path):
+    for filename in listdir_normalized(path):
        fpath = os.path.join(path, filename)
        isdir = os.path.isdir(fpath)

--- a/sabnzbd/newsunpack.py
+++ b/sabnzbd/newsunpack.py
@@ -63,6 +63,7 @@ from sabnzbd.filesystem import (
    SEVENMULTI_RE,
    is_size,
    get_basename,
+    listdir_normalized,
 )
 from sabnzbd.nzbstuff import NzbObject
 import sabnzbd.cfg as cfg
@@ -1020,7 +1021,7 @@ def par2_repair(nzo: NzbObject, setname: str) -> Tuple[bool, bool]:
        return False, True

    parfile = os.path.join(nzo.download_path, parfile_nzf.filename)
-    old_dir_content = os.listdir(nzo.download_path)
+    old_dir_content = listdir_normalized(nzo.download_path)
    used_joinables = ()
    joinables = ()
    used_for_repair = ()
@@ -1080,7 +1081,7 @@ def par2_repair(nzo: NzbObject, setname: str) -> Tuple[bool, bool]:
    try:
        if cfg.enable_par_cleanup():
            deletables = []
-            new_dir_content = os.listdir(nzo.download_path)
+            new_dir_content = listdir_normalized(nzo.download_path)

            # If Multipar or par2cmdline repairs a broken part of a joinable, it doesn't list it as such.
            # So we need to manually add all joinables of the set to the list of used joinables.
--- a/sabnzbd/postproc.py
+++ b/sabnzbd/postproc.py
@@ -73,6 +73,7 @@ from sabnzbd.filesystem import (
    get_filename,
    directory_is_writable,
    check_filesystem_capabilities,
+    listdir_normalized,
 )
 from sabnzbd.nzbstuff import NzbObject
 from sabnzbd.sorting import Sorter
@@ -961,7 +962,7 @@ def rar_renamer(nzo: NzbObject) -> int:
    volnrext = {}

    # Scan rar files in workdir, but not subdirs
-    workdir_files = os.listdir(nzo.download_path)
+    workdir_files = listdir_normalized(nzo.download_path)
    for file_to_check in workdir_files:
        file_to_check = os.path.join(nzo.download_path, file_to_check)

@@ -1185,7 +1186,7 @@ def one_file_or_folder(folder: str) -> str:
    """If the dir only contains one file or folder, join that file/folder onto the path"""
    if os.path.exists(folder) and os.path.isdir(folder):
        try:
-            cont = os.listdir(folder)
+            cont = listdir_normalized(folder)
            if len(cont) == 1:
                folder = os.path.join(folder, cont[0])
                folder = one_file_or_folder(folder)
--- a/sabnzbd/sorting.py
+++ b/sabnzbd/sorting.py
@@ -37,6 +37,7 @@ from sabnzbd.filesystem import (
    renamer,
    sanitize_foldername,
    clip_path,
+    listdir_normalized,
 )
 import sabnzbd.config as config
 import sabnzbd.cfg as cfg
@@ -616,7 +617,7 @@ def move_to_parent_directory(workdir: str) -> Tuple[str, bool]:
    logging.debug("Moving all files from %s to %s", workdir, dest)

    # Check for DVD folders and bail out if found
-    for item in os.listdir(workdir):
+    for item in listdir_normalized(workdir):
        if item.lower() in IGNORED_MOVIE_FOLDERS:
            return workdir, True