diff --git a/src/ocrmypdf/exec/__init__.py b/src/ocrmypdf/exec/__init__.py
index f235b6e8..145cc43b 100644
--- a/src/ocrmypdf/exec/__init__.py
+++ b/src/ocrmypdf/exec/__init__.py
@@ -25,6 +25,7 @@ import sys
 from collections.abc import Mapping
 from distutils.version import LooseVersion
 from functools import lru_cache
+from pathlib import Path
 from subprocess import PIPE, STDOUT, CalledProcessError
 from subprocess import run as subprocess_run
 
@@ -138,24 +139,39 @@ def shim_paths_with_program_files(env=None):
     program_files = env.get('PROGRAMFILES', '')
     if not program_files:
         return env.get('PATH', '')
-    paths = []
-    try:
-        for dirname in os.listdir(program_files):
-            if dirname.lower() == 'tesseract-ocr':
-                paths.append(os.path.join(program_files, dirname))
-            elif dirname.lower() == 'gs':
-                try:
-                    latest_gs = max(
-                        os.listdir(os.path.join(program_files, dirname)),
-                        key=lambda d: float(d[2:]),
-                    )
-                except (FileNotFoundError, NotADirectoryError):
-                    continue
-                paths.append(os.path.join(program_files, dirname, latest_gs, 'bin'))
-    except EnvironmentError:
-        pass
-    paths.extend(path for path in os.get_exec_path(env) if path not in set(paths))
-    return os.pathsep.join(paths)
+
+    tesseract_paths = []
+    gs_paths = []
+    try:
+        for path in Path(program_files).iterdir():
+            if not path.is_dir():
+                continue
+            if path.name.lower() == 'tesseract-ocr':
+                tesseract_paths.append(path)
+            elif path.name.lower() == 'gs':
+                gs_paths.extend(p for p in path.glob('**/bin') if p.is_dir())
+    except OSError:
+        # Best effort: PROGRAMFILES may be missing or unreadable, in which
+        # case only the regular PATH entries are returned.
+        pass
+
+    def gs_version(bin_dir):
+        # Order Ghostscript versions numerically so that 10.00 outranks 9.56;
+        # a plain string comparison would rank 9.x first.
+        name = bin_dir.parent.name
+        digits = ''.join(c if c.isdigit() else ' ' for c in name)
+        return tuple(int(n) for n in digits.split())
+
+    gs_paths.sort(key=gs_version, reverse=True)
+    paths = tesseract_paths + gs_paths
+    # Append the regular PATH entries, skipping any already listed.
+    seen = set(paths)
+    for str_path in os.get_exec_path(env):
+        path = Path(str_path)
+        if path not in seen:
+            seen.add(path)
+            paths.append(path)
+    return os.pathsep.join(str(p) for p in paths)
 
 
 missing_program = '''
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index f3c964d7..b7355903 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -17,6 +17,7 @@
 
 import logging
 import multiprocessing
+import os
 from pathlib import Path
 from unittest.mock import MagicMock
 
@@ -95,3 +96,35 @@ class TestFileIsWritable:
         pathmock.exists.return_value = True
         pathmock.is_file.side_effect = PermissionError
         assert not helpers.is_file_writable(pathmock)
+
+
+def test_shim_paths(tmp_path):
+    progfiles = tmp_path / 'Program Files'
+    progfiles.mkdir()
+    (progfiles / 'tesseract-ocr').mkdir()
+    (progfiles / 'gs' / '9.51' / 'bin').mkdir(parents=True)
+    (progfiles / 'gs' / '9.52' / 'bin').mkdir(parents=True)
+    (progfiles / 'gs' / '10.00' / 'bin').mkdir(parents=True)
+    syspath = tmp_path / 'bin'
+    env = {'PROGRAMFILES': str(progfiles), 'PATH': str(syspath)}
+    from ocrmypdf.exec import shim_paths_with_program_files
+
+    result_str = shim_paths_with_program_files(env=env)
+    results = result_str.split(os.pathsep)
+    # Compare whole paths so the check is independent of the path separator,
+    # and expect Ghostscript versions in numeric (not string) order.
+    assert results == [
+        str(progfiles / 'tesseract-ocr'),
+        str(progfiles / 'gs' / '10.00' / 'bin'),
+        str(progfiles / 'gs' / '9.52' / 'bin'),
+        str(progfiles / 'gs' / '9.51' / 'bin'),
+        str(syspath),
+    ]
+
+
+def test_shim_paths_missing_program_files(tmp_path):
+    syspath = tmp_path / 'bin'
+    env = {'PROGRAMFILES': str(tmp_path / 'missing'), 'PATH': str(syspath)}
+    from ocrmypdf.exec import shim_paths_with_program_files
+
+    assert shim_paths_with_program_files(env=env) == str(syspath)