From ea36aedb5fed635cb8d6417665de5eab5721710f Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 25 Sep 2023 00:59:16 -0700 Subject: [PATCH] Overhaul version checkers to prefer Version to str --- src/ocrmypdf/_exec/ghostscript.py | 7 ++--- src/ocrmypdf/_exec/jbig2enc.py | 6 ++-- src/ocrmypdf/_exec/pngquant.py | 5 ++-- src/ocrmypdf/_exec/tesseract.py | 6 ++-- src/ocrmypdf/_exec/unpaper.py | 5 ++-- src/ocrmypdf/builtin_plugins/tesseract_ocr.py | 2 +- src/ocrmypdf/subprocess/__init__.py | 19 +++++------- tests/test_main.py | 2 +- tests/test_metadata.py | 7 ++++- tests/test_tesseract.py | 4 ++- tests/test_unpaper.py | 3 +- tests/test_validation.py | 30 ++++++++++++------- 12 files changed, 57 insertions(+), 39 deletions(-) diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index 75a92884..e28d23e9 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -70,8 +70,8 @@ log.addFilter(DuplicateFilter(log)) GS = 'gswin64c' if os.name == 'nt' else 'gs' -def version(): - return get_version(GS) +def version() -> Version: + return Version(get_version(GS)) def _gs_error_reported(stream) -> bool: @@ -216,8 +216,7 @@ def generate_pdfa( "-dAutoFilterGrayImages=true", ] - strategy = 'LeaveColorUnchanged' - gs_version = Version(version()) + gs_version = version() if gs_version == Version('9.56.0'): # 9.56.0 breaks our OCR, should be fixed in 9.56.1 # https://bugs.ghostscript.com/show_bug.cgi?id=705187 diff --git a/src/ocrmypdf/_exec/jbig2enc.py b/src/ocrmypdf/_exec/jbig2enc.py index ae75ead3..28d3dd1c 100644 --- a/src/ocrmypdf/_exec/jbig2enc.py +++ b/src/ocrmypdf/_exec/jbig2enc.py @@ -7,12 +7,14 @@ from __future__ import annotations from subprocess import PIPE +from packaging.version import Version + from ocrmypdf.exceptions import MissingDependencyError from ocrmypdf.subprocess import get_version, run -def version(): - return get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*') +def version() -> Version: + return Version(get_version('jbig2', regex=r'jbig2enc (\d+(\.\d+)*).*')) def available(): diff --git a/src/ocrmypdf/_exec/pngquant.py b/src/ocrmypdf/_exec/pngquant.py index f399b2f1..8425caec 100644 --- a/src/ocrmypdf/_exec/pngquant.py +++ b/src/ocrmypdf/_exec/pngquant.py @@ -10,14 +10,15 @@ from io import BytesIO from pathlib import Path from subprocess import PIPE +from packaging.version import Version from PIL import Image from ocrmypdf.exceptions import MissingDependencyError from ocrmypdf.subprocess import get_version, run -def version(): - return get_version('pngquant', regex=r'(\d+(\.\d+)*).*') +def version() -> Version: + return Version(get_version('pngquant', regex=r'(\d+(\.\d+)*).*')) def available(): diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index 0863b13f..4eac3470 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -113,13 +113,13 @@ class TesseractVersion(Version): ) -def version() -> str: - return get_version('tesseract', regex=r'tesseract\s(.+)') +def version() -> Version: + return TesseractVersion(get_version('tesseract', regex=r'tesseract\s(.+)')) def has_thresholding() -> bool: """Does Tesseract have -c thresholding method capability?""" - return version() >= '5.0' + return version() >= Version('5.0') def get_languages() -> set[str]: diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index fb4ccd38..2944b4f4 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -15,6 +15,7 @@ from pathlib import Path from subprocess import PIPE, STDOUT from typing import Iterator, Union +from packaging.version import Version from PIL import Image from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError @@ -67,8 +68,8 @@ class UnpaperImageTooLargeError(Exception): super().__init__(self.message) -def version() -> str: - return get_version('unpaper') +def version() -> Version: + return Version(get_version('unpaper')) SUPPORTED_MODES = {'1', 'L', 'RGB'} diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index 81692831..a20da3df 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -212,7 +212,7 @@ class TesseractOcrEngine(OcrEngine): @staticmethod def version(): - return tesseract.version() + return str(tesseract.version()) @staticmethod def creator_tag(options): diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py index 3efaab39..c1918cf4 100644 --- a/src/ocrmypdf/subprocess/__init__.py +++ b/src/ocrmypdf/subprocess/__init__.py @@ -292,16 +292,12 @@ def _error_old_version( _error_trailer(**locals()) -def _remove_leading_v(s: str) -> str: - return s.removeprefix('v') - - def check_external_program( *, program: str, package: str, - version_checker: Callable[[], str], - need_version: str, + version_checker: Callable[[], Version], + need_version: str | Version, required_for: str | None = None, recommended: bool = False, version_parser: type[Version] = Version, @@ -321,6 +317,8 @@ def check_external_program( version_parser: A class that should be used to parse and compare version numbers. Used when version numbers do not follow standard conventions. """ + if not isinstance(need_version, Version): + need_version = version_parser(need_version) try: found_version = version_checker() except (CalledProcessError, FileNotFoundError) as e: @@ -334,11 +332,10 @@ def check_external_program( raise return - found_version = _remove_leading_v(found_version) - need_version = _remove_leading_v(need_version) - - if found_version and version_parser(found_version) < version_parser(need_version): - _error_old_version(program, package, need_version, found_version, required_for) + if found_version and found_version < need_version: + _error_old_version( + program, package, str(need_version), str(found_version), required_for + ) if not recommended: raise MissingDependencyError(program) diff --git a/tests/test_main.py b/tests/test_main.py index 8f310059..48f69050 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -178,7 +178,7 @@ def test_maximum_options(renderer, output_type, multipage, outpdf): @pytest.mark.skipif( - tesseract.TesseractVersion(tesseract.version()) >= tesseract.TesseractVersion('5'), + tesseract.version() >= tesseract.TesseractVersion('5'), reason="tess 5 tries harder to find its files", ) def test_tesseract_missing_tessdata(monkeypatch, resources, no_outpdf, tmpdir): diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 02b741c7..1ccc8e7a 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -10,12 +10,13 @@ from shutil import copyfile import pikepdf import pytest +from packaging.version import Version from pikepdf.models.metadata import decode_pdf_date +from ocrmypdf._exec import ghostscript from ocrmypdf._jobcontext import PdfContext from ocrmypdf._pipeline import convert_to_pdfa, metadata_fixup from ocrmypdf._plugin_manager import get_parser_options_plugins, get_plugin_manager -from ocrmypdf.cli import get_parser from ocrmypdf.exceptions import ExitCode from ocrmypdf.pdfa import file_claims_pdfa, generate_pdfa_ps from ocrmypdf.pdfinfo import PdfInfo @@ -393,6 +394,10 @@ def test_prevent_gs_invalid_xml(resources, outdir): assert contents.find(b'\x00', xmp_start, xmp_end) == -1 +@pytest.mark.skipif( + ghostscript.version() >= Version('10.2.0'), + reason="Ghostscript 10.2.0+ exit with an error on invalid DocumentInfo", +) def test_malformed_docinfo(caplog, resources, outdir): generate_pdfa_ps(outdir / 'pdfa.ps') diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index 73ef3c80..f5fe01d6 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -58,7 +58,9 @@ def test_content_preservation(resources, outpdf): assert len(page.images) > 1, "masks were rasterized" -@pytest.mark.skipif(tesseract.version() > '5', reason="doesn't fool Tess 5") +@pytest.mark.skipif( + tesseract.version() > tesseract.TesseractVersion('5'), reason="doesn't fool Tess 5" +) def test_no_languages(tmp_path, monkeypatch): (tmp_path / 'tessdata').mkdir() monkeypatch.setenv('TESSDATA_PREFIX', fspath(tmp_path)) diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index 7009698c..ef662ff1 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -8,6 +8,7 @@ from os import fspath from unittest.mock import patch import pytest +from packaging.version import Version from ocrmypdf._exec import unpaper from ocrmypdf._plugin_manager import get_parser_options_plugins @@ -40,7 +41,7 @@ def test_old_unpaper(resources, no_outpdf): _parser, options, pm = get_parser_options_plugins(["--clean", input_, output]) with patch("ocrmypdf._exec.unpaper.version") as mock: - mock.return_value = '0.5' + mock.return_value = Version('0.5') with pytest.raises(MissingDependencyError): check_options(options, pm) diff --git a/tests/test_validation.py b/tests/test_validation.py index 013f973b..f0e16541 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -49,7 +49,10 @@ def test_hocr_notlatin_warning(caplog): def test_old_tesseract_error(): - with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'): + with patch( + 'ocrmypdf._exec.tesseract.version', + return_value=TesseractVersion('4.00.00alpha'), + ): with pytest.raises(MissingDependencyError): vd.check_options(*make_opts_pm(pdf_renderer='sandwich', language='eng')) @@ -181,59 +184,66 @@ def test_language_warning(caplog): mock.assert_called_once() +def make_version(version): + def _make_version(): + return TesseractVersion(version) + + return _make_version + + def test_version_comparison(): vd.check_external_program( program="dummy_basic", package="dummy", - version_checker=lambda: '9.0', + version_checker=make_version('9.0'), need_version='8.0.2', ) vd.check_external_program( program="dummy_doubledigit", package="dummy", - version_checker=lambda: '10.0', + version_checker=make_version('10.0'), need_version='8.0.2', ) with pytest.raises(MissingDependencyError): vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: '4.0.0-beta.1', + version_checker=make_version('4.0.0-beta.1'), need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: 'v5.0.0-alpha.20200201', + version_checker=make_version('v5.0.0-alpha.20200201'), need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: '5.0.0-rc1.20211030', + version_checker=make_version('5.0.0-rc1.20211030'), need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: 'v4.1.1.20181030', # Used in some Windows builds + version_checker=make_version('v4.1.1.20181030'), # Used in some Windows builds need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="gs", package="ghostscript", - version_checker=lambda: '10.0', + version_checker=make_version('10.0'), need_version='9.50', ) with pytest.raises(MissingDependencyError): vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: '4.1.1-rc2-25-g9707', + version_checker=make_version('4.1.1-rc2-25-g9707'), need_version='4.1.1', version_parser=TesseractVersion, ) @@ -241,7 +251,7 @@ def test_version_comparison(): vd.check_external_program( program="dummy_fails", package="dummy", - version_checker=lambda: '1.0', + version_checker=make_version('1.0'), need_version='2.0', )