From 8a3b82e364aec0c55d52a697ac06745c25c871ce Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 14:46:01 -0700 Subject: [PATCH 01/20] Make Python 3.8 minimum requirement --- .github/workflows/build.yml | 2 -- .readthedocs.yaml | 2 +- README.md | 2 +- docs/api.rst | 8 -------- docs/installation.rst | 4 ++-- setup.cfg | 6 ++---- src/ocrmypdf/__main__.py | 2 -- src/ocrmypdf/subprocess/_windows.py | 5 ----- tests/test_stdio.py | 4 ---- 9 files changed, 6 insertions(+), 29 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index a895b0fc..0615005e 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -20,8 +20,6 @@ jobs: strategy: matrix: include: - - os: ubuntu-18.04 - python: "3.7" - os: ubuntu-20.04 python: "3.8" - os: ubuntu-20.04 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 38fa8dce..cc3e49f4 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -14,7 +14,7 @@ formats: # Optionally set the version of Python and requirements required to build your docs python: - version: "3.7" + version: "3.8" install: - method: pip path: . diff --git a/README.md b/README.md index 6d7a0a35..e90526c8 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl | macOS (Homebrew) | ``brew install ocrmypdf`` | | macOS (nix) | ``nix-env -i ocrmypdf`` | | LinuxBrew | ``brew install ocrmypdf`` | -| FreeBSD | ``pkg install py37-ocrmypdf`` | +| FreeBSD | ``pkg install py38-ocrmypdf`` | | Conda | ``conda install ocrmypdf`` | | Ubuntu Snap | ``snap install ocrmypdf`` | diff --git a/docs/api.rst b/docs/api.rst index 0478dd1c..1f185446 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -72,14 +72,6 @@ OCRmyPDF, use processes. not take at least one of these steps, process semantics will prevent OCRmyPDF from working correctly. -.. warning:: - - On macOS with Python 3.7, you must call - :func:`multiprocessing.set_start_method("spawn")`. Without this, multiprocessing - will be unstable. From the command line, OCRmyPDF does this automatically, - but as an API user you must do this. See Python bpo-33725 for details. - Python 3.8+ also resolve this automatically. - Logging ------- diff --git a/docs/installation.rst b/docs/installation.rst index 79bfb7c4..dd53dc88 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -417,7 +417,7 @@ Native Windows You must install the following for Windows: -* Python 3.7 (64-bit) or later +* Python 3.8 (64-bit) or later * Tesseract 4.0 or later * Ghostscript 9.50 or later @@ -649,7 +649,7 @@ unfortunately, the ``pip install`` command cannot satisfy all of them. Installing HEAD revision from sources ===================================== -If you have ``git`` and Python 3.7 or newer installed, you can install +If you have ``git`` and Python 3.8 or newer installed, you can install from source. When the ``pip`` installer runs, it will alert you if dependencies are missing. diff --git a/setup.cfg b/setup.cfg index 3dc89171..20207083 100644 --- a/setup.cfg +++ b/setup.cfg @@ -24,7 +24,6 @@ classifiers = Operating System :: POSIX :: Linux Programming Language :: Python :: 3 Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.7 Programming Language :: Python :: 3.8 Programming Language :: Python :: 3.9 Programming Language :: Python :: 3.10 @@ -54,10 +53,9 @@ install_requires = pluggy>=0.13.0 reportlab>=3.5.66 tqdm>=4 - importlib-metadata>=4;python_version<'3.8' # until Python 3.8 importlib-resources>=5;python_version<'3.9' # until Python 3.9 typing-extensions>=4;python_version<'3.10' -python_requires = >=3.7 +python_requires = >=3.8 include_package_data = True package_dir = =src @@ -100,7 +98,7 @@ ocrmypdf = py.typed [bdist_wheel] -python-tag = py37 +python-tag = py38 [aliases] test = pytest diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index 24d620c8..24a64ae9 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -71,6 +71,4 @@ def run(args=None): if __name__ == '__main__': - if sys.platform == 'darwin' and sys.version_info < (3, 8): - set_start_method('spawn') # see python bpo-33725 sys.exit(run()) diff --git a/src/ocrmypdf/subprocess/_windows.py b/src/ocrmypdf/subprocess/_windows.py index 3a6c2a39..46f68931 100644 --- a/src/ocrmypdf/subprocess/_windows.py +++ b/src/ocrmypdf/subprocess/_windows.py @@ -171,11 +171,6 @@ SHIMS = [ def fix_windows_args(program: str, args, env): """Adjust our desired program and command line arguments for use on Windows""" - if sys.version_info < (3, 8): - # bpo-33617 - Windows needs manual Path -> str conversion - args = [os.fspath(arg) for arg in args] - program = os.fspath(program) - # If we are running a .py on Windows, ensure we call it with this Python # (to support test suite shims) if program.lower().endswith('.py'): diff --git a/tests/test_stdio.py b/tests/test_stdio.py index 17536145..577ec607 100644 --- a/tests/test_stdio.py +++ b/tests/test_stdio.py @@ -51,10 +51,6 @@ def test_stdout(ocrmypdf_exec, resources, outpdf): assert check_pdf(output_file) -@pytest.mark.xfail( - os.name == 'nt' and sys.version_info < (3, 8), - reason="Windows does not like this; not sure how to fix", -) def test_dev_null(resources): if 'COV_CORE_DATAFILE' in os.environ: pytest.skip("Coverage uses stdout") From 9ffe829a107f3cabeafa793e0359aba4fdfe08ea Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 14:48:11 -0700 Subject: [PATCH 02/20] Remove external importlib_metadata since Python 3.8 provides it directly --- docs/conf.py | 6 ++---- pyproject.toml | 3 +-- src/ocrmypdf/_version.py | 5 +---- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/docs/conf.py b/docs/conf.py index 4b6bbbb9..ee636db6 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,6 +76,8 @@ author = 'James R. Barlow' # The short X.Y version. import os +from importlib.metadata import version as package_version + on_rtd = os.environ.get('READTHEDOCS') == 'True' @@ -96,10 +98,6 @@ if on_rtd: ] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) -try: - from importlib_metadata import version as package_version -except ModuleNotFoundError: - from importlib.metadata import version as package_version # The full version, including alpha/beta/rc tags. release = package_version('ocrmypdf') diff --git a/pyproject.toml b/pyproject.toml index 4d0aba2a..7e9ba8df 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -94,8 +94,7 @@ module = [ 'pdfminer.*', 'reportlab.*', 'fitz', - 'libxmp.utils', - 'importlib_metadata' + 'libxmp.utils' ] ignore_missing_imports = true diff --git a/src/ocrmypdf/_version.py b/src/ocrmypdf/_version.py index 11f8ed3f..4b368f3e 100644 --- a/src/ocrmypdf/_version.py +++ b/src/ocrmypdf/_version.py @@ -8,10 +8,7 @@ OCRmyPDF uses setuptools_scm to derive version from git tags. from __future__ import annotations -try: - from importlib.metadata import version as _package_version -except ImportError: - from importlib_metadata import version as _package_version # type: ignore +from importlib.metadata import version as _package_version PROGRAM_NAME = 'ocrmypdf' From d5a9861d5c4fd043bcfa6d0893b6973ab08b65d5 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 14:49:09 -0700 Subject: [PATCH 03/20] readme: freebsd calls it py-ocrmypdf now --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e90526c8..30fe0693 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl | macOS (Homebrew) | ``brew install ocrmypdf`` | | macOS (nix) | ``nix-env -i ocrmypdf`` | | LinuxBrew | ``brew install ocrmypdf`` | -| FreeBSD | ``pkg install py38-ocrmypdf`` | +| FreeBSD | ``pkg install py-ocrmypdf`` | | Conda | ``conda install ocrmypdf`` | | Ubuntu Snap | ``snap install ocrmypdf`` | From 67773da309d54a7a9b7821a77782c94fbbef8cce Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 15:01:10 -0700 Subject: [PATCH 04/20] Drop support for Ghostscript <9.50 --- docs/installation.rst | 4 +-- docs/introduction.rst | 3 +-- src/ocrmypdf/_exec/ghostscript.py | 30 ++------------------- src/ocrmypdf/builtin_plugins/ghostscript.py | 22 ++------------- tests/plugins/gs_feature_elision.py | 4 +-- tests/test_main.py | 8 +++--- tests/test_validation.py | 16 ----------- 7 files changed, 12 insertions(+), 75 deletions(-) diff --git a/docs/installation.rst b/docs/installation.rst index dd53dc88..cb006566 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -616,8 +616,8 @@ manager. ``pip`` cannot provide them. The following versions are required: -- Python 3.7 or newer -- Ghostscript 9.23 or newer +- Python 3.8 or newer +- Ghostscript 9.50 or newer - Tesseract 4.0.0 or newer - jbig2enc 0.29 or newer - pngquant 2.5 or newer diff --git a/docs/introduction.rst b/docs/introduction.rst index 7c11818b..563844bb 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -190,8 +190,7 @@ Ghostscript also imposes some limitations: behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format. - (Ghostscript 9.25+ can copy JPEG images without transcoding them; - earlier versions will transcode.) + (Modern Ghostscript can copy JPEG images without transcoding them.) - Ghostscript's PDF/A conversion removes any XMP metadata that is not one of the standard XMP metadata namespaces for PDFs. In particular, PRISM Metdata is removed. diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index 9e21c33c..d67044e7 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -47,21 +47,6 @@ def version(): return get_version(GS) -def jpeg_passthrough_available() -> bool: - """Returns True if the installed version of Ghostscript supports JPEG passthru - - Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23 - it gained the ability to keep JPEGs unmodified. However, the 9.23 - implementation was buggy and would deletes the last two bytes of images in - some cases, as reported here. - https://bugs.ghostscript.com/show_bug.cgi?id=699216 - - The issue was fixed for 9.24, hence that is the first version we consider - the feature available. (Ghostscript 9.24 has its own problems is blacklisted.) - """ - return version() >= '9.24' - - def _gs_error_reported(stream) -> bool: match = re.search(r'error', stream, flags=re.IGNORECASE) return bool(match) @@ -201,20 +186,9 @@ def generate_pdfa( ] strategy = 'LeaveColorUnchanged' - # Older versions of Ghostscript expect a leading slash in - # sColorConversionStrategy, newer ones should not have it. See Ghostscript - # git commit fe1c025d. gs_version = version() - strategy = ('/' + strategy) if gs_version < '9.19' else strategy - - if gs_version == '9.23': - # 9.23: added JPEG passthrough as a new feature, but with a bug that - # incorrectly formats some images. Fixed as of 9.24. So we disable this - # feature for 9.23. - # https://bugs.ghostscript.com/show_bug.cgi?id=699216 - compression_args.append('-dPassThroughJPEGImages=false') - elif gs_version == '9.56.0': - # 9.56.0 breaks our OCR...? + if gs_version == '9.56.0': + # 9.56.0 introduced a new rendering mode that breaks our OCR compression_args.append('-dNEWPDF=false') # nb no need to specify ProcessColorModel when ColorConversionStrategy diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py index 64375ad8..7ab8d7c9 100644 --- a/src/ocrmypdf/builtin_plugins/ghostscript.py +++ b/src/ocrmypdf/builtin_plugins/ghostscript.py @@ -21,37 +21,19 @@ def check_options(options): program='gs', package='ghostscript', version_checker=ghostscript.version, - need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports + need_version='9.50', # Ubuntu 20.04's version ) gs_version = ghostscript.version() - if gs_version in ('9.24', '9.51'): + if gs_version in ('9.51',): raise MissingDependencyError( f"Ghostscript {gs_version} contains serious regressions and is not " "supported. Please upgrade to a newer version, or downgrade to the " "previous version." ) - # We have these constraints to check for. - # 1. Ghostscript < 9.20 mangles multibyte Unicode - # 2. hocr doesn't work on non-Latin languages (so don't select it) - is_latin = options.languages.issubset(HOCR_OK_LANGS) - if gs_version < '9.20' and options.output_type != 'pdf' and not is_latin: - # https://bugs.ghostscript.com/show_bug.cgi?id=696874 - # Ghostscript < 9.20 fails to encode multibyte characters properly - log.warning( - f"The installed version of Ghostscript ({gs_version}) does not work " - "correctly with the OCR languages you specified. Use --output-type pdf or " - "upgrade to Ghostscript 9.20 or later to avoid this issue." - ) - if options.output_type == 'pdfa': options.output_type = 'pdfa-2' - if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19': - raise MissingDependencyError( - "--output-type pdfa-3 requires Ghostscript 9.19 or later" - ) - @hookimpl def rasterize_pdf_page( diff --git a/tests/plugins/gs_feature_elision.py b/tests/plugins/gs_feature_elision.py index 0f63c502..ce829f5f 100644 --- a/tests/plugins/gs_feature_elision.py +++ b/tests/plugins/gs_feature_elision.py @@ -9,13 +9,13 @@ from ocrmypdf import hookimpl from ocrmypdf.builtin_plugins import ghostscript from ocrmypdf.subprocess import run_polling_stderr -elision_warning = """GPL Ghostscript 9.20: Setting Overprint Mode to 1 +ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1 not permitted in PDF/A-2, overprint mode not set""" def run_append_stderr(*args, **kwargs): proc = run_polling_stderr(*args, **kwargs) - proc.stderr += '\n' + elision_warning + '\n' + proc.stderr += '\n' + ELISION_WARNING + '\n' return proc diff --git a/tests/test_main.py b/tests/test_main.py index 739df7b7..c975665b 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -716,11 +716,9 @@ def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpd if compression == "jpeg": assert pdfimage.enc == Encoding.jpeg else: - if ghostscript.jpeg_passthrough_available(): - # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be - # copied without transcoding - so report - if image.endswith('jpg'): - assert pdfimage.enc == Encoding.jpeg + if image.endswith('jpg'): + # Ghostscript JPEG passthrough - no issue + assert pdfimage.enc == Encoding.jpeg else: assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000) diff --git a/tests/test_validation.py b/tests/test_validation.py index fca0cc32..ee29f312 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -48,22 +48,6 @@ def test_hocr_notlatin_warning(caplog): assert 'PDF renderer is known to cause' in caplog.text -def test_old_ghostscript(caplog): - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'), patch( - 'ocrmypdf._exec.tesseract.get_languages', return_value={'eng', 'chi_sim'} - ): - vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa')) - assert 'does not work correctly' in caplog.text - - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'): - with pytest.raises(MissingDependencyError): - vd.check_options(*make_opts_pm(output_type='pdfa-3')) - - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'): - with pytest.raises(MissingDependencyError): - vd.check_options(*make_opts_pm()) - - def test_old_tesseract_error(): with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'): with pytest.raises(MissingDependencyError): From 8a8c06c79c52105414fe4c4f2258e976140df8bc Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 15:05:43 -0700 Subject: [PATCH 05/20] Update pre-commit for py3.8+ --- .pre-commit-config.yaml | 2 +- src/ocrmypdf/_exec/tesseract.py | 2 +- src/ocrmypdf/_validation.py | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 8fb0a293..b222746b 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: rev: v2.37.2 hooks: - id: pyupgrade - args: ["--py37-plus"] + args: ["--py38-plus"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.971 hooks: diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index ad98836a..78ec318a 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -244,7 +244,7 @@ def get_deskew( def tesseract_log_output(stream: bytes) -> None: tlog = TesseractLoggerAdapter( - log, extra=log.extra if hasattr(log, 'extra') else None + log, extra=log.extra if hasattr(log, 'extra') else None # type: ignore ) if not stream: diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index 0533b928..977ea14e 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -134,7 +134,7 @@ def check_options_preprocessing(options: Namespace) -> None: package='unpaper', version_checker=unpaper.version, need_version='6.1', - required_for=['--clean, --clean-final'], + required_for="--clean, --clean-final", # Problem arguments ) try: if options.unpaper_args: @@ -221,7 +221,7 @@ def check_options_metadata(options: Namespace) -> None: def check_options_pillow(options: Namespace) -> None: PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000) if PIL.Image.MAX_IMAGE_PIXELS == 0: - PIL.Image.MAX_IMAGE_PIXELS = None + PIL.Image.MAX_IMAGE_PIXELS = None # type: ignore def _check_plugin_invariant_options(options: Namespace) -> None: From 80b7cf63307830e4dcafe7a6e71678dc8b0a8a5b Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 15:07:43 -0700 Subject: [PATCH 06/20] Update black target versions --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 7e9ba8df..f6e07f29 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ build-backend = "setuptools.build_meta" [tool.black] line-length = 88 -target-version = ["py37", "py38"] +target-version = ["py38", "py39", "py310", "py311"] skip-string-normalization = true include = '\.pyi?$' exclude = ''' From acc70036cc9ddca7986c520dc627b204ccc5b1f1 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 15:20:29 -0700 Subject: [PATCH 07/20] Set minimum Tesseract to 4.1.1 --- README.md | 5 +-- docs/cookbook.rst | 2 +- docs/installation.rst | 6 +-- src/ocrmypdf/_exec/tesseract.py | 11 +---- src/ocrmypdf/builtin_plugins/tesseract_ocr.py | 9 +--- tests/plugins/tesseract_cache.py | 2 +- tests/plugins/tesseract_debug_rotate.py | 4 +- tests/plugins/tesseract_noop.py | 4 +- tests/test_main.py | 1 - tests/test_validation.py | 41 ++++++------------- 10 files changed, 26 insertions(+), 59 deletions(-) diff --git a/README.md b/README.md index 30fe0693..4a2b66b1 100644 --- a/README.md +++ b/README.md @@ -92,10 +92,7 @@ brew install tesseract-lang You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested. -OCRmyPDF supports Tesseract 4.0 and the beta versions of Tesseract 5.0. It will -automatically use whichever version it finds first on the `PATH` environment -variable. On Windows, if `PATH` does not provide a Tesseract binary, we use -the highest version number that is installed according to the Windows Registry. +OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry. ## Documentation and support diff --git a/docs/cookbook.rst b/docs/cookbook.rst index 1360d17e..581c6474 100644 --- a/docs/cookbook.rst +++ b/docs/cookbook.rst @@ -283,7 +283,7 @@ argument. (Normally, OCRmyPDF will exit with an error if asked to modify a file with OCR.) This may be helpful for users who want to take advantage of accuracy -improvements in Tesseract 4.0 for files they previously OCRed with an +improvements in Tesseract for files they previously OCRed with an earlier version of Tesseract and OCRmyPDF. .. code-block:: bash diff --git a/docs/installation.rst b/docs/installation.rst index cb006566..a352ac6b 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -418,7 +418,7 @@ Native Windows You must install the following for Windows: * Python 3.8 (64-bit) or later -* Tesseract 4.0 or later +* Tesseract 4.1.1 or later * Ghostscript 9.50 or later Using the `Chocolatey `_ package manager, install the @@ -481,7 +481,7 @@ Cygwin64 First install the the following prerequisite Cygwin packages using ``setup-x86_64.exe``:: - python37 (or later) + python38 (or later) python3?-devel python3?-pip python3?-lxml @@ -618,7 +618,7 @@ The following versions are required: - Python 3.8 or newer - Ghostscript 9.50 or newer -- Tesseract 4.0.0 or newer +- Tesseract 4.1.1 or newer - jbig2enc 0.29 or newer - pngquant 2.5 or newer - unpaper 6.1 diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index 78ec318a..adb1ec17 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -33,7 +33,7 @@ HOCR_TEMPLATE = """ - + @@ -114,15 +114,6 @@ def version() -> str: return get_version('tesseract', regex=r'tesseract\s(.+)') -def has_user_words() -> bool: - """Does Tesseract have --user-words capability? - - Not available in 4.0, but available in 4.1. Also available in 3.x, but - we no longer support 3.x. - """ - return version() >= '4.1' - - def has_thresholding() -> bool: """Does Tesseract have -c thresholding method capability?""" return version() >= '5.0' diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index d9e24e4e..8372577c 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -43,7 +43,7 @@ def add_options(parser): metavar='MODE', choices=range(0, 4), help=( - "Set Tesseract 4.0+ OCR engine mode: " + "Set Tesseract 4+ OCR engine mode: " "0 - original Tesseract only; " "1 - neural nets LSTM only; " "2 - Tesseract + LSTM; " @@ -93,7 +93,7 @@ def check_options(options): program='tesseract', package={'linux': 'tesseract-ocr'}, version_checker=tesseract.version, - need_version='4.0.0-beta.1', # using backport for Travis CI + need_version='4.1.1', # Ubuntu 20.04 version version_parser=tesseract.TesseractVersion, ) @@ -101,11 +101,6 @@ def check_options(options): if options.pdf_renderer == 'auto': options.pdf_renderer = 'sandwich' - if not tesseract.has_user_words() and (options.user_words or options.user_patterns): - log.warning( - "Tesseract 4.0 (which you have installed) ignores --user-words and " - "--user-patterns, so these arguments have no effect." - ) if not tesseract.has_thresholding() and options.tesseract_thresholding != 0: log.warning( "The installed version of Tesseract does not support changes to its " diff --git a/tests/plugins/tesseract_cache.py b/tests/plugins/tesseract_cache.py index b32d2498..bc2750f9 100644 --- a/tests/plugins/tesseract_cache.py +++ b/tests/plugins/tesseract_cache.py @@ -21,7 +21,7 @@ were produced. Certain operations are not cached and routed to Tesseract OCR directly. -Assumes Tesseract 4.0.0-alpha or higher. +Assumes Tesseract 4+. """ diff --git a/tests/plugins/tesseract_debug_rotate.py b/tests/plugins/tesseract_debug_rotate.py index f278cb3f..1e10cfb7 100644 --- a/tests/plugins/tesseract_debug_rotate.py +++ b/tests/plugins/tesseract_debug_rotate.py @@ -27,7 +27,7 @@ HOCR_TEMPLATE = ''' - + @@ -46,7 +46,7 @@ HOCR_TEMPLATE = ''' class FixedRotateNoopOcrEngine(OcrEngine): @staticmethod def version(): - return '4.0.0' + return '4.1.1' @staticmethod def creator_tag(options): diff --git a/tests/plugins/tesseract_noop.py b/tests/plugins/tesseract_noop.py index 4dd18dfd..68a0bfe6 100644 --- a/tests/plugins/tesseract_noop.py +++ b/tests/plugins/tesseract_noop.py @@ -25,7 +25,7 @@ HOCR_TEMPLATE = ''' - + @@ -44,7 +44,7 @@ HOCR_TEMPLATE = ''' class NoopOcrEngine(OcrEngine): @staticmethod def version(): - return '4.0.0' + return '4.1.1' @staticmethod def creator_tag(options): diff --git a/tests/test_main.py b/tests/test_main.py index c975665b..d4e3135e 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -539,7 +539,6 @@ def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outp assert p.returncode == ExitCode.invalid_config -@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0') def test_user_words_ocr(resources, outdir): # Does not actually test if --user-words causes output to differ word_list = outdir / 'wordlist.txt' diff --git a/tests/test_validation.py b/tests/test_validation.py index ee29f312..192925f8 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -87,22 +87,6 @@ def test_optimizing(caplog): assert 'will be ignored because' in caplog.text -def test_user_words(caplog): - with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False): - vd.check_options(*make_opts_pm(user_words='foo')) - assert ( - 'Tesseract 4.0 (which you have installed) ignores --user-words' - in caplog.text - ) - caplog.clear() - with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True): - vd.check_options(*make_opts_pm(user_patterns='foo')) - assert ( - 'Tesseract 4.0 (which you have installed) ignores --user-words' - not in caplog.text - ) - - def test_pillow_options(): vd.check_options_pillow(make_opts(max_image_mpixels=0)) @@ -213,37 +197,38 @@ def test_version_comparison(): program="tesseract", package="tesseract", version_checker=lambda: '4.0.0-beta.1', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", version_checker=lambda: 'v5.0.0-alpha.20200201', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", version_checker=lambda: '5.0.0-rc1.20211030', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: 'v4.0.0.20181030', # Some Windows builds use this format - need_version='4.0.0', - version_parser=TesseractVersion, - ) - vd.check_external_program( - program="tesseract", - package="tesseract", - version_checker=lambda: '4.1.1-rc2-25-g9707', - need_version='4.0.0', + version_checker=lambda: 'v4.1.1.20181030', # Some Windows builds use this format + need_version='4.1.1', version_parser=TesseractVersion, ) + with pytest.raises(MissingDependencyError): + vd.check_external_program( + program="tesseract", + package="tesseract", + version_checker=lambda: '4.1.1-rc2-25-g9707', + need_version='4.1.1', + version_parser=TesseractVersion, + ) with pytest.raises(MissingDependencyError): vd.check_external_program( program="dummy_fails", From d619fac0bd94915426930b7ba1e207f168da2349 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 2 Aug 2022 15:30:20 -0700 Subject: [PATCH 08/20] unpaper: tidy file --- src/ocrmypdf/_exec/unpaper.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index d7f24265..dd3a5116 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -1,12 +1,10 @@ # SPDX-FileCopyrightText: 2022 James R. Barlow # SPDX-License-Identifier: MPL-2.0 -from __future__ import annotations - -# unpaper documentation: -# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md """Interface to unpaper executable""" +from __future__ import annotations + import logging import os import shlex @@ -22,6 +20,10 @@ from PIL import Image from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError from ocrmypdf.subprocess import get_version, run +# unpaper documentation: +# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md + + if sys.version_info >= (3, 10): from tempfile import TemporaryDirectory else: From 53db866ef96f98e442421f65e1662bb244047240 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 03:54:55 -0700 Subject: [PATCH 09/20] Remove deprecated exception PdfMergeFailedError --- src/ocrmypdf/__init__.py | 1 - src/ocrmypdf/api.py | 2 -- src/ocrmypdf/exceptions.py | 20 -------------------- 3 files changed, 23 deletions(-) diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py index 7f4a4cd2..25c3039d 100644 --- a/src/ocrmypdf/__init__.py +++ b/src/ocrmypdf/__init__.py @@ -21,7 +21,6 @@ from ocrmypdf.exceptions import ( InputFileError, MissingDependencyError, OutputFileAccessError, - PdfMergeFailedError, PriorOcrFoundError, SubprocessOutputError, TesseractConfigError, diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 8d92eb71..67febaeb 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -284,8 +284,6 @@ def ocr( # pylint: disable=unused-argument ``"-"``, some final validation steps are not performed (we do not read back the stream after it is written). Raises: - ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging - with the OCR layer. ocrmypdf.MissingDependencyError: If a required dependency program is missing or was not found on PATH. ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that diff --git a/src/ocrmypdf/exceptions.py b/src/ocrmypdf/exceptions.py index d5a9ae8e..b56e8a1a 100644 --- a/src/ocrmypdf/exceptions.py +++ b/src/ocrmypdf/exceptions.py @@ -47,26 +47,6 @@ class BadArgsError(ExitCodeException): exit_code = ExitCode.bad_args -class PdfMergeFailedError(ExitCodeException): # deprecated - """An intermediate PDF can't be merged. - - No longer in use. - """ - - exit_code = ExitCode.input_file - message = dedent( - '''\ - Failed to merge PDF image layer with OCR layer - - Usually this happens because the input PDF file is malformed and - ocrmypdf cannot correct the problem on its own. - - Try using - ocrmypdf --pdf-renderer sandwich [..other args..] - ''' - ) - - class MissingDependencyError(ExitCodeException): """A third-party dependency is missing.""" From 7e97981114d15990651a50878ff9702b97473171 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 03:56:44 -0700 Subject: [PATCH 10/20] hocrtx: unused imports --- src/ocrmypdf/hocrtransform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py index 409aa689..b05eaa9e 100755 --- a/src/ocrmypdf/hocrtransform.py +++ b/src/ocrmypdf/hocrtransform.py @@ -14,7 +14,7 @@ import re import warnings from math import atan, cos, sin from pathlib import Path -from typing import Any, NamedTuple, Optional, Tuple, Union +from typing import Any, NamedTuple from xml.etree import ElementTree with warnings.catch_warnings(): From 670ce2b969b1d88e6e8f037738dd5582f61d03a5 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 03:58:10 -0700 Subject: [PATCH 11/20] Remove support for non-callable version checker --- src/ocrmypdf/subprocess/__init__.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py index cc5b3550..1254727a 100644 --- a/src/ocrmypdf/subprocess/__init__.py +++ b/src/ocrmypdf/subprocess/__init__.py @@ -314,7 +314,7 @@ def check_external_program( program: The name of the program to test. package: The name of a software package that typically supplies this program. Usually the same as program. - version_check: A callable without arguments that retrieves the installed + version_checker: A callable without arguments that retrieves the installed version of program. need_version: The minimum required version. required_for: The name of an argument of feature that requires this program. @@ -325,10 +325,7 @@ def check_external_program( """ try: - if callable(version_checker): - found_version = version_checker() - else: # deprecated - found_version = version_checker + found_version = version_checker() except (CalledProcessError, FileNotFoundError) as e: _error_missing_program(program, package, required_for, recommended) if not recommended: From 1a0a797ca64ab78717d4019ccce50ac370cc606c Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 04:00:25 -0700 Subject: [PATCH 12/20] Remove our @deprecated decorator and use standard package --- setup.cfg | 1 + src/ocrmypdf/helpers.py | 18 ------------------ tests/test_helpers.py | 9 --------- 3 files changed, 1 insertion(+), 27 deletions(-) diff --git a/setup.cfg b/setup.cfg index 20207083..70309431 100644 --- a/setup.cfg +++ b/setup.cfg @@ -46,6 +46,7 @@ packages = find: install_requires = Pillow>=8.2.0 coloredlogs>=14.0 # strictly optional + deprecation>=2.1.0 img2pdf>=0.3.0 # pure Python packaging>=20 pdfminer.six!=20200720,>=20191110 diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py index 481b2f1b..033f5553 100644 --- a/src/ocrmypdf/helpers.py +++ b/src/ocrmypdf/helpers.py @@ -13,7 +13,6 @@ import warnings from collections import namedtuple from collections.abc import Iterable from contextlib import suppress -from functools import wraps from io import StringIO from math import isclose, isfinite from pathlib import Path @@ -275,20 +274,3 @@ def pikepdf_enable_mmap(): # Fix is not in pybind11 2.6.0 # log.debug("pikepdf mmap disabled") return - - -def deprecated(func): - """Warn that function is deprecated.""" - - @wraps(func) - def new_func(*args, **kwargs): - warnings.simplefilter('always', DeprecationWarning) # turn off filter - warnings.warn( - f"Call to deprecated function {func.__name__}.", - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter('default', DeprecationWarning) # reset filter - return func(*args, **kwargs) - - return new_func diff --git a/tests/test_helpers.py b/tests/test_helpers.py index bd43b5b5..e1992b32 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -54,15 +54,6 @@ def test_no_cpu_count(monkeypatch): assert invoked, "Patched function called during test" -def test_deprecated(): - @helpers.deprecated - def old_function(): - return 42 - - with pytest.deprecated_call(): - assert old_function() == 42 - - skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker") From 4104904a1e7e0ac955d0e0f51d4adc3019a59a1d Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 04:13:02 -0700 Subject: [PATCH 13/20] Document reason for suppress some third party deprecation warnings --- src/ocrmypdf/hocrtransform.py | 1 + tests/test_metadata.py | 10 ++++++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py index b05eaa9e..306dfa24 100755 --- a/src/ocrmypdf/hocrtransform.py +++ b/src/ocrmypdf/hocrtransform.py @@ -18,6 +18,7 @@ from typing import Any, NamedTuple from xml.etree import ElementTree with warnings.catch_warnings(): + # reportlab uses deprecated load_module warnings.filterwarnings( 'ignore', category=DeprecationWarning, message=r".*load_module.*" ) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index dbd9f7f9..80b97c78 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -28,9 +28,6 @@ except ImportError: fitz = None -pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning') - - @pytest.mark.parametrize("output_type", ['pdfa', 'pdf']) def test_preserve_docinfo(output_type, resources, outpdf): pdf_before = pikepdf.open(resources / 'graph.pdf') @@ -174,7 +171,12 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf): def libxmp_file_to_dict(): try: with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + # libxmp imports distutils.Version, which is deprecated + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message=r".*distutils Version classes are deprecated.*", + ) from libxmp.utils import ( file_to_dict, # pylint: disable=import-outside-toplevel ) From 4d2f499f97c123c9c21d46cba3ce3486fe913c46 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 04:15:56 -0700 Subject: [PATCH 14/20] Remove optional status of coloredlogs Everything optional is a possible complication. Better to remove the option. --- setup.cfg | 2 +- src/ocrmypdf/api.py | 14 ++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/setup.cfg b/setup.cfg index 70309431..933c767f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -45,7 +45,7 @@ project_urls = packages = find: install_requires = Pillow>=8.2.0 - coloredlogs>=14.0 # strictly optional + coloredlogs>=14.0 deprecation>=2.1.0 img2pdf>=0.3.0 # pure Python packaging>=20 diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 67febaeb..52821869 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -15,6 +15,9 @@ from pathlib import Path from typing import AnyStr, BinaryIO, Iterable, Union from warnings import warn +import coloredlogs +from humanfriendly.terminal import enable_ansi_support + from ocrmypdf._logging import PageNumberFilter, TqdmConsole from ocrmypdf._plugin_manager import get_plugin_manager from ocrmypdf._sync import run_pipeline @@ -22,15 +25,6 @@ from ocrmypdf._validation import check_options from ocrmypdf.cli import ArgumentParser, get_parser from ocrmypdf.helpers import is_iterable_notstr -try: - import coloredlogs -except ModuleNotFoundError: - coloredlogs = None # pylint: disable=invalid-name - -if coloredlogs: - from humanfriendly.terminal import enable_ansi_support - - StrPath = Union[Path, AnyStr] PathOrIO = Union[BinaryIO, StrPath] @@ -121,7 +115,7 @@ def configure_logging( use_colors = progress_bar_friendly formatter = None - if coloredlogs and use_colors: + if use_colors: use_colors = enable_ansi_support() if use_colors: use_colors = coloredlogs.terminal_supports_colors() From c9389c77138808f02a4de3fb3b5e4fcf7cec7c81 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 04:21:15 -0700 Subject: [PATCH 15/20] Remove deprecated falsy handling of filter_page_image --- src/ocrmypdf/_sync.py | 4 ++-- src/ocrmypdf/pluginspec.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 6a9a85a8..5843f3b4 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -206,8 +206,8 @@ def exec_page_sync(page_context: PageContext) -> PageResult: filtered_image = page_context.plugin_manager.hook.filter_page_image( page=page_context, image_filename=visible_image_out ) - if filtered_image: - visible_image_out = filtered_image + + visible_image_out = filtered_image pdf_page_from_image_out = create_pdf_page_from_image( visible_image_out, page_context, orientation_correction ) diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index c3298e4c..551d42e6 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -276,17 +276,19 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path: If the return value is a file that does not exist, ``FileNotFoundError`` will occur. The return value should be a path to a file in the same folder - as ``image_filename``. - - Implementation detail: If the value returned is falsy, OCRmyPDF will ignore - the return value and assume the input file was unmodified. This is deprecated. - To leave the image unmodified, ``image_filename`` should be returned. + as ``image_filename``. To leave the image unmodified, ``image_filename`` + should be returned. Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. Note: This is a :ref:`firstresult hook`. + + .. versionchanged:: 14.0 + Previously, OCRmyPDF would treat as a falsy value as a request to leave + the image unmodified. This is no longer supported and will trigger an + exception. """ From 88d2949e6b94683963c3b58398be34e6b3172fef Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Aug 2022 05:00:46 -0700 Subject: [PATCH 16/20] Abolish setup.cfg and migrate to pyproject.toml --- pyproject.toml | 86 +++++++++++++++++++++++++++++- setup.cfg | 115 ---------------------------------------- src/ocrmypdf/RELEASE.md | 4 +- 3 files changed, 87 insertions(+), 118 deletions(-) delete mode 100644 setup.cfg diff --git a/pyproject.toml b/pyproject.toml index f6e07f29..afaa60b3 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,13 +1,97 @@ [build-system] requires = [ - "setuptools >= 52", + "setuptools >= 61", "setuptools_scm[toml] >= 7.0.5", "wheel" ] build-backend = "setuptools.build_meta" +[project] +name = "ocrmypdf" +dynamic = ["version"] +description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched" +readme = "README.md" +license = {text = "MPL-2.0"} +requires-python = ">=3.8" +dependencies = [ + "Pillow>=8.2.0", + "coloredlogs>=14.0", + "deprecation>=2.1.0", + "img2pdf>=0.3.0", # pure Python + "packaging>=20", + "pdfminer.six!=20200720,>=20191110", + "pikepdf!=5.0.0,>=4.0.0", + "pluggy>=0.13.0", + "reportlab>=3.5.66", + "tqdm>=4", + "importlib-resources>=5;python_version<'3.9'", # until Python 3.9 + "typing-extensions>=4;python_version<'3.10'", +] +authors = [{name = "James R. Barlow", email="james@purplerock.ca"}] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows :: Windows 10", + "Operating System :: POSIX", + "Operating System :: POSIX :: BSD", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Text Processing :: Indexing", + "Topic :: Text Processing :: Linguistic", +] +keywords = [ + "PDF", + "OCR", + "optical character recognition", + "PDF/A", + "scanning", +] + +[project.urls] +Documentation = "https://ocrmypdf.readthedocs.io/" +Source = "https://github.com/ocrmypdf/OCRmyPDF" +Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues" + +[project.optional-dependencies] +docs = ["sphinx", "sphinx-issues", "sphinx-rtd-theme"] +extended_test = ["PyMuPDF==1.19.1"] +test = [ + "coverage[toml]>=5", + "pytest>=6.0.0", + "pytest-cov>=2.11.1", + "pytest-xdist>=2.2.0", + "python-xmp-toolkit==2.0.1", # also requires apt-get install libexempi3 + "types-Pillow", + "types-humanfriendly", +] +watcher = ["watchdog>=1.0.2"] +webservice = ["Flask>=1"] + +[project.scripts] +ocrmypdf = "ocrmypdf.__main__:run" + +[tool.setuptools.package-data] +ocrmypdf = ["data/sRGB.icc", "py.typed"] + +[tool.setuptools.packages.find] +where = ["src"] +namespaces = false + [tool.setuptools_scm] +[tool.distutils.bdist_wheel] +python-tag = "py38" + [tool.black] line-length = 88 target-version = ["py38", "py39", "py310", "py311"] diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 933c767f..00000000 --- a/setup.cfg +++ /dev/null @@ -1,115 +0,0 @@ -[metadata] -name = ocrmypdf -description = OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/ocrmypdf/OCRmyPDF -author = James R. Barlow -author_email = james@purplerock.ca -license = MPL-2.0 -license_file = LICENSE -license_files = - LICENSE -classifiers = - Development Status :: 5 - Production/Stable - Environment :: Console - Intended Audience :: End Users/Desktop - Intended Audience :: Science/Research - Intended Audience :: System Administrators - License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0) - Operating System :: MacOS :: MacOS X - Operating System :: Microsoft :: Windows :: Windows 10 - Operating System :: POSIX - Operating System :: POSIX :: BSD - Operating System :: POSIX :: Linux - Programming Language :: Python :: 3 - Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Topic :: Scientific/Engineering :: Image Recognition - Topic :: Text Processing :: Indexing - Topic :: Text Processing :: Linguistic -keywords = - PDF - OCR - optical character recognition - PDF/A - scanning -project_urls = - Documentation = https://ocrmypdf.readthedocs.io/ - Source = https://github.com/ocrmypdf/OCRmyPDF - Tracker = https://github.com/ocrmypdf/OCRmyPDF/issues - -[options] -packages = find: -install_requires = - Pillow>=8.2.0 - coloredlogs>=14.0 - deprecation>=2.1.0 - img2pdf>=0.3.0 # pure Python - packaging>=20 - pdfminer.six!=20200720,>=20191110 - pikepdf!=5.0.0,>=4.0.0 - pluggy>=0.13.0 - reportlab>=3.5.66 - tqdm>=4 - importlib-resources>=5;python_version<'3.9' # until Python 3.9 - typing-extensions>=4;python_version<'3.10' -python_requires = >=3.8 -include_package_data = True -package_dir = - =src -platforms = any -setup_requires = - setuptools-scm - setuptools-scm-git-archive -zip_safe = False - -[options.packages.find] -where = src - -[options.entry_points] -console_scripts = - ocrmypdf = ocrmypdf.__main__:run - -[options.extras_require] -docs = - sphinx - sphinx-issues - sphinx-rtd-theme -extended_test = - PyMuPDF==1.19.1 -test = - coverage[toml]>=5 - pytest>=6.0.0 - pytest-cov>=2.11.1 - pytest-xdist>=2.2.0 - python-xmp-toolkit==2.0.1 # also requires apt-get install libexempi3 - types-Pillow - types-humanfriendly -watcher = - watchdog>=1.0.2 -webservice = - Flask>=1 - -[options.package_data] -ocrmypdf = - data/sRGB.icc - py.typed - -[bdist_wheel] -python-tag = py38 - -[aliases] -test = pytest - -[check-manifest] -ignore = - .github - -[flake8] -ignore = D203,F401,W503,E501,E203,F841 -exclude = .git,__pycache__,docs/conf.py,build,dist,.venv,.venvpp,.eggs,tmp,src/ocrmypdf/lib/ -max-complexity = 10 -max-line-length = 100 diff --git a/src/ocrmypdf/RELEASE.md b/src/ocrmypdf/RELEASE.md index 2cb89b78..744cddd6 100644 --- a/src/ocrmypdf/RELEASE.md +++ b/src/ocrmypdf/RELEASE.md @@ -14,11 +14,11 @@ - Check README.md -- Check setup.py +- Check pyproject.toml - Are classifiers up to date? - Is `python_requires` correct? - - Python 3.6 is EOL on December 2021-12. Could drop support then. + - Is it to drop support for older Pythons? - Can we tighten any `install_requires` dependencies? - Search for old version shims we can remove From 47dcb6fcd069168b8cb09c61e03e1fba36b81e98 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Fri, 5 Aug 2022 01:01:09 -0700 Subject: [PATCH 17/20] Revert "Remove deprecated falsy handling of filter_page_image" This reverts commit c9389c77138808f02a4de3fb3b5e4fcf7cec7c81. --- src/ocrmypdf/_sync.py | 4 ++-- src/ocrmypdf/pluginspec.py | 12 +++++------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 5843f3b4..6a9a85a8 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -206,8 +206,8 @@ def exec_page_sync(page_context: PageContext) -> PageResult: filtered_image = page_context.plugin_manager.hook.filter_page_image( page=page_context, image_filename=visible_image_out ) - - visible_image_out = filtered_image + if filtered_image: + visible_image_out = filtered_image pdf_page_from_image_out = create_pdf_page_from_image( visible_image_out, page_context, orientation_correction ) diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index 551d42e6..c3298e4c 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -276,19 +276,17 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path: If the return value is a file that does not exist, ``FileNotFoundError`` will occur. The return value should be a path to a file in the same folder - as ``image_filename``. To leave the image unmodified, ``image_filename`` - should be returned. + as ``image_filename``. + + Implementation detail: If the value returned is falsy, OCRmyPDF will ignore + the return value and assume the input file was unmodified. This is deprecated. + To leave the image unmodified, ``image_filename`` should be returned. Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. Note: This is a :ref:`firstresult hook`. - - .. versionchanged:: 14.0 - Previously, OCRmyPDF would treat as a falsy value as a request to leave - the image unmodified. This is no longer supported and will trigger an - exception. """ From ef70c9499ea773d66d6a2e014f2db6164cee372e Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Fri, 5 Aug 2022 01:07:46 -0700 Subject: [PATCH 18/20] Don't deprecate falsy filter_page_image --- src/ocrmypdf/_sync.py | 2 +- src/ocrmypdf/pluginspec.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 6a9a85a8..1c971484 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -206,7 +206,7 @@ def exec_page_sync(page_context: PageContext) -> PageResult: filtered_image = page_context.plugin_manager.hook.filter_page_image( page=page_context, image_filename=visible_image_out ) - if filtered_image: + if filtered_image is not None: # None if no hook is present visible_image_out = filtered_image pdf_page_from_image_out = create_pdf_page_from_image( visible_image_out, page_context, orientation_correction diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py index c3298e4c..dfa2e3ea 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -278,10 +278,6 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path: will occur. The return value should be a path to a file in the same folder as ``image_filename``. - Implementation detail: If the value returned is falsy, OCRmyPDF will ignore - the return value and assume the input file was unmodified. This is deprecated. - To leave the image unmodified, ``image_filename`` should be returned. - Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. From 76bd8cab135af61a0754fbcdc9f4b6ee45e03052 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 6 Aug 2022 02:58:26 -0700 Subject: [PATCH 19/20] Drop Ubuntu 18.04 content --- .github/workflows/build.yml | 6 ----- docs/installation.rst | 49 +++---------------------------------- 2 files changed, 3 insertions(+), 52 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0615005e..57d1b6d6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -71,12 +71,6 @@ jobs: unpaper \ zlib1g - - name: Install Ubuntu 18.04 packages - if: matrix.os == 'ubuntu-18.04' - run: | - sudo apt-get install -y --no-install-recommends \ - libexempi3 - - name: Install Ubuntu 20.04 packages if: matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-latest' run: | diff --git a/docs/installation.rst b/docs/installation.rst index a352ac6b..07c25abd 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -44,7 +44,7 @@ install, or install a more recent version than your platform provides, read on. Installing on Linux =================== -Debian and Ubuntu 18.04 or newer +Debian and Ubuntu 20.04 or newer -------------------------------- .. |deb-11| image:: https://repology.org/badge/version-for-repo/debian_11/ocrmypdf.svg @@ -56,9 +56,6 @@ Debian and Ubuntu 18.04 or newer .. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg :alt: Debian unstable -.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg - :alt: Ubuntu 18.04 LTS - .. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg :alt: Ubuntu 20.04 LTS @@ -72,7 +69,7 @@ Debian and Ubuntu 18.04 or newer +-----------------------------------------------+ | |deb-11| |deb-12| |deb-unstable| | +-----------------------------------------------+ -| |ubu-1804| |ubu-2004| |ubu-2204| | +| |ubu-2004| |ubu-2204| | +-----------------------------------------------+ Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users @@ -80,7 +77,7 @@ of Windows Subsystem for Linux, may simply .. code-block:: bash - apt-get install ocrmypdf + apt install ocrmypdf As indicated in the table above, Debian and Ubuntu releases may lag behind the latest version. If the version available for your platform is @@ -198,46 +195,6 @@ To install for the current user only: To add JBIG2 encoding, see :ref:`jbig2`. -Ubuntu 18.04 LTS ----------------- - -Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but -it is quite old now. To install a more recent version, uninstall the old version -of ocrmypdf, and install the following dependencies: - -.. code-block:: bash - - sudo apt-get -y remove ocrmypdf - sudo apt-get -y update - sudo apt-get -y install \ - ghostscript \ - icc-profiles-free \ - libxml2 \ - pngquant \ - python3-distutils \ - python3-pkg-resources \ - python3-reportlab \ - qpdf \ - tesseract-ocr \ - zlib1g \ - unpaper - -We will need a newer version of ``pip`` then was available for Ubuntu 18.04: - -.. code-block:: bash - - wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py - -Then install the most recent ocrmypdf for the local user and set the -user's ``PATH`` to check for the user's Python packages. - -.. code-block:: bash - - export PATH=$HOME/.local/bin:$PATH - python3 -m pip install --user ocrmypdf - -To add JBIG2 encoding, see :ref:`jbig2`. - Arch Linux (AUR) ---------------- From 5156fe7662bd1ed316e59e29cc05cedcdaa594e0 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 15 Sep 2022 22:56:44 -0700 Subject: [PATCH 20/20] Test PyPy 3.8 and 3.9 --- .github/workflows/build.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 76f31b35..f9a15f41 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -31,7 +31,9 @@ jobs: - os: ubuntu-latest python: "3.9" - os: ubuntu-latest - python: "pypy-3.8" + python: "pypy3.8" + - os: ubuntu-latest + python: "pypy3.9" - os: ubuntu-latest python: "3.9" tesseract5: true