Merge branch 'feature/drop-3.7'

2026-05-07 06:07:58 -04:00 · 2022-09-20 22:32:59 -07:00
parent 1709e23701 6dbaebdc0c
commit 2ebc36fcec
38 changed files with 164 additions and 424 deletions
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -22,8 +22,6 @@ jobs:
    strategy:
      matrix:
        include:
-          - os: ubuntu-18.04
-            python: "3.7"
          - os: ubuntu-20.04
            python: "3.8"
          - os: ubuntu-20.04
@@ -33,7 +31,9 @@ jobs:
          - os: ubuntu-latest
            python: "3.9"
          - os: ubuntu-latest
-            python: "pypy-3.8"
+            python: "pypy3.8"
+          - os: ubuntu-latest
+            python: "pypy3.9"
          - os: ubuntu-latest
            python: "3.9"
            tesseract5: true
@@ -75,12 +75,6 @@ jobs:
            unpaper \
            zlib1g

-      - name: Install Ubuntu 18.04 packages
-        if: matrix.os == 'ubuntu-18.04'
-        run: |
-          sudo apt-get install -y --no-install-recommends \
-            libexempi3
-
      - name: Install Ubuntu 20.04 packages
        if: matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-latest'
        run: |
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
    rev: v2.37.2
    hooks:
      - id: pyupgrade
-        args: ["--py37-plus"]
+        args: ["--py38-plus"]
  - repo: https://github.com/pre-commit/mirrors-mypy
    rev: v0.971
    hooks:
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -17,7 +17,7 @@ formats:

 # Optionally set the version of Python and requirements required to build your docs
 python:
-  version: "3.7"
+  version: "3.8"
  install:
    - method: pip
      path: .
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl
 | macOS (Homebrew)              | ``brew install ocrmypdf``     |
 | macOS (nix)                   | ``nix-env -i  ocrmypdf``      |
 | LinuxBrew                     | ``brew install ocrmypdf``     |
-| FreeBSD                       | ``pkg install py37-ocrmypdf`` |
+| FreeBSD                       | ``pkg install py-ocrmypdf``   |
 | Conda                         | ``conda install ocrmypdf``    |
 | Ubuntu Snap                   | ``snap install ocrmypdf``     |

@@ -96,10 +96,7 @@ brew install tesseract-lang

 You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.

-OCRmyPDF supports Tesseract 4.0 and the beta versions of Tesseract 5.0. It will
-automatically use whichever version it finds first on the `PATH` environment
-variable. On Windows, if `PATH` does not provide a Tesseract binary, we use
-the highest version number that is installed according to the Windows Registry.
+OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry.

 ## Documentation and support

--- a/docs/api.rst
+++ b/docs/api.rst
@@ -72,14 +72,6 @@ OCRmyPDF, use processes.
    not take at least one of these steps, process semantics will prevent
    OCRmyPDF from working correctly.

-.. warning::
-
-    On macOS with Python 3.7, you must call
-    :func:`multiprocessing.set_start_method("spawn")`. Without this, multiprocessing
-    will be unstable. From the command line, OCRmyPDF does this automatically,
-    but as an API user you must do this. See Python bpo-33725 for details.
-    Python 3.8+ also resolve this automatically.
-
 Logging
 -------

--- a/docs/conf.py
+++ b/docs/conf.py
@@ -76,6 +76,8 @@ author = 'James R. Barlow'
 # The short X.Y version.

 import os
+from importlib.metadata import version as package_version
+

 on_rtd = os.environ.get('READTHEDOCS') == 'True'

@@ -96,10 +98,6 @@ if on_rtd:
    ]
    sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)

-try:
-    from importlib_metadata import version as package_version
-except ModuleNotFoundError:
-    from importlib.metadata import version as package_version

 # The full version, including alpha/beta/rc tags.
 release = package_version('ocrmypdf')
--- a/docs/cookbook.rst
+++ b/docs/cookbook.rst
@@ -283,7 +283,7 @@ argument. (Normally, OCRmyPDF will exit with an error if asked to modify
 a file with OCR.)

 This may be helpful for users who want to take advantage of accuracy
-improvements in Tesseract 4.0 for files they previously OCRed with an
+improvements in Tesseract for files they previously OCRed with an
 earlier version of Tesseract and OCRmyPDF.

 .. code-block:: bash
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -44,7 +44,7 @@ install, or install a more recent version than your platform provides, read on.
 Installing on Linux
 ===================

-Debian and Ubuntu 18.04 or newer
+Debian and Ubuntu 20.04 or newer
 --------------------------------

 .. |deb-11| image:: https://repology.org/badge/version-for-repo/debian_11/ocrmypdf.svg
@@ -56,9 +56,6 @@ Debian and Ubuntu 18.04 or newer
 .. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
    :alt: Debian unstable

-.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg
-    :alt: Ubuntu 18.04 LTS
-
 .. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg
    :alt: Ubuntu 20.04 LTS

@@ -72,7 +69,7 @@ Debian and Ubuntu 18.04 or newer
 +-----------------------------------------------+
 | |deb-11| |deb-12| |deb-unstable|              |
 +-----------------------------------------------+
-| |ubu-1804| |ubu-2004| |ubu-2204|              |
+| |ubu-2004| |ubu-2204|                         |
 +-----------------------------------------------+

 Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users
@@ -80,7 +77,7 @@ of Windows Subsystem for Linux, may simply

 .. code-block:: bash

-    apt-get install ocrmypdf
+    apt install ocrmypdf

 As indicated in the table above, Debian and Ubuntu releases may lag
 behind the latest version. If the version available for your platform is
@@ -198,46 +195,6 @@ To install for the current user only:

 To add JBIG2 encoding, see :ref:`jbig2`.

-Ubuntu 18.04 LTS
----------------
-
-Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but
-it is quite old now. To install a more recent version, uninstall the old version
-of ocrmypdf, and install the following dependencies:
-
-.. code-block:: bash
-
-    sudo apt-get -y remove ocrmypdf
-    sudo apt-get -y update
-    sudo apt-get -y install \
-        ghostscript \
-        icc-profiles-free \
-        libxml2 \
-        pngquant \
-        python3-distutils \
-        python3-pkg-resources \
-        python3-reportlab \
-        qpdf \
-        tesseract-ocr \
-        zlib1g \
-        unpaper
-
-We will need a newer version of ``pip`` then was available for Ubuntu 18.04:
-
-.. code-block:: bash
-
-    wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py
-
-Then install the most recent ocrmypdf for the local user and set the
-user's ``PATH`` to check for the user's Python packages.
-
-.. code-block:: bash
-
-    export PATH=$HOME/.local/bin:$PATH
-    python3 -m pip install --user ocrmypdf
-
-To add JBIG2 encoding, see :ref:`jbig2`.
-
 Arch Linux (AUR)
 ----------------

@@ -417,8 +374,8 @@ Native Windows

 You must install the following for Windows:

-* Python 3.7 (64-bit) or later
-* Tesseract 4.0 or later
+* Python 3.8 (64-bit) or later
+* Tesseract 4.1.1 or later
 * Ghostscript 9.50 or later

 Using the `Chocolatey <https://chocolatey.org/>`_ package manager, install the
@@ -481,7 +438,7 @@ Cygwin64

 First install the the following prerequisite Cygwin packages using ``setup-x86_64.exe``::

-    python37 (or later)
+    python38 (or later)
    python3?-devel
    python3?-pip
    python3?-lxml
@@ -616,9 +573,9 @@ manager. ``pip`` cannot provide them.

 The following versions are required:

-  Python 3.7 or newer
-  Ghostscript 9.23 or newer
-  Tesseract 4.0.0 or newer
+-  Python 3.8 or newer
+-  Ghostscript 9.50 or newer
+-  Tesseract 4.1.1 or newer
 -  jbig2enc 0.29 or newer
 -  pngquant 2.5 or newer
 -  unpaper 6.1
@@ -649,7 +606,7 @@ unfortunately, the ``pip install`` command cannot satisfy all of them.
 Installing HEAD revision from sources
 =====================================

-If you have ``git`` and Python 3.7 or newer installed, you can install
+If you have ``git`` and Python 3.8 or newer installed, you can install
 from source. When the ``pip`` installer runs, it will alert you if
 dependencies are missing.

--- a/docs/introduction.rst
+++ b/docs/introduction.rst
@@ -190,8 +190,7 @@ Ghostscript also imposes some limitations:
   behavior can be suppressed by setting ``--pdfa-image-compression`` to
   ``jpeg`` or ``lossless`` to set all images to one type or the other.
   Ghostscript has no option to maintain the input image's format.
-   (Ghostscript 9.25+ can copy JPEG images without transcoding them;
-   earlier versions will transcode.)
+   (Modern Ghostscript can copy JPEG images without transcoding them.)
 -  Ghostscript's PDF/A conversion removes any XMP metadata that is not
   one of the standard XMP metadata namespaces for PDFs. In particular,
   PRISM Metdata is removed.
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,17 +2,101 @@
 # SPDX-License-Identifier: MPL-2.0
 [build-system]
 requires = [
-  "setuptools >= 52",
+  "setuptools >= 61",
  "setuptools_scm[toml] >= 7.0.5",
  "wheel"
 ]
 build-backend = "setuptools.build_meta"

+[project]
+name = "ocrmypdf"
+dynamic = ["version"]
+description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched"
+readme = "README.md"
+license = {text = "MPL-2.0"}
+requires-python = ">=3.8"
+dependencies = [
+  "Pillow>=8.2.0",
+  "coloredlogs>=14.0",
+  "deprecation>=2.1.0",
+  "img2pdf>=0.3.0", # pure Python
+  "packaging>=20",
+  "pdfminer.six!=20200720,>=20191110",
+  "pikepdf!=5.0.0,>=4.0.0",
+  "pluggy>=0.13.0",
+  "reportlab>=3.5.66",
+  "tqdm>=4",
+  "importlib-resources>=5;python_version<'3.9'",  # until Python 3.9
+  "typing-extensions>=4;python_version<'3.10'",
+]
+authors = [{name = "James R. Barlow", email="james@purplerock.ca"}]
+classifiers = [
+  "Development Status :: 5 - Production/Stable",
+  "Environment :: Console",
+  "Intended Audience :: End Users/Desktop",
+  "Intended Audience :: Science/Research",
+  "Intended Audience :: System Administrators",
+  "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
+  "Operating System :: MacOS :: MacOS X",
+  "Operating System :: Microsoft :: Windows :: Windows 10",
+  "Operating System :: POSIX",
+  "Operating System :: POSIX :: BSD",
+  "Operating System :: POSIX :: Linux",
+  "Programming Language :: Python :: 3",
+  "Programming Language :: Python :: 3 :: Only",
+  "Programming Language :: Python :: 3.8",
+  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.10",
+  "Topic :: Scientific/Engineering :: Image Recognition",
+  "Topic :: Text Processing :: Indexing",
+  "Topic :: Text Processing :: Linguistic",
+]
+keywords = [
+  "PDF",
+  "OCR",
+  "optical character recognition",
+  "PDF/A",
+  "scanning",
+]
+
+[project.urls]
+Documentation = "https://ocrmypdf.readthedocs.io/"
+Source = "https://github.com/ocrmypdf/OCRmyPDF"
+Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
+
+[project.optional-dependencies]
+docs = ["sphinx", "sphinx-issues", "sphinx-rtd-theme"]
+extended_test = ["PyMuPDF==1.19.1"]
+test = [
+  "coverage[toml]>=5",
+  "pytest>=6.0.0",
+  "pytest-cov>=2.11.1",
+  "pytest-xdist>=2.2.0",
+  "python-xmp-toolkit==2.0.1",  # also requires apt-get install libexempi3
+  "types-Pillow",
+  "types-humanfriendly",
+]
+watcher = ["watchdog>=1.0.2"]
+webservice = ["Flask>=1"]
+
+[project.scripts]
+ocrmypdf = "ocrmypdf.__main__:run"
+
+[tool.setuptools.package-data]
+ocrmypdf = ["data/sRGB.icc", "py.typed"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+namespaces = false
+
 [tool.setuptools_scm]

+[tool.distutils.bdist_wheel]
+python-tag = "py38"
+
 [tool.black]
 line-length = 88
-target-version = ["py37", "py38"]
+target-version = ["py38", "py39", "py310", "py311"]
 skip-string-normalization = true
 include = '\.pyi?$'
 exclude = '''
@@ -96,8 +180,7 @@ module = [
  'pdfminer.*',
  'reportlab.*',
  'fitz',
-  'libxmp.utils',
-  'importlib_metadata'
+  'libxmp.utils'
 ]
 ignore_missing_imports = true

--- a/setup.cfg
+++ b/setup.cfg
@@ -1,116 +0,0 @@
-[metadata]
-name = ocrmypdf
-description = OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/ocrmypdf/OCRmyPDF
-author = James R. Barlow
-author_email = james@purplerock.ca
-license = MPL-2.0
-license_file = LICENSE
-license_files =
-    LICENSE
-classifiers =
-    Development Status :: 5 - Production/Stable
-    Environment :: Console
-    Intended Audience :: End Users/Desktop
-    Intended Audience :: Science/Research
-    Intended Audience :: System Administrators
-    License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
-    Operating System :: MacOS :: MacOS X
-    Operating System :: Microsoft :: Windows :: Windows 10
-    Operating System :: POSIX
-    Operating System :: POSIX :: BSD
-    Operating System :: POSIX :: Linux
-    Programming Language :: Python :: 3
-    Programming Language :: Python :: 3 :: Only
-    Programming Language :: Python :: 3.7
-    Programming Language :: Python :: 3.8
-    Programming Language :: Python :: 3.9
-    Programming Language :: Python :: 3.10
-    Topic :: Scientific/Engineering :: Image Recognition
-    Topic :: Text Processing :: Indexing
-    Topic :: Text Processing :: Linguistic
-keywords =
-    PDF
-    OCR
-    optical character recognition
-    PDF/A
-    scanning
-project_urls =
-    Documentation = https://ocrmypdf.readthedocs.io/
-    Source = https://github.com/ocrmypdf/OCRmyPDF
-    Tracker = https://github.com/ocrmypdf/OCRmyPDF/issues
-
-[options]
-packages = find:
-install_requires =
-    Pillow>=8.2.0
-    coloredlogs>=14.0  # strictly optional
-    img2pdf>=0.3.0  # pure Python
-    packaging>=20
-    pdfminer.six!=20200720,>=20191110
-    pikepdf!=5.0.0,>=4.0.0
-    pluggy>=0.13.0
-    reportlab>=3.5.66
-    tqdm>=4
-    importlib-metadata>=4;python_version<'3.8'  # until Python 3.8
-    importlib-resources>=5;python_version<'3.9'  # until Python 3.9
-    typing-extensions>=4;python_version<'3.10'
-python_requires = >=3.7
-include_package_data = True
-package_dir =
-    =src
-platforms = any
-setup_requires =
-    setuptools-scm
-    setuptools-scm-git-archive
-zip_safe = False
-
-[options.packages.find]
-where = src
-
-[options.entry_points]
-console_scripts =
-    ocrmypdf = ocrmypdf.__main__:run
-
-[options.extras_require]
-docs =
-    sphinx
-    sphinx-issues
-    sphinx-rtd-theme
-extended_test =
-    PyMuPDF==1.19.1
-test =
-    coverage[toml]>=5
-    pytest>=6.0.0
-    pytest-cov>=2.11.1
-    pytest-xdist>=2.2.0
-    python-xmp-toolkit==2.0.1  # also requires apt-get install libexempi3
-    types-Pillow
-    types-humanfriendly
-watcher =
-    watchdog>=1.0.2
-webservice =
-    Flask>=1
-
-[options.package_data]
-ocrmypdf =
-    data/sRGB.icc
-    py.typed
-
-[bdist_wheel]
-python-tag = py37
-
-[aliases]
-test = pytest
-
-[check-manifest]
-ignore =
-    .github
-
-[flake8]
-ignore = D203,F401,W503,E501,E203,F841
-exclude = .git,__pycache__,docs/conf.py,build,dist,.venv,.venvpp,.eggs,tmp,src/ocrmypdf/lib/
-max-complexity = 10
-max-line-length = 100
--- a/src/ocrmypdf/RELEASE.md
+++ b/src/ocrmypdf/RELEASE.md
@@ -17,11 +17,11 @@

 - Check README.md

- Check setup.py
+- Check pyproject.toml

    - Are classifiers up to date?
    - Is `python_requires` correct?
-    - Python 3.6 is EOL on December 2021-12. Could drop support then.
+    - Is it time to drop support for older Pythons?
    - Can we tighten any `install_requires` dependencies?

 - Search for old version shims we can remove
--- a/src/ocrmypdf/init.py
+++ b/src/ocrmypdf/init.py
@@ -21,7 +21,6 @@ from ocrmypdf.exceptions import (
    InputFileError,
    MissingDependencyError,
    OutputFileAccessError,
-    PdfMergeFailedError,
    PriorOcrFoundError,
    SubprocessOutputError,
    TesseractConfigError,
--- a/src/ocrmypdf/main.py
+++ b/src/ocrmypdf/main.py
@@ -71,6 +71,4 @@ def run(args=None):


 if __name__ == '__main__':
-    if sys.platform == 'darwin' and sys.version_info < (3, 8):
-        set_start_method('spawn')  # see python bpo-33725
    sys.exit(run())
--- a/src/ocrmypdf/_exec/ghostscript.py
+++ b/src/ocrmypdf/_exec/ghostscript.py
@@ -47,21 +47,6 @@ def version():
    return get_version(GS)


-def jpeg_passthrough_available() -> bool:
-    """Returns True if the installed version of Ghostscript supports JPEG passthru
-
-    Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
-    it gained the ability to keep JPEGs unmodified. However, the 9.23
-    implementation was buggy and would deletes the last two bytes of images in
-    some cases, as reported here.
-    https://bugs.ghostscript.com/show_bug.cgi?id=699216
-
-    The issue was fixed for 9.24, hence that is the first version we consider
-    the feature available. (Ghostscript 9.24 has its own problems is blacklisted.)
-    """
-    return version() >= '9.24'
-
-
 def _gs_error_reported(stream) -> bool:
    match = re.search(r'error', stream, flags=re.IGNORECASE)
    return bool(match)
@@ -201,19 +186,8 @@ def generate_pdfa(
        ]

    strategy = 'LeaveColorUnchanged'
-    # Older versions of Ghostscript expect a leading slash in
-    # sColorConversionStrategy, newer ones should not have it. See Ghostscript
-    # git commit fe1c025d.
    gs_version = version()
-    strategy = ('/' + strategy) if gs_version < '9.19' else strategy
-
-    if gs_version == '9.23':
-        # 9.23: added JPEG passthrough as a new feature, but with a bug that
-        # incorrectly formats some images. Fixed as of 9.24. So we disable this
-        # feature for 9.23.
-        # https://bugs.ghostscript.com/show_bug.cgi?id=699216
-        compression_args.append('-dPassThroughJPEGImages=false')
-    elif gs_version == '9.56.0':
+    if gs_version == '9.56.0':
        # 9.56.0 breaks our OCR, should be fixed in 9.56.1
        # https://bugs.ghostscript.com/show_bug.cgi?id=705187
        compression_args.append('-dNEWPDF=false')
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@@ -33,7 +33,7 @@ HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
 <head>
  <title></title>
 <meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 4.0.0' />
+  <meta name='ocr-system' content='tesseract 4.1.1' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
 </head>
 <body>
@@ -114,15 +114,6 @@ def version() -> str:
    return get_version('tesseract', regex=r'tesseract\s(.+)')


-def has_user_words() -> bool:
-    """Does Tesseract have --user-words capability?
-
-    Not available in 4.0, but available in 4.1. Also available in 3.x, but
-    we no longer support 3.x.
-    """
-    return version() >= '4.1'
-
-
 def has_thresholding() -> bool:
    """Does Tesseract have -c thresholding method capability?"""
    return version() >= '5.0'
@@ -244,7 +235,7 @@ def get_deskew(

 def tesseract_log_output(stream: bytes) -> None:
    tlog = TesseractLoggerAdapter(
-        log, extra=log.extra if hasattr(log, 'extra') else None
+        log, extra=log.extra if hasattr(log, 'extra') else None  # type: ignore
    )

    if not stream:
--- a/src/ocrmypdf/_exec/unpaper.py
+++ b/src/ocrmypdf/_exec/unpaper.py
@@ -1,12 +1,10 @@
 # SPDX-FileCopyrightText: 2022 James R. Barlow
 # SPDX-License-Identifier: MPL-2.0

-from __future__ import annotations
-
-# unpaper documentation:
-# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
 """Interface to unpaper executable"""

+from __future__ import annotations
+
 import logging
 import os
 import shlex
@@ -22,6 +20,10 @@ from PIL import Image
 from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
 from ocrmypdf.subprocess import get_version, run

+# unpaper documentation:
+# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
+
+
 if sys.version_info >= (3, 10):
    from tempfile import TemporaryDirectory
 else:
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -207,7 +207,7 @@ def exec_page_sync(page_context: PageContext) -> PageResult:
        filtered_image = page_context.plugin_manager.hook.filter_page_image(
            page=page_context, image_filename=visible_image_out
        )
-        if filtered_image:
+        if filtered_image is not None:  # None if no hook is present
            visible_image_out = filtered_image
        pdf_page_from_image_out = create_pdf_page_from_image(
            visible_image_out, page_context, orientation_correction
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -134,7 +134,7 @@ def check_options_preprocessing(options: Namespace) -> None:
            package='unpaper',
            version_checker=unpaper.version,
            need_version='6.1',
-            required_for=['--clean, --clean-final'],
+            required_for="--clean, --clean-final",  # Problem arguments
        )
        try:
            if options.unpaper_args:
@@ -221,7 +221,7 @@ def check_options_metadata(options: Namespace) -> None:
 def check_options_pillow(options: Namespace) -> None:
    PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
    if PIL.Image.MAX_IMAGE_PIXELS == 0:
-        PIL.Image.MAX_IMAGE_PIXELS = None
+        PIL.Image.MAX_IMAGE_PIXELS = None  # type: ignore


 def _check_plugin_invariant_options(options: Namespace) -> None:
--- a/src/ocrmypdf/_version.py
+++ b/src/ocrmypdf/_version.py
@@ -8,10 +8,7 @@ OCRmyPDF uses setuptools_scm to derive version from git tags.

 from __future__ import annotations

-try:
-    from importlib.metadata import version as _package_version
-except ImportError:
-    from importlib_metadata import version as _package_version  # type: ignore
+from importlib.metadata import version as _package_version

 PROGRAM_NAME = 'ocrmypdf'

--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -15,6 +15,9 @@ from pathlib import Path
 from typing import AnyStr, BinaryIO, Iterable, Union
 from warnings import warn

+import coloredlogs
+from humanfriendly.terminal import enable_ansi_support
+
 from ocrmypdf._logging import PageNumberFilter, TqdmConsole
 from ocrmypdf._plugin_manager import get_plugin_manager
 from ocrmypdf._sync import run_pipeline
@@ -22,15 +25,6 @@ from ocrmypdf._validation import check_options
 from ocrmypdf.cli import ArgumentParser, get_parser
 from ocrmypdf.helpers import is_iterable_notstr

-try:
-    import coloredlogs
-except ModuleNotFoundError:
-    coloredlogs = None  # pylint: disable=invalid-name
-
-if coloredlogs:
-    from humanfriendly.terminal import enable_ansi_support
-
-
 StrPath = Union[Path, AnyStr]
 PathOrIO = Union[BinaryIO, StrPath]

@@ -121,7 +115,7 @@ def configure_logging(

    use_colors = progress_bar_friendly
    formatter = None
-    if coloredlogs and use_colors:
+    if use_colors:
        use_colors = enable_ansi_support()
        if use_colors:
            use_colors = coloredlogs.terminal_supports_colors()
@@ -284,8 +278,6 @@ def ocr(  # pylint: disable=unused-argument
            ``"-"``, some final validation steps are not performed (we do not read
            back the stream after it is written).
    Raises:
-        ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
-            with the OCR layer.
        ocrmypdf.MissingDependencyError: If a required dependency program is missing or
            was not found on PATH.
        ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@@ -21,37 +21,19 @@ def check_options(options):
        program='gs',
        package='ghostscript',
        version_checker=ghostscript.version,
-        need_version='9.15',  # limited by Travis CI / Ubuntu 14.04 backports
+        need_version='9.50',  # Ubuntu 20.04's version
    )
    gs_version = ghostscript.version()
-    if gs_version in ('9.24', '9.51'):
+    if gs_version in ('9.51',):
        raise MissingDependencyError(
            f"Ghostscript {gs_version} contains serious regressions and is not "
            "supported. Please upgrade to a newer version, or downgrade to the "
            "previous version."
        )

-    # We have these constraints to check for.
-    # 1. Ghostscript < 9.20 mangles multibyte Unicode
-    # 2. hocr doesn't work on non-Latin languages (so don't select it)
-    is_latin = options.languages.issubset(HOCR_OK_LANGS)
-    if gs_version < '9.20' and options.output_type != 'pdf' and not is_latin:
-        # https://bugs.ghostscript.com/show_bug.cgi?id=696874
-        # Ghostscript < 9.20 fails to encode multibyte characters properly
-        log.warning(
-            f"The installed version of Ghostscript ({gs_version}) does not work "
-            "correctly with the OCR languages you specified. Use --output-type pdf or "
-            "upgrade to Ghostscript 9.20 or later to avoid this issue."
-        )
-
    if options.output_type == 'pdfa':
        options.output_type = 'pdfa-2'

-    if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
-        raise MissingDependencyError(
-            "--output-type pdfa-3 requires Ghostscript 9.19 or later"
-        )
-

@hookimpl
 def rasterize_pdf_page(
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@@ -43,7 +43,7 @@ def add_options(parser):
        metavar='MODE',
        choices=range(0, 4),
        help=(
-            "Set Tesseract 4.0+ OCR engine mode: "
+            "Set Tesseract 4+ OCR engine mode: "
            "0 - original Tesseract only; "
            "1 - neural nets LSTM only; "
            "2 - Tesseract + LSTM; "
@@ -93,7 +93,7 @@ def check_options(options):
        program='tesseract',
        package={'linux': 'tesseract-ocr'},
        version_checker=tesseract.version,
-        need_version='4.0.0-beta.1',  # using backport for Travis CI
+        need_version='4.1.1',  # Ubuntu 20.04 version
        version_parser=tesseract.TesseractVersion,
    )

@@ -101,11 +101,6 @@ def check_options(options):
    if options.pdf_renderer == 'auto':
        options.pdf_renderer = 'sandwich'

-    if not tesseract.has_user_words() and (options.user_words or options.user_patterns):
-        log.warning(
-            "Tesseract 4.0 (which you have installed) ignores --user-words and "
-            "--user-patterns, so these arguments have no effect."
-        )
    if not tesseract.has_thresholding() and options.tesseract_thresholding != 0:
        log.warning(
            "The installed version of Tesseract does not support changes to its "
--- a/src/ocrmypdf/exceptions.py
+++ b/src/ocrmypdf/exceptions.py
@@ -47,26 +47,6 @@ class BadArgsError(ExitCodeException):
    exit_code = ExitCode.bad_args


-class PdfMergeFailedError(ExitCodeException):  # deprecated
-    """An intermediate PDF can't be merged.
-
-    No longer in use.
-    """
-
-    exit_code = ExitCode.input_file
-    message = dedent(
-        '''\
-        Failed to merge PDF image layer with OCR layer
-
-        Usually this happens because the input PDF file is malformed and
-        ocrmypdf cannot correct the problem on its own.
-
-        Try using
-            ocrmypdf --pdf-renderer sandwich  [..other args..]
-        '''
-    )
-
-
 class MissingDependencyError(ExitCodeException):
    """A third-party dependency is missing."""

--- a/src/ocrmypdf/helpers.py
+++ b/src/ocrmypdf/helpers.py
@@ -12,7 +12,6 @@ import shutil
 import warnings
 from collections.abc import Iterable
 from contextlib import suppress
-from functools import wraps
 from io import StringIO
 from math import isclose, isfinite
 from pathlib import Path
@@ -291,20 +290,3 @@ def pikepdf_enable_mmap():
    # Fix is not in pybind11 2.6.0
    # log.debug("pikepdf mmap disabled")
    return
-
-
-def deprecated(func):
-    """Warn that function is deprecated."""
-
-    @wraps(func)
-    def new_func(*args, **kwargs):
-        warnings.simplefilter('always', DeprecationWarning)  # turn off filter
-        warnings.warn(
-            f"Call to deprecated function {func.__name__}.",
-            category=DeprecationWarning,
-            stacklevel=2,
-        )
-        warnings.simplefilter('default', DeprecationWarning)  # reset filter
-        return func(*args, **kwargs)
-
-    return new_func
--- a/src/ocrmypdf/hocrtransform.py
+++ b/src/ocrmypdf/hocrtransform.py
@@ -14,10 +14,11 @@ import re
 import warnings
 from math import atan, cos, sin
 from pathlib import Path
-from typing import Any, NamedTuple, Optional, Tuple, Union
+from typing import Any, NamedTuple
 from xml.etree import ElementTree

 with warnings.catch_warnings():
+    # reportlab uses deprecated load_module
    warnings.filterwarnings(
        'ignore', category=DeprecationWarning, message=r".*load_module.*"
    )
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -278,10 +278,6 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path:
    will occur. The return value should be a path to a file in the same folder
    as ``image_filename``.

-    Implementation detail: If the value returned is falsy, OCRmyPDF will ignore
-    the return value and assume the input file was unmodified. This is deprecated.
-    To leave the image unmodified, ``image_filename`` should be returned.
-
    Note:
        This hook will be called from child processes. Modifying global state
        will not affect the main process or other child processes.
--- a/src/ocrmypdf/subprocess/init.py
+++ b/src/ocrmypdf/subprocess/init.py
@@ -314,7 +314,7 @@ def check_external_program(
        program: The name of the program to test.
        package: The name of a software package that typically supplies this program.
            Usually the same as program.
-        version_check: A callable without arguments that retrieves the installed
+        version_checker: A callable without arguments that retrieves the installed
            version of program.
        need_version: The minimum required version.
        required_for: The name of an argument of feature that requires this program.
@@ -325,10 +325,7 @@ def check_external_program(
    """

    try:
-        if callable(version_checker):
-            found_version = version_checker()
-        else:  # deprecated
-            found_version = version_checker
+        found_version = version_checker()
    except (CalledProcessError, FileNotFoundError) as e:
        _error_missing_program(program, package, required_for, recommended)
        if not recommended:
--- a/src/ocrmypdf/subprocess/_windows.py
+++ b/src/ocrmypdf/subprocess/_windows.py
@@ -171,11 +171,6 @@ SHIMS = [
 def fix_windows_args(program: str, args, env):
    """Adjust our desired program and command line arguments for use on Windows"""

-    if sys.version_info < (3, 8):
-        # bpo-33617 - Windows needs manual Path -> str conversion
-        args = [os.fspath(arg) for arg in args]
-        program = os.fspath(program)
-
    # If we are running a .py on Windows, ensure we call it with this Python
    # (to support test suite shims)
    if program.lower().endswith('.py'):
--- a/tests/plugins/gs_feature_elision.py
+++ b/tests/plugins/gs_feature_elision.py
@@ -9,13 +9,13 @@ from ocrmypdf import hookimpl
 from ocrmypdf.builtin_plugins import ghostscript
 from ocrmypdf.subprocess import run_polling_stderr

-elision_warning = """GPL Ghostscript 9.20: Setting Overprint Mode to 1
+ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1
 not permitted in PDF/A-2, overprint mode not set"""


 def run_append_stderr(*args, **kwargs):
    proc = run_polling_stderr(*args, **kwargs)
-    proc.stderr += '\n' + elision_warning + '\n'
+    proc.stderr += '\n' + ELISION_WARNING + '\n'
    return proc


--- a/tests/plugins/tesseract_cache.py
+++ b/tests/plugins/tesseract_cache.py
@@ -21,7 +21,7 @@ were produced.

 Certain operations are not cached and routed to Tesseract OCR directly.

-Assumes Tesseract 4.0.0-alpha or higher.
+Assumes Tesseract 4+.

 """

--- a/tests/plugins/tesseract_debug_rotate.py
+++ b/tests/plugins/tesseract_debug_rotate.py
@@ -27,7 +27,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 4.0.0' />
+  <meta name='ocr-system' content='tesseract 4.1.1' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
@@ -46,7 +46,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 class FixedRotateNoopOcrEngine(OcrEngine):
    @staticmethod
    def version():
-        return '4.0.0'
+        return '4.1.1'

    @staticmethod
    def creator_tag(options):
--- a/tests/plugins/tesseract_noop.py
+++ b/tests/plugins/tesseract_noop.py
@@ -25,7 +25,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 <head>
  <title></title>
  <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
-  <meta name='ocr-system' content='tesseract 4.0.0' />
+  <meta name='ocr-system' content='tesseract 4.1.1' />
  <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
 </head>
 <body>
@@ -44,7 +44,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
 class NoopOcrEngine(OcrEngine):
    @staticmethod
    def version():
-        return '4.0.0'
+        return '4.1.1'

    @staticmethod
    def creator_tag(options):
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -54,15 +54,6 @@ def test_no_cpu_count(monkeypatch):
    assert invoked, "Patched function called during test"


-def test_deprecated():
-    @helpers.deprecated
-    def old_function():
-        return 42
-
-    with pytest.deprecated_call():
-        assert old_function() == 42
-
-
 skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker")


--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -545,7 +545,6 @@ def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outp
    assert p.returncode == ExitCode.invalid_config


-@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0')
 def test_user_words_ocr(resources, outdir):
    # Does not actually test if --user-words causes output to differ
    word_list = outdir / 'wordlist.txt'
@@ -722,11 +721,9 @@ def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpd
    if compression == "jpeg":
        assert pdfimage.enc == Encoding.jpeg
    else:
-        if ghostscript.jpeg_passthrough_available():
-            # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
-            # copied without transcoding - so report
-            if image.endswith('jpg'):
-                assert pdfimage.enc == Encoding.jpeg
+        if image.endswith('jpg'):
+            # Ghostscript JPEG passthrough - no issue
+            assert pdfimage.enc == Encoding.jpeg
        else:
            assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)

--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -28,9 +28,6 @@ except ImportError:
    fitz = None


-pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning')
-
-
 @pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
 def test_preserve_docinfo(output_type, resources, outpdf):
    pdf_before = pikepdf.open(resources / 'graph.pdf')
@@ -174,7 +171,12 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf):
 def libxmp_file_to_dict():
    try:
        with warnings.catch_warnings():
-            warnings.simplefilter("ignore", DeprecationWarning)
+            # libxmp uses the distutils Version classes, which are deprecated
+            warnings.filterwarnings(
+                "ignore",
+                category=DeprecationWarning,
+                message=r".*distutils Version classes are deprecated.*",
+            )
            from libxmp.utils import (
                file_to_dict,  # pylint: disable=import-outside-toplevel
            )
--- a/tests/test_stdio.py
+++ b/tests/test_stdio.py
@@ -51,10 +51,6 @@ def test_stdout(ocrmypdf_exec, resources, outpdf):
    assert check_pdf(output_file)


-@pytest.mark.xfail(
-    os.name == 'nt' and sys.version_info < (3, 8),
-    reason="Windows does not like this; not sure how to fix",
-)
 def test_dev_null(resources):
    if 'COV_CORE_DATAFILE' in os.environ:
        pytest.skip("Coverage uses stdout")
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -48,22 +48,6 @@ def test_hocr_notlatin_warning(caplog):
    assert 'PDF renderer is known to cause' in caplog.text


-def test_old_ghostscript(caplog):
-    with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'), patch(
-        'ocrmypdf._exec.tesseract.get_languages', return_value={'eng', 'chi_sim'}
-    ):
-        vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa'))
-        assert 'does not work correctly' in caplog.text
-
-    with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'):
-        with pytest.raises(MissingDependencyError):
-            vd.check_options(*make_opts_pm(output_type='pdfa-3'))
-
-    with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'):
-        with pytest.raises(MissingDependencyError):
-            vd.check_options(*make_opts_pm())
-
-
 def test_old_tesseract_error():
    with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'):
        with pytest.raises(MissingDependencyError):
@@ -103,22 +87,6 @@ def test_optimizing(caplog):
    assert 'will be ignored because' in caplog.text


-def test_user_words(caplog):
-    with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False):
-        vd.check_options(*make_opts_pm(user_words='foo'))
-        assert (
-            'Tesseract 4.0 (which you have installed) ignores --user-words'
-            in caplog.text
-        )
-    caplog.clear()
-    with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True):
-        vd.check_options(*make_opts_pm(user_patterns='foo'))
-        assert (
-            'Tesseract 4.0 (which you have installed) ignores --user-words'
-            not in caplog.text
-        )
-
-
 def test_pillow_options():
    vd.check_options_pillow(make_opts(max_image_mpixels=0))

@@ -229,37 +197,38 @@ def test_version_comparison():
            program="tesseract",
            package="tesseract",
            version_checker=lambda: '4.0.0-beta.1',
-            need_version='4.0.0',
+            need_version='4.1.1',
            version_parser=TesseractVersion,
        )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
        version_checker=lambda: 'v5.0.0-alpha.20200201',
-        need_version='4.0.0',
+        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
        version_checker=lambda: '5.0.0-rc1.20211030',
-        need_version='4.0.0',
+        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
-        version_checker=lambda: 'v4.0.0.20181030',  # Some Windows builds use this format
-        need_version='4.0.0',
-        version_parser=TesseractVersion,
-    )
-    vd.check_external_program(
-        program="tesseract",
-        package="tesseract",
-        version_checker=lambda: '4.1.1-rc2-25-g9707',
-        need_version='4.0.0',
+        version_checker=lambda: 'v4.1.1.20181030',  # Some Windows builds use this format
+        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
+    with pytest.raises(MissingDependencyError):
+        vd.check_external_program(
+            program="tesseract",
+            package="tesseract",
+            version_checker=lambda: '4.1.1-rc2-25-g9707',
+            need_version='4.1.1',
+            version_parser=TesseractVersion,
+        )
    with pytest.raises(MissingDependencyError):
        vd.check_external_program(
            program="dummy_fails",