diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index dbb96baf..f9a15f41 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -22,8 +22,6 @@ jobs: strategy: matrix: include: - - os: ubuntu-18.04 - python: "3.7" - os: ubuntu-20.04 python: "3.8" - os: ubuntu-20.04 @@ -33,7 +31,9 @@ jobs: - os: ubuntu-latest python: "3.9" - os: ubuntu-latest - python: "pypy-3.8" + python: "pypy3.8" + - os: ubuntu-latest + python: "pypy3.9" - os: ubuntu-latest python: "3.9" tesseract5: true @@ -75,12 +75,6 @@ jobs: unpaper \ zlib1g - - name: Install Ubuntu 18.04 packages - if: matrix.os == 'ubuntu-18.04' - run: | - sudo apt-get install -y --no-install-recommends \ - libexempi3 - - name: Install Ubuntu 20.04 packages if: matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-latest' run: | diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1b99b96c..b6a207c8 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,7 +28,7 @@ repos: rev: v2.37.2 hooks: - id: pyupgrade - args: ["--py37-plus"] + args: ["--py38-plus"] - repo: https://github.com/pre-commit/mirrors-mypy rev: v0.971 hooks: diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 68859db8..7c505ff5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -17,7 +17,7 @@ formats: # Optionally set the version of Python and requirements required to build your docs python: - version: "3.7" + version: "3.8" install: - method: pip path: . diff --git a/README.md b/README.md index 815d2031..a2eb4c5c 100644 --- a/README.md +++ b/README.md @@ -70,7 +70,7 @@ Linux, Windows, macOS and FreeBSD are supported. 
Docker images are also availabl | macOS (Homebrew) | ``brew install ocrmypdf`` | | macOS (nix) | ``nix-env -i ocrmypdf`` | | LinuxBrew | ``brew install ocrmypdf`` | -| FreeBSD | ``pkg install py37-ocrmypdf`` | +| FreeBSD | ``pkg install py-ocrmypdf`` | | Conda | ``conda install ocrmypdf`` | | Ubuntu Snap | ``snap install ocrmypdf`` | @@ -96,10 +96,7 @@ brew install tesseract-lang You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested. -OCRmyPDF supports Tesseract 4.0 and the beta versions of Tesseract 5.0. It will -automatically use whichever version it finds first on the `PATH` environment -variable. On Windows, if `PATH` does not provide a Tesseract binary, we use -the highest version number that is installed according to the Windows Registry. +OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry. ## Documentation and support diff --git a/docs/api.rst b/docs/api.rst index 0478dd1c..1f185446 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -72,14 +72,6 @@ OCRmyPDF, use processes. not take at least one of these steps, process semantics will prevent OCRmyPDF from working correctly. -.. warning:: - - On macOS with Python 3.7, you must call - :func:`multiprocessing.set_start_method("spawn")`. Without this, multiprocessing - will be unstable. From the command line, OCRmyPDF does this automatically, - but as an API user you must do this. See Python bpo-33725 for details. - Python 3.8+ also resolve this automatically. - Logging ------- diff --git a/docs/conf.py b/docs/conf.py index 4b6bbbb9..ee636db6 100755 --- a/docs/conf.py +++ b/docs/conf.py @@ -76,6 +76,8 @@ author = 'James R. Barlow' # The short X.Y version. 
import os +from importlib.metadata import version as package_version + on_rtd = os.environ.get('READTHEDOCS') == 'True' @@ -96,10 +98,6 @@ if on_rtd: ] sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES) -try: - from importlib_metadata import version as package_version -except ModuleNotFoundError: - from importlib.metadata import version as package_version # The full version, including alpha/beta/rc tags. release = package_version('ocrmypdf') diff --git a/docs/cookbook.rst b/docs/cookbook.rst index 1360d17e..581c6474 100644 --- a/docs/cookbook.rst +++ b/docs/cookbook.rst @@ -283,7 +283,7 @@ argument. (Normally, OCRmyPDF will exit with an error if asked to modify a file with OCR.) This may be helpful for users who want to take advantage of accuracy -improvements in Tesseract 4.0 for files they previously OCRed with an +improvements in Tesseract for files they previously OCRed with an earlier version of Tesseract and OCRmyPDF. .. code-block:: bash diff --git a/docs/installation.rst b/docs/installation.rst index 79bfb7c4..07c25abd 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -44,7 +44,7 @@ install, or install a more recent version than your platform provides, read on. Installing on Linux =================== -Debian and Ubuntu 18.04 or newer +Debian and Ubuntu 20.04 or newer -------------------------------- .. |deb-11| image:: https://repology.org/badge/version-for-repo/debian_11/ocrmypdf.svg @@ -56,9 +56,6 @@ Debian and Ubuntu 18.04 or newer .. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg :alt: Debian unstable -.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg - :alt: Ubuntu 18.04 LTS - .. 
|ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg :alt: Ubuntu 20.04 LTS @@ -72,7 +69,7 @@ Debian and Ubuntu 18.04 or newer +-----------------------------------------------+ | |deb-11| |deb-12| |deb-unstable| | +-----------------------------------------------+ -| |ubu-1804| |ubu-2004| |ubu-2204| | +| |ubu-2004| |ubu-2204| | +-----------------------------------------------+ Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users @@ -80,7 +77,7 @@ of Windows Subsystem for Linux, may simply .. code-block:: bash - apt-get install ocrmypdf + apt install ocrmypdf As indicated in the table above, Debian and Ubuntu releases may lag behind the latest version. If the version available for your platform is @@ -198,46 +195,6 @@ To install for the current user only: To add JBIG2 encoding, see :ref:`jbig2`. -Ubuntu 18.04 LTS ----------------- - -Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but -it is quite old now. To install a more recent version, uninstall the old version -of ocrmypdf, and install the following dependencies: - -.. code-block:: bash - - sudo apt-get -y remove ocrmypdf - sudo apt-get -y update - sudo apt-get -y install \ - ghostscript \ - icc-profiles-free \ - libxml2 \ - pngquant \ - python3-distutils \ - python3-pkg-resources \ - python3-reportlab \ - qpdf \ - tesseract-ocr \ - zlib1g \ - unpaper - -We will need a newer version of ``pip`` then was available for Ubuntu 18.04: - -.. code-block:: bash - - wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py - -Then install the most recent ocrmypdf for the local user and set the -user's ``PATH`` to check for the user's Python packages. - -.. code-block:: bash - - export PATH=$HOME/.local/bin:$PATH - python3 -m pip install --user ocrmypdf - -To add JBIG2 encoding, see :ref:`jbig2`. 
- Arch Linux (AUR) ---------------- @@ -417,8 +374,8 @@ Native Windows You must install the following for Windows: -* Python 3.7 (64-bit) or later -* Tesseract 4.0 or later +* Python 3.8 (64-bit) or later +* Tesseract 4.1.1 or later * Ghostscript 9.50 or later Using the `Chocolatey `_ package manager, install the @@ -481,7 +438,7 @@ Cygwin64 First install the the following prerequisite Cygwin packages using ``setup-x86_64.exe``:: - python37 (or later) + python38 (or later) python3?-devel python3?-pip python3?-lxml @@ -616,9 +573,9 @@ manager. ``pip`` cannot provide them. The following versions are required: -- Python 3.7 or newer -- Ghostscript 9.23 or newer -- Tesseract 4.0.0 or newer +- Python 3.8 or newer +- Ghostscript 9.50 or newer +- Tesseract 4.1.1 or newer - jbig2enc 0.29 or newer - pngquant 2.5 or newer - unpaper 6.1 @@ -649,7 +606,7 @@ unfortunately, the ``pip install`` command cannot satisfy all of them. Installing HEAD revision from sources ===================================== -If you have ``git`` and Python 3.7 or newer installed, you can install +If you have ``git`` and Python 3.8 or newer installed, you can install from source. When the ``pip`` installer runs, it will alert you if dependencies are missing. diff --git a/docs/introduction.rst b/docs/introduction.rst index 7c11818b..563844bb 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -190,8 +190,7 @@ Ghostscript also imposes some limitations: behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format. - (Ghostscript 9.25+ can copy JPEG images without transcoding them; - earlier versions will transcode.) + (Modern Ghostscript can copy JPEG images without transcoding them.) - Ghostscript's PDF/A conversion removes any XMP metadata that is not one of the standard XMP metadata namespaces for PDFs. In particular, PRISM Metdata is removed. 
diff --git a/pyproject.toml b/pyproject.toml index 68acb173..24ab6659 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,17 +2,101 @@ # SPDX-License-Identifier: MPL-2.0 [build-system] requires = [ - "setuptools >= 52", + "setuptools >= 61", "setuptools_scm[toml] >= 7.0.5", "wheel" ] build-backend = "setuptools.build_meta" +[project] +name = "ocrmypdf" +dynamic = ["version"] +description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched" +readme = "README.md" +license = {text = "MPL-2.0"} +requires-python = ">=3.8" +dependencies = [ + "Pillow>=8.2.0", + "coloredlogs>=14.0", + "deprecation>=2.1.0", + "img2pdf>=0.3.0", # pure Python + "packaging>=20", + "pdfminer.six!=20200720,>=20191110", + "pikepdf!=5.0.0,>=4.0.0", + "pluggy>=0.13.0", + "reportlab>=3.5.66", + "tqdm>=4", + "importlib-resources>=5;python_version<'3.9'", # until Python 3.9 + "typing-extensions>=4;python_version<'3.10'", +] +authors = [{name = "James R. Barlow", email="james@purplerock.ca"}] +classifiers = [ + "Development Status :: 5 - Production/Stable", + "Environment :: Console", + "Intended Audience :: End Users/Desktop", + "Intended Audience :: Science/Research", + "Intended Audience :: System Administrators", + "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)", + "Operating System :: MacOS :: MacOS X", + "Operating System :: Microsoft :: Windows :: Windows 10", + "Operating System :: POSIX", + "Operating System :: POSIX :: BSD", + "Operating System :: POSIX :: Linux", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3 :: Only", + "Programming Language :: Python :: 3.8", + "Programming Language :: Python :: 3.9", + "Programming Language :: Python :: 3.10", + "Topic :: Scientific/Engineering :: Image Recognition", + "Topic :: Text Processing :: Indexing", + "Topic :: Text Processing :: Linguistic", +] +keywords = [ + "PDF", + "OCR", + "optical character recognition", + "PDF/A", + "scanning", +] + +[project.urls] 
+Documentation = "https://ocrmypdf.readthedocs.io/" +Source = "https://github.com/ocrmypdf/OCRmyPDF" +Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues" + +[project.optional-dependencies] +docs = ["sphinx", "sphinx-issues", "sphinx-rtd-theme"] +extended_test = ["PyMuPDF==1.19.1"] +test = [ + "coverage[toml]>=5", + "pytest>=6.0.0", + "pytest-cov>=2.11.1", + "pytest-xdist>=2.2.0", + "python-xmp-toolkit==2.0.1", # also requires apt-get install libexempi3 + "types-Pillow", + "types-humanfriendly", +] +watcher = ["watchdog>=1.0.2"] +webservice = ["Flask>=1"] + +[project.scripts] +ocrmypdf = "ocrmypdf.__main__:run" + +[tool.setuptools.package-data] +ocrmypdf = ["data/sRGB.icc", "py.typed"] + +[tool.setuptools.packages.find] +where = ["src"] +namespaces = false + [tool.setuptools_scm] +[tool.distutils.bdist_wheel] +python-tag = "py38" + [tool.black] line-length = 88 -target-version = ["py37", "py38"] +target-version = ["py38", "py39", "py310", "py311"] skip-string-normalization = true include = '\.pyi?$' exclude = ''' @@ -96,8 +180,7 @@ module = [ 'pdfminer.*', 'reportlab.*', 'fitz', - 'libxmp.utils', - 'importlib_metadata' + 'libxmp.utils' ] ignore_missing_imports = true diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index 3dc89171..00000000 --- a/setup.cfg +++ /dev/null @@ -1,116 +0,0 @@ -[metadata] -name = ocrmypdf -description = OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched -long_description = file: README.md -long_description_content_type = text/markdown -url = https://github.com/ocrmypdf/OCRmyPDF -author = James R. 
Barlow -author_email = james@purplerock.ca -license = MPL-2.0 -license_file = LICENSE -license_files = - LICENSE -classifiers = - Development Status :: 5 - Production/Stable - Environment :: Console - Intended Audience :: End Users/Desktop - Intended Audience :: Science/Research - Intended Audience :: System Administrators - License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0) - Operating System :: MacOS :: MacOS X - Operating System :: Microsoft :: Windows :: Windows 10 - Operating System :: POSIX - Operating System :: POSIX :: BSD - Operating System :: POSIX :: Linux - Programming Language :: Python :: 3 - Programming Language :: Python :: 3 :: Only - Programming Language :: Python :: 3.7 - Programming Language :: Python :: 3.8 - Programming Language :: Python :: 3.9 - Programming Language :: Python :: 3.10 - Topic :: Scientific/Engineering :: Image Recognition - Topic :: Text Processing :: Indexing - Topic :: Text Processing :: Linguistic -keywords = - PDF - OCR - optical character recognition - PDF/A - scanning -project_urls = - Documentation = https://ocrmypdf.readthedocs.io/ - Source = https://github.com/ocrmypdf/OCRmyPDF - Tracker = https://github.com/ocrmypdf/OCRmyPDF/issues - -[options] -packages = find: -install_requires = - Pillow>=8.2.0 - coloredlogs>=14.0 # strictly optional - img2pdf>=0.3.0 # pure Python - packaging>=20 - pdfminer.six!=20200720,>=20191110 - pikepdf!=5.0.0,>=4.0.0 - pluggy>=0.13.0 - reportlab>=3.5.66 - tqdm>=4 - importlib-metadata>=4;python_version<'3.8' # until Python 3.8 - importlib-resources>=5;python_version<'3.9' # until Python 3.9 - typing-extensions>=4;python_version<'3.10' -python_requires = >=3.7 -include_package_data = True -package_dir = - =src -platforms = any -setup_requires = - setuptools-scm - setuptools-scm-git-archive -zip_safe = False - -[options.packages.find] -where = src - -[options.entry_points] -console_scripts = - ocrmypdf = ocrmypdf.__main__:run - -[options.extras_require] -docs = - sphinx - 
sphinx-issues - sphinx-rtd-theme -extended_test = - PyMuPDF==1.19.1 -test = - coverage[toml]>=5 - pytest>=6.0.0 - pytest-cov>=2.11.1 - pytest-xdist>=2.2.0 - python-xmp-toolkit==2.0.1 # also requires apt-get install libexempi3 - types-Pillow - types-humanfriendly -watcher = - watchdog>=1.0.2 -webservice = - Flask>=1 - -[options.package_data] -ocrmypdf = - data/sRGB.icc - py.typed - -[bdist_wheel] -python-tag = py37 - -[aliases] -test = pytest - -[check-manifest] -ignore = - .github - -[flake8] -ignore = D203,F401,W503,E501,E203,F841 -exclude = .git,__pycache__,docs/conf.py,build,dist,.venv,.venvpp,.eggs,tmp,src/ocrmypdf/lib/ -max-complexity = 10 -max-line-length = 100 diff --git a/src/ocrmypdf/RELEASE.md b/src/ocrmypdf/RELEASE.md index f9eab6db..f034c82e 100644 --- a/src/ocrmypdf/RELEASE.md +++ b/src/ocrmypdf/RELEASE.md @@ -17,11 +17,11 @@ - Check README.md -- Check setup.py +- Check pyproject.toml - Are classifiers up to date? - Is `python_requires` correct? - - Python 3.6 is EOL on December 2021-12. Could drop support then. + - Is it time to drop support for older Pythons? - Can we tighten any `install_requires` dependencies? 
- Search for old version shims we can remove diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py index 7f4a4cd2..25c3039d 100644 --- a/src/ocrmypdf/__init__.py +++ b/src/ocrmypdf/__init__.py @@ -21,7 +21,6 @@ from ocrmypdf.exceptions import ( InputFileError, MissingDependencyError, OutputFileAccessError, - PdfMergeFailedError, PriorOcrFoundError, SubprocessOutputError, TesseractConfigError, diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index 24d620c8..24a64ae9 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -71,6 +71,4 @@ def run(args=None): if __name__ == '__main__': - if sys.platform == 'darwin' and sys.version_info < (3, 8): - set_start_method('spawn') # see python bpo-33725 sys.exit(run()) diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py index d0b007c2..b4b0cd29 100644 --- a/src/ocrmypdf/_exec/ghostscript.py +++ b/src/ocrmypdf/_exec/ghostscript.py @@ -47,21 +47,6 @@ def version(): return get_version(GS) -def jpeg_passthrough_available() -> bool: - """Returns True if the installed version of Ghostscript supports JPEG passthru - - Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23 - it gained the ability to keep JPEGs unmodified. However, the 9.23 - implementation was buggy and would deletes the last two bytes of images in - some cases, as reported here. - https://bugs.ghostscript.com/show_bug.cgi?id=699216 - - The issue was fixed for 9.24, hence that is the first version we consider - the feature available. (Ghostscript 9.24 has its own problems is blacklisted.) - """ - return version() >= '9.24' - - def _gs_error_reported(stream) -> bool: match = re.search(r'error', stream, flags=re.IGNORECASE) return bool(match) @@ -201,19 +186,8 @@ def generate_pdfa( ] strategy = 'LeaveColorUnchanged' - # Older versions of Ghostscript expect a leading slash in - # sColorConversionStrategy, newer ones should not have it. See Ghostscript - # git commit fe1c025d. 
gs_version = version() - strategy = ('/' + strategy) if gs_version < '9.19' else strategy - - if gs_version == '9.23': - # 9.23: added JPEG passthrough as a new feature, but with a bug that - # incorrectly formats some images. Fixed as of 9.24. So we disable this - # feature for 9.23. - # https://bugs.ghostscript.com/show_bug.cgi?id=699216 - compression_args.append('-dPassThroughJPEGImages=false') - elif gs_version == '9.56.0': + if gs_version == '9.56.0': # 9.56.0 breaks our OCR, should be fixed in 9.56.1 # https://bugs.ghostscript.com/show_bug.cgi?id=705187 compression_args.append('-dNEWPDF=false') diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py index ad98836a..adb1ec17 100644 --- a/src/ocrmypdf/_exec/tesseract.py +++ b/src/ocrmypdf/_exec/tesseract.py @@ -33,7 +33,7 @@ HOCR_TEMPLATE = """ - + @@ -114,15 +114,6 @@ def version() -> str: return get_version('tesseract', regex=r'tesseract\s(.+)') -def has_user_words() -> bool: - """Does Tesseract have --user-words capability? - - Not available in 4.0, but available in 4.1. Also available in 3.x, but - we no longer support 3.x. - """ - return version() >= '4.1' - - def has_thresholding() -> bool: """Does Tesseract have -c thresholding method capability?""" return version() >= '5.0' @@ -244,7 +235,7 @@ def get_deskew( def tesseract_log_output(stream: bytes) -> None: tlog = TesseractLoggerAdapter( - log, extra=log.extra if hasattr(log, 'extra') else None + log, extra=log.extra if hasattr(log, 'extra') else None # type: ignore ) if not stream: diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index d7f24265..dd3a5116 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -1,12 +1,10 @@ # SPDX-FileCopyrightText: 2022 James R. 
Barlow # SPDX-License-Identifier: MPL-2.0 -from __future__ import annotations - -# unpaper documentation: -# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md """Interface to unpaper executable""" +from __future__ import annotations + import logging import os import shlex @@ -22,6 +20,10 @@ from PIL import Image from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError from ocrmypdf.subprocess import get_version, run +# unpaper documentation: +# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md + + if sys.version_info >= (3, 10): from tempfile import TemporaryDirectory else: diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 26d986f1..8521d1c6 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -207,7 +207,7 @@ def exec_page_sync(page_context: PageContext) -> PageResult: filtered_image = page_context.plugin_manager.hook.filter_page_image( page=page_context, image_filename=visible_image_out ) - if filtered_image: + if filtered_image is not None: # None if no hook is present visible_image_out = filtered_image pdf_page_from_image_out = create_pdf_page_from_image( visible_image_out, page_context, orientation_correction diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index 0533b928..977ea14e 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -134,7 +134,7 @@ def check_options_preprocessing(options: Namespace) -> None: package='unpaper', version_checker=unpaper.version, need_version='6.1', - required_for=['--clean, --clean-final'], + required_for="--clean, --clean-final", # Problem arguments ) try: if options.unpaper_args: @@ -221,7 +221,7 @@ def check_options_metadata(options: Namespace) -> None: def check_options_pillow(options: Namespace) -> None: PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000) if PIL.Image.MAX_IMAGE_PIXELS == 0: - PIL.Image.MAX_IMAGE_PIXELS = None + PIL.Image.MAX_IMAGE_PIXELS = None # 
type: ignore def _check_plugin_invariant_options(options: Namespace) -> None: diff --git a/src/ocrmypdf/_version.py b/src/ocrmypdf/_version.py index 11f8ed3f..4b368f3e 100644 --- a/src/ocrmypdf/_version.py +++ b/src/ocrmypdf/_version.py @@ -8,10 +8,7 @@ OCRmyPDF uses setuptools_scm to derive version from git tags. from __future__ import annotations -try: - from importlib.metadata import version as _package_version -except ImportError: - from importlib_metadata import version as _package_version # type: ignore +from importlib.metadata import version as _package_version PROGRAM_NAME = 'ocrmypdf' diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index 8d92eb71..52821869 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -15,6 +15,9 @@ from pathlib import Path from typing import AnyStr, BinaryIO, Iterable, Union from warnings import warn +import coloredlogs +from humanfriendly.terminal import enable_ansi_support + from ocrmypdf._logging import PageNumberFilter, TqdmConsole from ocrmypdf._plugin_manager import get_plugin_manager from ocrmypdf._sync import run_pipeline @@ -22,15 +25,6 @@ from ocrmypdf._validation import check_options from ocrmypdf.cli import ArgumentParser, get_parser from ocrmypdf.helpers import is_iterable_notstr -try: - import coloredlogs -except ModuleNotFoundError: - coloredlogs = None # pylint: disable=invalid-name - -if coloredlogs: - from humanfriendly.terminal import enable_ansi_support - - StrPath = Union[Path, AnyStr] PathOrIO = Union[BinaryIO, StrPath] @@ -121,7 +115,7 @@ def configure_logging( use_colors = progress_bar_friendly formatter = None - if coloredlogs and use_colors: + if use_colors: use_colors = enable_ansi_support() if use_colors: use_colors = coloredlogs.terminal_supports_colors() @@ -284,8 +278,6 @@ def ocr( # pylint: disable=unused-argument ``"-"``, some final validation steps are not performed (we do not read back the stream after it is written). 
Raises: - ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging - with the OCR layer. ocrmypdf.MissingDependencyError: If a required dependency program is missing or was not found on PATH. ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py index 64375ad8..7ab8d7c9 100644 --- a/src/ocrmypdf/builtin_plugins/ghostscript.py +++ b/src/ocrmypdf/builtin_plugins/ghostscript.py @@ -21,37 +21,19 @@ def check_options(options): program='gs', package='ghostscript', version_checker=ghostscript.version, - need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports + need_version='9.50', # Ubuntu 20.04's version ) gs_version = ghostscript.version() - if gs_version in ('9.24', '9.51'): + if gs_version in ('9.51',): raise MissingDependencyError( f"Ghostscript {gs_version} contains serious regressions and is not " "supported. Please upgrade to a newer version, or downgrade to the " "previous version." ) - # We have these constraints to check for. - # 1. Ghostscript < 9.20 mangles multibyte Unicode - # 2. hocr doesn't work on non-Latin languages (so don't select it) - is_latin = options.languages.issubset(HOCR_OK_LANGS) - if gs_version < '9.20' and options.output_type != 'pdf' and not is_latin: - # https://bugs.ghostscript.com/show_bug.cgi?id=696874 - # Ghostscript < 9.20 fails to encode multibyte characters properly - log.warning( - f"The installed version of Ghostscript ({gs_version}) does not work " - "correctly with the OCR languages you specified. Use --output-type pdf or " - "upgrade to Ghostscript 9.20 or later to avoid this issue." 
- ) - if options.output_type == 'pdfa': options.output_type = 'pdfa-2' - if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19': - raise MissingDependencyError( - "--output-type pdfa-3 requires Ghostscript 9.19 or later" - ) - @hookimpl def rasterize_pdf_page( diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py index d9e24e4e..8372577c 100644 --- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py +++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py @@ -43,7 +43,7 @@ def add_options(parser): metavar='MODE', choices=range(0, 4), help=( - "Set Tesseract 4.0+ OCR engine mode: " + "Set Tesseract 4+ OCR engine mode: " "0 - original Tesseract only; " "1 - neural nets LSTM only; " "2 - Tesseract + LSTM; " @@ -93,7 +93,7 @@ def check_options(options): program='tesseract', package={'linux': 'tesseract-ocr'}, version_checker=tesseract.version, - need_version='4.0.0-beta.1', # using backport for Travis CI + need_version='4.1.1', # Ubuntu 20.04 version version_parser=tesseract.TesseractVersion, ) @@ -101,11 +101,6 @@ def check_options(options): if options.pdf_renderer == 'auto': options.pdf_renderer = 'sandwich' - if not tesseract.has_user_words() and (options.user_words or options.user_patterns): - log.warning( - "Tesseract 4.0 (which you have installed) ignores --user-words and " - "--user-patterns, so these arguments have no effect." - ) if not tesseract.has_thresholding() and options.tesseract_thresholding != 0: log.warning( "The installed version of Tesseract does not support changes to its " diff --git a/src/ocrmypdf/exceptions.py b/src/ocrmypdf/exceptions.py index d5a9ae8e..b56e8a1a 100644 --- a/src/ocrmypdf/exceptions.py +++ b/src/ocrmypdf/exceptions.py @@ -47,26 +47,6 @@ class BadArgsError(ExitCodeException): exit_code = ExitCode.bad_args -class PdfMergeFailedError(ExitCodeException): # deprecated - """An intermediate PDF can't be merged. - - No longer in use. 
- """ - - exit_code = ExitCode.input_file - message = dedent( - '''\ - Failed to merge PDF image layer with OCR layer - - Usually this happens because the input PDF file is malformed and - ocrmypdf cannot correct the problem on its own. - - Try using - ocrmypdf --pdf-renderer sandwich [..other args..] - ''' - ) - - class MissingDependencyError(ExitCodeException): """A third-party dependency is missing.""" diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py index 3882497c..aed5a750 100644 --- a/src/ocrmypdf/helpers.py +++ b/src/ocrmypdf/helpers.py @@ -12,7 +12,6 @@ import shutil import warnings from collections.abc import Iterable from contextlib import suppress -from functools import wraps from io import StringIO from math import isclose, isfinite from pathlib import Path @@ -291,20 +290,3 @@ def pikepdf_enable_mmap(): # Fix is not in pybind11 2.6.0 # log.debug("pikepdf mmap disabled") return - - -def deprecated(func): - """Warn that function is deprecated.""" - - @wraps(func) - def new_func(*args, **kwargs): - warnings.simplefilter('always', DeprecationWarning) # turn off filter - warnings.warn( - f"Call to deprecated function {func.__name__}.", - category=DeprecationWarning, - stacklevel=2, - ) - warnings.simplefilter('default', DeprecationWarning) # reset filter - return func(*args, **kwargs) - - return new_func diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py index 409aa689..306dfa24 100755 --- a/src/ocrmypdf/hocrtransform.py +++ b/src/ocrmypdf/hocrtransform.py @@ -14,10 +14,11 @@ import re import warnings from math import atan, cos, sin from pathlib import Path -from typing import Any, NamedTuple, Optional, Tuple, Union +from typing import Any, NamedTuple from xml.etree import ElementTree with warnings.catch_warnings(): + # reportlab uses deprecated load_module warnings.filterwarnings( 'ignore', category=DeprecationWarning, message=r".*load_module.*" ) diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py 
index c3298e4c..dfa2e3ea 100644 --- a/src/ocrmypdf/pluginspec.py +++ b/src/ocrmypdf/pluginspec.py @@ -278,10 +278,6 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path: will occur. The return value should be a path to a file in the same folder as ``image_filename``. - Implementation detail: If the value returned is falsy, OCRmyPDF will ignore - the return value and assume the input file was unmodified. This is deprecated. - To leave the image unmodified, ``image_filename`` should be returned. - Note: This hook will be called from child processes. Modifying global state will not affect the main process or other child processes. diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py index cc5b3550..1254727a 100644 --- a/src/ocrmypdf/subprocess/__init__.py +++ b/src/ocrmypdf/subprocess/__init__.py @@ -314,7 +314,7 @@ def check_external_program( program: The name of the program to test. package: The name of a software package that typically supplies this program. Usually the same as program. - version_check: A callable without arguments that retrieves the installed + version_checker: A callable without arguments that retrieves the installed version of program. need_version: The minimum required version. required_for: The name of an argument of feature that requires this program. 
@@ -325,10 +325,7 @@ def check_external_program( """ try: - if callable(version_checker): - found_version = version_checker() - else: # deprecated - found_version = version_checker + found_version = version_checker() except (CalledProcessError, FileNotFoundError) as e: _error_missing_program(program, package, required_for, recommended) if not recommended: diff --git a/src/ocrmypdf/subprocess/_windows.py b/src/ocrmypdf/subprocess/_windows.py index 3a6c2a39..46f68931 100644 --- a/src/ocrmypdf/subprocess/_windows.py +++ b/src/ocrmypdf/subprocess/_windows.py @@ -171,11 +171,6 @@ SHIMS = [ def fix_windows_args(program: str, args, env): """Adjust our desired program and command line arguments for use on Windows""" - if sys.version_info < (3, 8): - # bpo-33617 - Windows needs manual Path -> str conversion - args = [os.fspath(arg) for arg in args] - program = os.fspath(program) - # If we are running a .py on Windows, ensure we call it with this Python # (to support test suite shims) if program.lower().endswith('.py'): diff --git a/tests/plugins/gs_feature_elision.py b/tests/plugins/gs_feature_elision.py index 0f63c502..ce829f5f 100644 --- a/tests/plugins/gs_feature_elision.py +++ b/tests/plugins/gs_feature_elision.py @@ -9,13 +9,13 @@ from ocrmypdf import hookimpl from ocrmypdf.builtin_plugins import ghostscript from ocrmypdf.subprocess import run_polling_stderr -elision_warning = """GPL Ghostscript 9.20: Setting Overprint Mode to 1 +ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1 not permitted in PDF/A-2, overprint mode not set""" def run_append_stderr(*args, **kwargs): proc = run_polling_stderr(*args, **kwargs) - proc.stderr += '\n' + elision_warning + '\n' + proc.stderr += '\n' + ELISION_WARNING + '\n' return proc diff --git a/tests/plugins/tesseract_cache.py b/tests/plugins/tesseract_cache.py index 3725f410..eec8f4c9 100644 --- a/tests/plugins/tesseract_cache.py +++ b/tests/plugins/tesseract_cache.py @@ -21,7 +21,7 @@ were produced. 
Certain operations are not cached and routed to Tesseract OCR directly. -Assumes Tesseract 4.0.0-alpha or higher. +Assumes Tesseract 4+. """ diff --git a/tests/plugins/tesseract_debug_rotate.py b/tests/plugins/tesseract_debug_rotate.py index f278cb3f..1e10cfb7 100644 --- a/tests/plugins/tesseract_debug_rotate.py +++ b/tests/plugins/tesseract_debug_rotate.py @@ -27,7 +27,7 @@ HOCR_TEMPLATE = ''' - + @@ -46,7 +46,7 @@ HOCR_TEMPLATE = ''' class FixedRotateNoopOcrEngine(OcrEngine): @staticmethod def version(): - return '4.0.0' + return '4.1.1' @staticmethod def creator_tag(options): diff --git a/tests/plugins/tesseract_noop.py b/tests/plugins/tesseract_noop.py index 4dd18dfd..68a0bfe6 100644 --- a/tests/plugins/tesseract_noop.py +++ b/tests/plugins/tesseract_noop.py @@ -25,7 +25,7 @@ HOCR_TEMPLATE = ''' - + @@ -44,7 +44,7 @@ HOCR_TEMPLATE = ''' class NoopOcrEngine(OcrEngine): @staticmethod def version(): - return '4.0.0' + return '4.1.1' @staticmethod def creator_tag(options): diff --git a/tests/test_helpers.py b/tests/test_helpers.py index bd43b5b5..e1992b32 100644 --- a/tests/test_helpers.py +++ b/tests/test_helpers.py @@ -54,15 +54,6 @@ def test_no_cpu_count(monkeypatch): assert invoked, "Patched function called during test" -def test_deprecated(): - @helpers.deprecated - def old_function(): - return 42 - - with pytest.deprecated_call(): - assert old_function() == 42 - - skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker") diff --git a/tests/test_main.py b/tests/test_main.py index 676c17cb..198ed3f1 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -545,7 +545,6 @@ def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outp assert p.returncode == ExitCode.invalid_config -@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0') def test_user_words_ocr(resources, outdir): # Does not actually test if --user-words causes output to differ word_list = outdir / 'wordlist.txt' @@ 
-722,11 +721,9 @@ def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpd if compression == "jpeg": assert pdfimage.enc == Encoding.jpeg else: - if ghostscript.jpeg_passthrough_available(): - # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be - # copied without transcoding - so report - if image.endswith('jpg'): - assert pdfimage.enc == Encoding.jpeg + if image.endswith('jpg'): + # Ghostscript JPEG passthrough - no issue + assert pdfimage.enc == Encoding.jpeg else: assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000) diff --git a/tests/test_metadata.py b/tests/test_metadata.py index dbd9f7f9..80b97c78 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -28,9 +28,6 @@ except ImportError: fitz = None -pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning') - - @pytest.mark.parametrize("output_type", ['pdfa', 'pdf']) def test_preserve_docinfo(output_type, resources, outpdf): pdf_before = pikepdf.open(resources / 'graph.pdf') @@ -174,7 +171,12 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf): def libxmp_file_to_dict(): try: with warnings.catch_warnings(): - warnings.simplefilter("ignore", DeprecationWarning) + # libxmp imports distutils.Version, which is deprecated + warnings.filterwarnings( + "ignore", + category=DeprecationWarning, + message=r".*distutils Version classes are deprecated.*", + ) from libxmp.utils import ( file_to_dict, # pylint: disable=import-outside-toplevel ) diff --git a/tests/test_stdio.py b/tests/test_stdio.py index 17536145..577ec607 100644 --- a/tests/test_stdio.py +++ b/tests/test_stdio.py @@ -51,10 +51,6 @@ def test_stdout(ocrmypdf_exec, resources, outpdf): assert check_pdf(output_file) -@pytest.mark.xfail( - os.name == 'nt' and sys.version_info < (3, 8), - reason="Windows does not like this; not sure how to fix", -) def test_dev_null(resources): if 'COV_CORE_DATAFILE' in os.environ: pytest.skip("Coverage uses stdout") diff 
--git a/tests/test_validation.py b/tests/test_validation.py index fca0cc32..192925f8 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -48,22 +48,6 @@ def test_hocr_notlatin_warning(caplog): assert 'PDF renderer is known to cause' in caplog.text -def test_old_ghostscript(caplog): - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'), patch( - 'ocrmypdf._exec.tesseract.get_languages', return_value={'eng', 'chi_sim'} - ): - vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa')) - assert 'does not work correctly' in caplog.text - - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'): - with pytest.raises(MissingDependencyError): - vd.check_options(*make_opts_pm(output_type='pdfa-3')) - - with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'): - with pytest.raises(MissingDependencyError): - vd.check_options(*make_opts_pm()) - - def test_old_tesseract_error(): with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'): with pytest.raises(MissingDependencyError): @@ -103,22 +87,6 @@ def test_optimizing(caplog): assert 'will be ignored because' in caplog.text -def test_user_words(caplog): - with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False): - vd.check_options(*make_opts_pm(user_words='foo')) - assert ( - 'Tesseract 4.0 (which you have installed) ignores --user-words' - in caplog.text - ) - caplog.clear() - with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True): - vd.check_options(*make_opts_pm(user_patterns='foo')) - assert ( - 'Tesseract 4.0 (which you have installed) ignores --user-words' - not in caplog.text - ) - - def test_pillow_options(): vd.check_options_pillow(make_opts(max_image_mpixels=0)) @@ -229,37 +197,38 @@ def test_version_comparison(): program="tesseract", package="tesseract", version_checker=lambda: '4.0.0-beta.1', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) 
vd.check_external_program( program="tesseract", package="tesseract", version_checker=lambda: 'v5.0.0-alpha.20200201', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", version_checker=lambda: '5.0.0-rc1.20211030', - need_version='4.0.0', + need_version='4.1.1', version_parser=TesseractVersion, ) vd.check_external_program( program="tesseract", package="tesseract", - version_checker=lambda: 'v4.0.0.20181030', # Some Windows builds use this format - need_version='4.0.0', - version_parser=TesseractVersion, - ) - vd.check_external_program( - program="tesseract", - package="tesseract", - version_checker=lambda: '4.1.1-rc2-25-g9707', - need_version='4.0.0', + version_checker=lambda: 'v4.1.1.20181030', # Some Windows builds use this format + need_version='4.1.1', version_parser=TesseractVersion, ) + with pytest.raises(MissingDependencyError): + vd.check_external_program( + program="tesseract", + package="tesseract", + version_checker=lambda: '4.1.1-rc2-25-g9707', + need_version='4.1.1', + version_parser=TesseractVersion, + ) with pytest.raises(MissingDependencyError): vd.check_external_program( program="dummy_fails",