diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index dbb96baf..f9a15f41 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -22,8 +22,6 @@ jobs:
strategy:
matrix:
include:
- - os: ubuntu-18.04
- python: "3.7"
- os: ubuntu-20.04
python: "3.8"
- os: ubuntu-20.04
@@ -33,7 +31,9 @@ jobs:
- os: ubuntu-latest
python: "3.9"
- os: ubuntu-latest
- python: "pypy-3.8"
+ python: "pypy3.8"
+ - os: ubuntu-latest
+ python: "pypy3.9"
- os: ubuntu-latest
python: "3.9"
tesseract5: true
@@ -75,12 +75,6 @@ jobs:
unpaper \
zlib1g
- - name: Install Ubuntu 18.04 packages
- if: matrix.os == 'ubuntu-18.04'
- run: |
- sudo apt-get install -y --no-install-recommends \
- libexempi3
-
- name: Install Ubuntu 20.04 packages
if: matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-latest'
run: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 1b99b96c..b6a207c8 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,7 +28,7 @@ repos:
rev: v2.37.2
hooks:
- id: pyupgrade
- args: ["--py37-plus"]
+ args: ["--py38-plus"]
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v0.971
hooks:
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index 68859db8..7c505ff5 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -17,7 +17,7 @@ formats:
# Optionally set the version of Python and requirements required to build your docs
python:
- version: "3.7"
+ version: "3.8"
install:
- method: pip
path: .
diff --git a/README.md b/README.md
index 815d2031..a2eb4c5c 100644
--- a/README.md
+++ b/README.md
@@ -70,7 +70,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl
| macOS (Homebrew) | ``brew install ocrmypdf`` |
| macOS (nix) | ``nix-env -i ocrmypdf`` |
| LinuxBrew | ``brew install ocrmypdf`` |
-| FreeBSD | ``pkg install py37-ocrmypdf`` |
+| FreeBSD | ``pkg install py-ocrmypdf`` |
| Conda | ``conda install ocrmypdf`` |
| Ubuntu Snap | ``snap install ocrmypdf`` |
@@ -96,10 +96,7 @@ brew install tesseract-lang
You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
-OCRmyPDF supports Tesseract 4.0 and the beta versions of Tesseract 5.0. It will
-automatically use whichever version it finds first on the `PATH` environment
-variable. On Windows, if `PATH` does not provide a Tesseract binary, we use
-the highest version number that is installed according to the Windows Registry.
+OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry.
## Documentation and support
diff --git a/docs/api.rst b/docs/api.rst
index 0478dd1c..1f185446 100644
--- a/docs/api.rst
+++ b/docs/api.rst
@@ -72,14 +72,6 @@ OCRmyPDF, use processes.
not take at least one of these steps, process semantics will prevent
OCRmyPDF from working correctly.
-.. warning::
-
- On macOS with Python 3.7, you must call
- :func:`multiprocessing.set_start_method("spawn")`. Without this, multiprocessing
- will be unstable. From the command line, OCRmyPDF does this automatically,
- but as an API user you must do this. See Python bpo-33725 for details.
- Python 3.8+ also resolve this automatically.
-
Logging
-------
diff --git a/docs/conf.py b/docs/conf.py
index 4b6bbbb9..ee636db6 100755
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -76,6 +76,8 @@ author = 'James R. Barlow'
# The short X.Y version.
import os
+from importlib.metadata import version as package_version
+
on_rtd = os.environ.get('READTHEDOCS') == 'True'
@@ -96,10 +98,6 @@ if on_rtd:
]
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
-try:
- from importlib_metadata import version as package_version
-except ModuleNotFoundError:
- from importlib.metadata import version as package_version
# The full version, including alpha/beta/rc tags.
release = package_version('ocrmypdf')
diff --git a/docs/cookbook.rst b/docs/cookbook.rst
index 1360d17e..581c6474 100644
--- a/docs/cookbook.rst
+++ b/docs/cookbook.rst
@@ -283,7 +283,7 @@ argument. (Normally, OCRmyPDF will exit with an error if asked to modify
a file with OCR.)
This may be helpful for users who want to take advantage of accuracy
-improvements in Tesseract 4.0 for files they previously OCRed with an
+improvements in Tesseract for files they previously OCRed with an
earlier version of Tesseract and OCRmyPDF.
.. code-block:: bash
diff --git a/docs/installation.rst b/docs/installation.rst
index 79bfb7c4..07c25abd 100644
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -44,7 +44,7 @@ install, or install a more recent version than your platform provides, read on.
Installing on Linux
===================
-Debian and Ubuntu 18.04 or newer
+Debian and Ubuntu 20.04 or newer
--------------------------------
.. |deb-11| image:: https://repology.org/badge/version-for-repo/debian_11/ocrmypdf.svg
@@ -56,9 +56,6 @@ Debian and Ubuntu 18.04 or newer
.. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
:alt: Debian unstable
-.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg
- :alt: Ubuntu 18.04 LTS
-
.. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg
:alt: Ubuntu 20.04 LTS
@@ -72,7 +69,7 @@ Debian and Ubuntu 18.04 or newer
+-----------------------------------------------+
| |deb-11| |deb-12| |deb-unstable| |
+-----------------------------------------------+
-| |ubu-1804| |ubu-2004| |ubu-2204| |
+| |ubu-2004| |ubu-2204| |
+-----------------------------------------------+
Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users
@@ -80,7 +77,7 @@ of Windows Subsystem for Linux, may simply
.. code-block:: bash
- apt-get install ocrmypdf
+ apt install ocrmypdf
As indicated in the table above, Debian and Ubuntu releases may lag
behind the latest version. If the version available for your platform is
@@ -198,46 +195,6 @@ To install for the current user only:
To add JBIG2 encoding, see :ref:`jbig2`.
-Ubuntu 18.04 LTS
-----------------
-
-Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but
-it is quite old now. To install a more recent version, uninstall the old version
-of ocrmypdf, and install the following dependencies:
-
-.. code-block:: bash
-
- sudo apt-get -y remove ocrmypdf
- sudo apt-get -y update
- sudo apt-get -y install \
- ghostscript \
- icc-profiles-free \
- libxml2 \
- pngquant \
- python3-distutils \
- python3-pkg-resources \
- python3-reportlab \
- qpdf \
- tesseract-ocr \
- zlib1g \
- unpaper
-
-We will need a newer version of ``pip`` then was available for Ubuntu 18.04:
-
-.. code-block:: bash
-
- wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py
-
-Then install the most recent ocrmypdf for the local user and set the
-user's ``PATH`` to check for the user's Python packages.
-
-.. code-block:: bash
-
- export PATH=$HOME/.local/bin:$PATH
- python3 -m pip install --user ocrmypdf
-
-To add JBIG2 encoding, see :ref:`jbig2`.
-
Arch Linux (AUR)
----------------
@@ -417,8 +374,8 @@ Native Windows
You must install the following for Windows:
-* Python 3.7 (64-bit) or later
-* Tesseract 4.0 or later
+* Python 3.8 (64-bit) or later
+* Tesseract 4.1.1 or later
* Ghostscript 9.50 or later
Using the `Chocolatey `_ package manager, install the
@@ -481,7 +438,7 @@ Cygwin64
First install the the following prerequisite Cygwin packages using ``setup-x86_64.exe``::
- python37 (or later)
+ python38 (or later)
python3?-devel
python3?-pip
python3?-lxml
@@ -616,9 +573,9 @@ manager. ``pip`` cannot provide them.
The following versions are required:
-- Python 3.7 or newer
-- Ghostscript 9.23 or newer
-- Tesseract 4.0.0 or newer
+- Python 3.8 or newer
+- Ghostscript 9.50 or newer
+- Tesseract 4.1.1 or newer
- jbig2enc 0.29 or newer
- pngquant 2.5 or newer
- unpaper 6.1
@@ -649,7 +606,7 @@ unfortunately, the ``pip install`` command cannot satisfy all of them.
Installing HEAD revision from sources
=====================================
-If you have ``git`` and Python 3.7 or newer installed, you can install
+If you have ``git`` and Python 3.8 or newer installed, you can install
from source. When the ``pip`` installer runs, it will alert you if
dependencies are missing.
diff --git a/docs/introduction.rst b/docs/introduction.rst
index 7c11818b..563844bb 100644
--- a/docs/introduction.rst
+++ b/docs/introduction.rst
@@ -190,8 +190,7 @@ Ghostscript also imposes some limitations:
behavior can be suppressed by setting ``--pdfa-image-compression`` to
``jpeg`` or ``lossless`` to set all images to one type or the other.
Ghostscript has no option to maintain the input image's format.
- (Ghostscript 9.25+ can copy JPEG images without transcoding them;
- earlier versions will transcode.)
+ (All supported Ghostscript versions can copy JPEG images without transcoding them.)
- Ghostscript's PDF/A conversion removes any XMP metadata that is not
one of the standard XMP metadata namespaces for PDFs. In particular,
PRISM Metdata is removed.
diff --git a/pyproject.toml b/pyproject.toml
index 68acb173..24ab6659 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -2,17 +2,101 @@
# SPDX-License-Identifier: MPL-2.0
[build-system]
requires = [
- "setuptools >= 52",
+ "setuptools >= 61",
"setuptools_scm[toml] >= 7.0.5",
"wheel"
]
build-backend = "setuptools.build_meta"
+[project]
+name = "ocrmypdf"
+dynamic = ["version"]
+description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched"
+readme = "README.md"
+license = {text = "MPL-2.0"}
+requires-python = ">=3.8"
+dependencies = [
+ "Pillow>=8.2.0",
+ "coloredlogs>=14.0",
+ "deprecation>=2.1.0",
+ "img2pdf>=0.3.0", # pure Python
+ "packaging>=20",
+ "pdfminer.six!=20200720,>=20191110",
+ "pikepdf!=5.0.0,>=4.0.0",
+ "pluggy>=0.13.0",
+ "reportlab>=3.5.66",
+ "tqdm>=4",
+ "importlib-resources>=5;python_version<'3.9'", # until Python 3.9
+ "typing-extensions>=4;python_version<'3.10'",
+]
+authors = [{name = "James R. Barlow", email="james@purplerock.ca"}]
+classifiers = [
+ "Development Status :: 5 - Production/Stable",
+ "Environment :: Console",
+ "Intended Audience :: End Users/Desktop",
+ "Intended Audience :: Science/Research",
+ "Intended Audience :: System Administrators",
+ "License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
+ "Operating System :: MacOS :: MacOS X",
+ "Operating System :: Microsoft :: Windows :: Windows 10",
+ "Operating System :: POSIX",
+ "Operating System :: POSIX :: BSD",
+ "Operating System :: POSIX :: Linux",
+ "Programming Language :: Python :: 3",
+ "Programming Language :: Python :: 3 :: Only",
+ "Programming Language :: Python :: 3.8",
+ "Programming Language :: Python :: 3.9",
+ "Programming Language :: Python :: 3.10",
+ "Topic :: Scientific/Engineering :: Image Recognition",
+ "Topic :: Text Processing :: Indexing",
+ "Topic :: Text Processing :: Linguistic",
+]
+keywords = [
+ "PDF",
+ "OCR",
+ "optical character recognition",
+ "PDF/A",
+ "scanning",
+]
+
+[project.urls]
+Documentation = "https://ocrmypdf.readthedocs.io/"
+Source = "https://github.com/ocrmypdf/OCRmyPDF"
+Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
+
+[project.optional-dependencies]
+docs = ["sphinx", "sphinx-issues", "sphinx-rtd-theme"]
+extended_test = ["PyMuPDF==1.19.1"]
+test = [
+ "coverage[toml]>=5",
+ "pytest>=6.0.0",
+ "pytest-cov>=2.11.1",
+ "pytest-xdist>=2.2.0",
+ "python-xmp-toolkit==2.0.1", # also requires apt-get install libexempi3
+ "types-Pillow",
+ "types-humanfriendly",
+]
+watcher = ["watchdog>=1.0.2"]
+webservice = ["Flask>=1"]
+
+[project.scripts]
+ocrmypdf = "ocrmypdf.__main__:run"
+
+[tool.setuptools.package-data]
+ocrmypdf = ["data/sRGB.icc", "py.typed"]
+
+[tool.setuptools.packages.find]
+where = ["src"]
+namespaces = false
+
[tool.setuptools_scm]
+[tool.distutils.bdist_wheel]
+python-tag = "py38"
+
[tool.black]
line-length = 88
-target-version = ["py37", "py38"]
+target-version = ["py38", "py39", "py310", "py311"]
skip-string-normalization = true
include = '\.pyi?$'
exclude = '''
@@ -96,8 +180,7 @@ module = [
'pdfminer.*',
'reportlab.*',
'fitz',
- 'libxmp.utils',
- 'importlib_metadata'
+ 'libxmp.utils'
]
ignore_missing_imports = true
diff --git a/setup.cfg b/setup.cfg
deleted file mode 100644
index 3dc89171..00000000
--- a/setup.cfg
+++ /dev/null
@@ -1,116 +0,0 @@
-[metadata]
-name = ocrmypdf
-description = OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched
-long_description = file: README.md
-long_description_content_type = text/markdown
-url = https://github.com/ocrmypdf/OCRmyPDF
-author = James R. Barlow
-author_email = james@purplerock.ca
-license = MPL-2.0
-license_file = LICENSE
-license_files =
- LICENSE
-classifiers =
- Development Status :: 5 - Production/Stable
- Environment :: Console
- Intended Audience :: End Users/Desktop
- Intended Audience :: Science/Research
- Intended Audience :: System Administrators
- License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
- Operating System :: MacOS :: MacOS X
- Operating System :: Microsoft :: Windows :: Windows 10
- Operating System :: POSIX
- Operating System :: POSIX :: BSD
- Operating System :: POSIX :: Linux
- Programming Language :: Python :: 3
- Programming Language :: Python :: 3 :: Only
- Programming Language :: Python :: 3.7
- Programming Language :: Python :: 3.8
- Programming Language :: Python :: 3.9
- Programming Language :: Python :: 3.10
- Topic :: Scientific/Engineering :: Image Recognition
- Topic :: Text Processing :: Indexing
- Topic :: Text Processing :: Linguistic
-keywords =
- PDF
- OCR
- optical character recognition
- PDF/A
- scanning
-project_urls =
- Documentation = https://ocrmypdf.readthedocs.io/
- Source = https://github.com/ocrmypdf/OCRmyPDF
- Tracker = https://github.com/ocrmypdf/OCRmyPDF/issues
-
-[options]
-packages = find:
-install_requires =
- Pillow>=8.2.0
- coloredlogs>=14.0 # strictly optional
- img2pdf>=0.3.0 # pure Python
- packaging>=20
- pdfminer.six!=20200720,>=20191110
- pikepdf!=5.0.0,>=4.0.0
- pluggy>=0.13.0
- reportlab>=3.5.66
- tqdm>=4
- importlib-metadata>=4;python_version<'3.8' # until Python 3.8
- importlib-resources>=5;python_version<'3.9' # until Python 3.9
- typing-extensions>=4;python_version<'3.10'
-python_requires = >=3.7
-include_package_data = True
-package_dir =
- =src
-platforms = any
-setup_requires =
- setuptools-scm
- setuptools-scm-git-archive
-zip_safe = False
-
-[options.packages.find]
-where = src
-
-[options.entry_points]
-console_scripts =
- ocrmypdf = ocrmypdf.__main__:run
-
-[options.extras_require]
-docs =
- sphinx
- sphinx-issues
- sphinx-rtd-theme
-extended_test =
- PyMuPDF==1.19.1
-test =
- coverage[toml]>=5
- pytest>=6.0.0
- pytest-cov>=2.11.1
- pytest-xdist>=2.2.0
- python-xmp-toolkit==2.0.1 # also requires apt-get install libexempi3
- types-Pillow
- types-humanfriendly
-watcher =
- watchdog>=1.0.2
-webservice =
- Flask>=1
-
-[options.package_data]
-ocrmypdf =
- data/sRGB.icc
- py.typed
-
-[bdist_wheel]
-python-tag = py37
-
-[aliases]
-test = pytest
-
-[check-manifest]
-ignore =
- .github
-
-[flake8]
-ignore = D203,F401,W503,E501,E203,F841
-exclude = .git,__pycache__,docs/conf.py,build,dist,.venv,.venvpp,.eggs,tmp,src/ocrmypdf/lib/
-max-complexity = 10
-max-line-length = 100
diff --git a/src/ocrmypdf/RELEASE.md b/src/ocrmypdf/RELEASE.md
index f9eab6db..f034c82e 100644
--- a/src/ocrmypdf/RELEASE.md
+++ b/src/ocrmypdf/RELEASE.md
@@ -17,11 +17,11 @@
- Check README.md
-- Check setup.py
+- Check pyproject.toml
- Are classifiers up to date?
- Is `python_requires` correct?
- - Python 3.6 is EOL on December 2021-12. Could drop support then.
+ - Is it time to drop support for older Pythons?
- Can we tighten any `install_requires` dependencies?
- Search for old version shims we can remove
diff --git a/src/ocrmypdf/__init__.py b/src/ocrmypdf/__init__.py
index 7f4a4cd2..25c3039d 100644
--- a/src/ocrmypdf/__init__.py
+++ b/src/ocrmypdf/__init__.py
@@ -21,7 +21,6 @@ from ocrmypdf.exceptions import (
InputFileError,
MissingDependencyError,
OutputFileAccessError,
- PdfMergeFailedError,
PriorOcrFoundError,
SubprocessOutputError,
TesseractConfigError,
diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py
index 24d620c8..24a64ae9 100755
--- a/src/ocrmypdf/__main__.py
+++ b/src/ocrmypdf/__main__.py
@@ -71,6 +71,4 @@ def run(args=None):
if __name__ == '__main__':
- if sys.platform == 'darwin' and sys.version_info < (3, 8):
- set_start_method('spawn') # see python bpo-33725
sys.exit(run())
diff --git a/src/ocrmypdf/_exec/ghostscript.py b/src/ocrmypdf/_exec/ghostscript.py
index d0b007c2..b4b0cd29 100644
--- a/src/ocrmypdf/_exec/ghostscript.py
+++ b/src/ocrmypdf/_exec/ghostscript.py
@@ -47,21 +47,6 @@ def version():
return get_version(GS)
-def jpeg_passthrough_available() -> bool:
- """Returns True if the installed version of Ghostscript supports JPEG passthru
-
- Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
- it gained the ability to keep JPEGs unmodified. However, the 9.23
- implementation was buggy and would deletes the last two bytes of images in
- some cases, as reported here.
- https://bugs.ghostscript.com/show_bug.cgi?id=699216
-
- The issue was fixed for 9.24, hence that is the first version we consider
- the feature available. (Ghostscript 9.24 has its own problems is blacklisted.)
- """
- return version() >= '9.24'
-
-
def _gs_error_reported(stream) -> bool:
match = re.search(r'error', stream, flags=re.IGNORECASE)
return bool(match)
@@ -201,19 +186,8 @@ def generate_pdfa(
]
strategy = 'LeaveColorUnchanged'
- # Older versions of Ghostscript expect a leading slash in
- # sColorConversionStrategy, newer ones should not have it. See Ghostscript
- # git commit fe1c025d.
gs_version = version()
- strategy = ('/' + strategy) if gs_version < '9.19' else strategy
-
- if gs_version == '9.23':
- # 9.23: added JPEG passthrough as a new feature, but with a bug that
- # incorrectly formats some images. Fixed as of 9.24. So we disable this
- # feature for 9.23.
- # https://bugs.ghostscript.com/show_bug.cgi?id=699216
- compression_args.append('-dPassThroughJPEGImages=false')
- elif gs_version == '9.56.0':
+ if gs_version == '9.56.0':
# 9.56.0 breaks our OCR, should be fixed in 9.56.1
# https://bugs.ghostscript.com/show_bug.cgi?id=705187
compression_args.append('-dNEWPDF=false')
diff --git a/src/ocrmypdf/_exec/tesseract.py b/src/ocrmypdf/_exec/tesseract.py
index ad98836a..adb1ec17 100644
--- a/src/ocrmypdf/_exec/tesseract.py
+++ b/src/ocrmypdf/_exec/tesseract.py
@@ -33,7 +33,7 @@ HOCR_TEMPLATE = """
-
+
@@ -114,15 +114,6 @@ def version() -> str:
return get_version('tesseract', regex=r'tesseract\s(.+)')
-def has_user_words() -> bool:
- """Does Tesseract have --user-words capability?
-
- Not available in 4.0, but available in 4.1. Also available in 3.x, but
- we no longer support 3.x.
- """
- return version() >= '4.1'
-
-
def has_thresholding() -> bool:
"""Does Tesseract have -c thresholding method capability?"""
return version() >= '5.0'
@@ -244,7 +235,7 @@ def get_deskew(
def tesseract_log_output(stream: bytes) -> None:
tlog = TesseractLoggerAdapter(
- log, extra=log.extra if hasattr(log, 'extra') else None
+ log, extra=log.extra if hasattr(log, 'extra') else None # type: ignore
)
if not stream:
diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py
index d7f24265..dd3a5116 100644
--- a/src/ocrmypdf/_exec/unpaper.py
+++ b/src/ocrmypdf/_exec/unpaper.py
@@ -1,12 +1,10 @@
# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
-from __future__ import annotations
-
-# unpaper documentation:
-# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
"""Interface to unpaper executable"""
+from __future__ import annotations
+
import logging
import os
import shlex
@@ -22,6 +20,10 @@ from PIL import Image
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
from ocrmypdf.subprocess import get_version, run
+# unpaper documentation:
+# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
+
+
if sys.version_info >= (3, 10):
from tempfile import TemporaryDirectory
else:
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index 26d986f1..8521d1c6 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -207,7 +207,7 @@ def exec_page_sync(page_context: PageContext) -> PageResult:
filtered_image = page_context.plugin_manager.hook.filter_page_image(
page=page_context, image_filename=visible_image_out
)
- if filtered_image:
+ if filtered_image is not None: # None if no hook is present
visible_image_out = filtered_image
pdf_page_from_image_out = create_pdf_page_from_image(
visible_image_out, page_context, orientation_correction
diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py
index 0533b928..977ea14e 100644
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -134,7 +134,7 @@ def check_options_preprocessing(options: Namespace) -> None:
package='unpaper',
version_checker=unpaper.version,
need_version='6.1',
- required_for=['--clean, --clean-final'],
+ required_for="--clean, --clean-final", # Problem arguments
)
try:
if options.unpaper_args:
@@ -221,7 +221,7 @@ def check_options_metadata(options: Namespace) -> None:
def check_options_pillow(options: Namespace) -> None:
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
if PIL.Image.MAX_IMAGE_PIXELS == 0:
- PIL.Image.MAX_IMAGE_PIXELS = None
+ PIL.Image.MAX_IMAGE_PIXELS = None # type: ignore
def _check_plugin_invariant_options(options: Namespace) -> None:
diff --git a/src/ocrmypdf/_version.py b/src/ocrmypdf/_version.py
index 11f8ed3f..4b368f3e 100644
--- a/src/ocrmypdf/_version.py
+++ b/src/ocrmypdf/_version.py
@@ -8,10 +8,7 @@ OCRmyPDF uses setuptools_scm to derive version from git tags.
from __future__ import annotations
-try:
- from importlib.metadata import version as _package_version
-except ImportError:
- from importlib_metadata import version as _package_version # type: ignore
+from importlib.metadata import version as _package_version
PROGRAM_NAME = 'ocrmypdf'
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index 8d92eb71..52821869 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -15,6 +15,9 @@ from pathlib import Path
from typing import AnyStr, BinaryIO, Iterable, Union
from warnings import warn
+import coloredlogs
+from humanfriendly.terminal import enable_ansi_support
+
from ocrmypdf._logging import PageNumberFilter, TqdmConsole
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._sync import run_pipeline
@@ -22,15 +25,6 @@ from ocrmypdf._validation import check_options
from ocrmypdf.cli import ArgumentParser, get_parser
from ocrmypdf.helpers import is_iterable_notstr
-try:
- import coloredlogs
-except ModuleNotFoundError:
- coloredlogs = None # pylint: disable=invalid-name
-
-if coloredlogs:
- from humanfriendly.terminal import enable_ansi_support
-
-
StrPath = Union[Path, AnyStr]
PathOrIO = Union[BinaryIO, StrPath]
@@ -121,7 +115,7 @@ def configure_logging(
use_colors = progress_bar_friendly
formatter = None
- if coloredlogs and use_colors:
+ if use_colors:
use_colors = enable_ansi_support()
if use_colors:
use_colors = coloredlogs.terminal_supports_colors()
@@ -284,8 +278,6 @@ def ocr( # pylint: disable=unused-argument
``"-"``, some final validation steps are not performed (we do not read
back the stream after it is written).
Raises:
- ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
- with the OCR layer.
ocrmypdf.MissingDependencyError: If a required dependency program is missing or
was not found on PATH.
ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
diff --git a/src/ocrmypdf/builtin_plugins/ghostscript.py b/src/ocrmypdf/builtin_plugins/ghostscript.py
index 64375ad8..7ab8d7c9 100644
--- a/src/ocrmypdf/builtin_plugins/ghostscript.py
+++ b/src/ocrmypdf/builtin_plugins/ghostscript.py
@@ -21,37 +21,19 @@ def check_options(options):
program='gs',
package='ghostscript',
version_checker=ghostscript.version,
- need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports
+ need_version='9.50', # Ubuntu 20.04's version
)
gs_version = ghostscript.version()
- if gs_version in ('9.24', '9.51'):
+ if gs_version in ('9.51',):
raise MissingDependencyError(
f"Ghostscript {gs_version} contains serious regressions and is not "
"supported. Please upgrade to a newer version, or downgrade to the "
"previous version."
)
- # We have these constraints to check for.
- # 1. Ghostscript < 9.20 mangles multibyte Unicode
- # 2. hocr doesn't work on non-Latin languages (so don't select it)
- is_latin = options.languages.issubset(HOCR_OK_LANGS)
- if gs_version < '9.20' and options.output_type != 'pdf' and not is_latin:
- # https://bugs.ghostscript.com/show_bug.cgi?id=696874
- # Ghostscript < 9.20 fails to encode multibyte characters properly
- log.warning(
- f"The installed version of Ghostscript ({gs_version}) does not work "
- "correctly with the OCR languages you specified. Use --output-type pdf or "
- "upgrade to Ghostscript 9.20 or later to avoid this issue."
- )
-
if options.output_type == 'pdfa':
options.output_type = 'pdfa-2'
- if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
- raise MissingDependencyError(
- "--output-type pdfa-3 requires Ghostscript 9.19 or later"
- )
-
@hookimpl
def rasterize_pdf_page(
diff --git a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
index d9e24e4e..8372577c 100644
--- a/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
+++ b/src/ocrmypdf/builtin_plugins/tesseract_ocr.py
@@ -43,7 +43,7 @@ def add_options(parser):
metavar='MODE',
choices=range(0, 4),
help=(
- "Set Tesseract 4.0+ OCR engine mode: "
+ "Set Tesseract 4+ OCR engine mode: "
"0 - original Tesseract only; "
"1 - neural nets LSTM only; "
"2 - Tesseract + LSTM; "
@@ -93,7 +93,7 @@ def check_options(options):
program='tesseract',
package={'linux': 'tesseract-ocr'},
version_checker=tesseract.version,
- need_version='4.0.0-beta.1', # using backport for Travis CI
+ need_version='4.1.1', # Ubuntu 20.04 version
version_parser=tesseract.TesseractVersion,
)
@@ -101,11 +101,6 @@ def check_options(options):
if options.pdf_renderer == 'auto':
options.pdf_renderer = 'sandwich'
- if not tesseract.has_user_words() and (options.user_words or options.user_patterns):
- log.warning(
- "Tesseract 4.0 (which you have installed) ignores --user-words and "
- "--user-patterns, so these arguments have no effect."
- )
if not tesseract.has_thresholding() and options.tesseract_thresholding != 0:
log.warning(
"The installed version of Tesseract does not support changes to its "
diff --git a/src/ocrmypdf/exceptions.py b/src/ocrmypdf/exceptions.py
index d5a9ae8e..b56e8a1a 100644
--- a/src/ocrmypdf/exceptions.py
+++ b/src/ocrmypdf/exceptions.py
@@ -47,26 +47,6 @@ class BadArgsError(ExitCodeException):
exit_code = ExitCode.bad_args
-class PdfMergeFailedError(ExitCodeException): # deprecated
- """An intermediate PDF can't be merged.
-
- No longer in use.
- """
-
- exit_code = ExitCode.input_file
- message = dedent(
- '''\
- Failed to merge PDF image layer with OCR layer
-
- Usually this happens because the input PDF file is malformed and
- ocrmypdf cannot correct the problem on its own.
-
- Try using
- ocrmypdf --pdf-renderer sandwich [..other args..]
- '''
- )
-
-
class MissingDependencyError(ExitCodeException):
"""A third-party dependency is missing."""
diff --git a/src/ocrmypdf/helpers.py b/src/ocrmypdf/helpers.py
index 3882497c..aed5a750 100644
--- a/src/ocrmypdf/helpers.py
+++ b/src/ocrmypdf/helpers.py
@@ -12,7 +12,6 @@ import shutil
import warnings
from collections.abc import Iterable
from contextlib import suppress
-from functools import wraps
from io import StringIO
from math import isclose, isfinite
from pathlib import Path
@@ -291,20 +290,3 @@ def pikepdf_enable_mmap():
# Fix is not in pybind11 2.6.0
# log.debug("pikepdf mmap disabled")
return
-
-
-def deprecated(func):
- """Warn that function is deprecated."""
-
- @wraps(func)
- def new_func(*args, **kwargs):
- warnings.simplefilter('always', DeprecationWarning) # turn off filter
- warnings.warn(
- f"Call to deprecated function {func.__name__}.",
- category=DeprecationWarning,
- stacklevel=2,
- )
- warnings.simplefilter('default', DeprecationWarning) # reset filter
- return func(*args, **kwargs)
-
- return new_func
diff --git a/src/ocrmypdf/hocrtransform.py b/src/ocrmypdf/hocrtransform.py
index 409aa689..306dfa24 100755
--- a/src/ocrmypdf/hocrtransform.py
+++ b/src/ocrmypdf/hocrtransform.py
@@ -14,10 +14,11 @@ import re
import warnings
from math import atan, cos, sin
from pathlib import Path
-from typing import Any, NamedTuple, Optional, Tuple, Union
+from typing import Any, NamedTuple
from xml.etree import ElementTree
with warnings.catch_warnings():
+ # reportlab uses deprecated load_module
warnings.filterwarnings(
'ignore', category=DeprecationWarning, message=r".*load_module.*"
)
diff --git a/src/ocrmypdf/pluginspec.py b/src/ocrmypdf/pluginspec.py
index c3298e4c..dfa2e3ea 100644
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -278,10 +278,6 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path:
will occur. The return value should be a path to a file in the same folder
as ``image_filename``.
- Implementation detail: If the value returned is falsy, OCRmyPDF will ignore
- the return value and assume the input file was unmodified. This is deprecated.
- To leave the image unmodified, ``image_filename`` should be returned.
-
Note:
This hook will be called from child processes. Modifying global state
will not affect the main process or other child processes.
diff --git a/src/ocrmypdf/subprocess/__init__.py b/src/ocrmypdf/subprocess/__init__.py
index cc5b3550..1254727a 100644
--- a/src/ocrmypdf/subprocess/__init__.py
+++ b/src/ocrmypdf/subprocess/__init__.py
@@ -314,7 +314,7 @@ def check_external_program(
program: The name of the program to test.
package: The name of a software package that typically supplies this program.
Usually the same as program.
- version_check: A callable without arguments that retrieves the installed
+ version_checker: A callable without arguments that retrieves the installed
version of program.
need_version: The minimum required version.
required_for: The name of an argument of feature that requires this program.
@@ -325,10 +325,7 @@ def check_external_program(
"""
try:
- if callable(version_checker):
- found_version = version_checker()
- else: # deprecated
- found_version = version_checker
+ found_version = version_checker()
except (CalledProcessError, FileNotFoundError) as e:
_error_missing_program(program, package, required_for, recommended)
if not recommended:
diff --git a/src/ocrmypdf/subprocess/_windows.py b/src/ocrmypdf/subprocess/_windows.py
index 3a6c2a39..46f68931 100644
--- a/src/ocrmypdf/subprocess/_windows.py
+++ b/src/ocrmypdf/subprocess/_windows.py
@@ -171,11 +171,6 @@ SHIMS = [
def fix_windows_args(program: str, args, env):
"""Adjust our desired program and command line arguments for use on Windows"""
- if sys.version_info < (3, 8):
- # bpo-33617 - Windows needs manual Path -> str conversion
- args = [os.fspath(arg) for arg in args]
- program = os.fspath(program)
-
# If we are running a .py on Windows, ensure we call it with this Python
# (to support test suite shims)
if program.lower().endswith('.py'):
diff --git a/tests/plugins/gs_feature_elision.py b/tests/plugins/gs_feature_elision.py
index 0f63c502..ce829f5f 100644
--- a/tests/plugins/gs_feature_elision.py
+++ b/tests/plugins/gs_feature_elision.py
@@ -9,13 +9,13 @@ from ocrmypdf import hookimpl
from ocrmypdf.builtin_plugins import ghostscript
from ocrmypdf.subprocess import run_polling_stderr
-elision_warning = """GPL Ghostscript 9.20: Setting Overprint Mode to 1
+ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1
not permitted in PDF/A-2, overprint mode not set"""
def run_append_stderr(*args, **kwargs):
proc = run_polling_stderr(*args, **kwargs)
- proc.stderr += '\n' + elision_warning + '\n'
+ proc.stderr += '\n' + ELISION_WARNING + '\n'
return proc
diff --git a/tests/plugins/tesseract_cache.py b/tests/plugins/tesseract_cache.py
index 3725f410..eec8f4c9 100644
--- a/tests/plugins/tesseract_cache.py
+++ b/tests/plugins/tesseract_cache.py
@@ -21,7 +21,7 @@ were produced.
Certain operations are not cached and routed to Tesseract OCR directly.
-Assumes Tesseract 4.0.0-alpha or higher.
+Assumes Tesseract 4+.
"""
diff --git a/tests/plugins/tesseract_debug_rotate.py b/tests/plugins/tesseract_debug_rotate.py
index f278cb3f..1e10cfb7 100644
--- a/tests/plugins/tesseract_debug_rotate.py
+++ b/tests/plugins/tesseract_debug_rotate.py
@@ -27,7 +27,7 @@ HOCR_TEMPLATE = '''
-
+
@@ -46,7 +46,7 @@ HOCR_TEMPLATE = '''
class FixedRotateNoopOcrEngine(OcrEngine):
@staticmethod
def version():
- return '4.0.0'
+ return '4.1.1'
@staticmethod
def creator_tag(options):
diff --git a/tests/plugins/tesseract_noop.py b/tests/plugins/tesseract_noop.py
index 4dd18dfd..68a0bfe6 100644
--- a/tests/plugins/tesseract_noop.py
+++ b/tests/plugins/tesseract_noop.py
@@ -25,7 +25,7 @@ HOCR_TEMPLATE = '''
-
+
@@ -44,7 +44,7 @@ HOCR_TEMPLATE = '''
class NoopOcrEngine(OcrEngine):
@staticmethod
def version():
- return '4.0.0'
+ return '4.1.1'
@staticmethod
def creator_tag(options):
diff --git a/tests/test_helpers.py b/tests/test_helpers.py
index bd43b5b5..e1992b32 100644
--- a/tests/test_helpers.py
+++ b/tests/test_helpers.py
@@ -54,15 +54,6 @@ def test_no_cpu_count(monkeypatch):
assert invoked, "Patched function called during test"
-def test_deprecated():
- @helpers.deprecated
- def old_function():
- return 42
-
- with pytest.deprecated_call():
- assert old_function() == 42
-
-
skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker")
diff --git a/tests/test_main.py b/tests/test_main.py
index 676c17cb..198ed3f1 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -545,7 +545,6 @@ def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outp
assert p.returncode == ExitCode.invalid_config
-@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0')
def test_user_words_ocr(resources, outdir):
# Does not actually test if --user-words causes output to differ
word_list = outdir / 'wordlist.txt'
@@ -722,11 +721,9 @@ def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpd
if compression == "jpeg":
assert pdfimage.enc == Encoding.jpeg
else:
- if ghostscript.jpeg_passthrough_available():
- # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
- # copied without transcoding - so report
- if image.endswith('jpg'):
- assert pdfimage.enc == Encoding.jpeg
+ if image.endswith('jpg'):
+ # Ghostscript JPEG passthrough copies the JPEG without transcoding, so it stays JPEG
+ assert pdfimage.enc == Encoding.jpeg
else:
assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)
diff --git a/tests/test_metadata.py b/tests/test_metadata.py
index dbd9f7f9..80b97c78 100644
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -28,9 +28,6 @@ except ImportError:
fitz = None
-pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning')
-
-
@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
def test_preserve_docinfo(output_type, resources, outpdf):
pdf_before = pikepdf.open(resources / 'graph.pdf')
@@ -174,7 +171,12 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf):
def libxmp_file_to_dict():
try:
with warnings.catch_warnings():
- warnings.simplefilter("ignore", DeprecationWarning)
+ # libxmp uses the distutils Version classes, which are deprecated
+ warnings.filterwarnings(
+ "ignore",
+ category=DeprecationWarning,
+ message=r".*distutils Version classes are deprecated.*",
+ )
from libxmp.utils import (
file_to_dict, # pylint: disable=import-outside-toplevel
)
diff --git a/tests/test_stdio.py b/tests/test_stdio.py
index 17536145..577ec607 100644
--- a/tests/test_stdio.py
+++ b/tests/test_stdio.py
@@ -51,10 +51,6 @@ def test_stdout(ocrmypdf_exec, resources, outpdf):
assert check_pdf(output_file)
-@pytest.mark.xfail(
- os.name == 'nt' and sys.version_info < (3, 8),
- reason="Windows does not like this; not sure how to fix",
-)
def test_dev_null(resources):
if 'COV_CORE_DATAFILE' in os.environ:
pytest.skip("Coverage uses stdout")
diff --git a/tests/test_validation.py b/tests/test_validation.py
index fca0cc32..192925f8 100644
--- a/tests/test_validation.py
+++ b/tests/test_validation.py
@@ -48,22 +48,6 @@ def test_hocr_notlatin_warning(caplog):
assert 'PDF renderer is known to cause' in caplog.text
-def test_old_ghostscript(caplog):
- with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'), patch(
- 'ocrmypdf._exec.tesseract.get_languages', return_value={'eng', 'chi_sim'}
- ):
- vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa'))
- assert 'does not work correctly' in caplog.text
-
- with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'):
- with pytest.raises(MissingDependencyError):
- vd.check_options(*make_opts_pm(output_type='pdfa-3'))
-
- with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'):
- with pytest.raises(MissingDependencyError):
- vd.check_options(*make_opts_pm())
-
-
def test_old_tesseract_error():
with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'):
with pytest.raises(MissingDependencyError):
@@ -103,22 +87,6 @@ def test_optimizing(caplog):
assert 'will be ignored because' in caplog.text
-def test_user_words(caplog):
- with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False):
- vd.check_options(*make_opts_pm(user_words='foo'))
- assert (
- 'Tesseract 4.0 (which you have installed) ignores --user-words'
- in caplog.text
- )
- caplog.clear()
- with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True):
- vd.check_options(*make_opts_pm(user_patterns='foo'))
- assert (
- 'Tesseract 4.0 (which you have installed) ignores --user-words'
- not in caplog.text
- )
-
-
def test_pillow_options():
vd.check_options_pillow(make_opts(max_image_mpixels=0))
@@ -229,37 +197,38 @@ def test_version_comparison():
program="tesseract",
package="tesseract",
version_checker=lambda: '4.0.0-beta.1',
- need_version='4.0.0',
+ need_version='4.1.1',
version_parser=TesseractVersion,
)
vd.check_external_program(
program="tesseract",
package="tesseract",
version_checker=lambda: 'v5.0.0-alpha.20200201',
- need_version='4.0.0',
+ need_version='4.1.1',
version_parser=TesseractVersion,
)
vd.check_external_program(
program="tesseract",
package="tesseract",
version_checker=lambda: '5.0.0-rc1.20211030',
- need_version='4.0.0',
+ need_version='4.1.1',
version_parser=TesseractVersion,
)
vd.check_external_program(
program="tesseract",
package="tesseract",
- version_checker=lambda: 'v4.0.0.20181030', # Some Windows builds use this format
- need_version='4.0.0',
- version_parser=TesseractVersion,
- )
- vd.check_external_program(
- program="tesseract",
- package="tesseract",
- version_checker=lambda: '4.1.1-rc2-25-g9707',
- need_version='4.0.0',
+ version_checker=lambda: 'v4.1.1.20181030', # Some Windows builds use this format
+ need_version='4.1.1',
version_parser=TesseractVersion,
)
+ with pytest.raises(MissingDependencyError):
+ vd.check_external_program(
+ program="tesseract",
+ package="tesseract",
+ version_checker=lambda: '4.1.1-rc2-25-g9707',
+ need_version='4.1.1',
+ version_parser=TesseractVersion,
+ )
with pytest.raises(MissingDependencyError):
vd.check_external_program(
program="dummy_fails",