mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-07 06:07:58 -04:00
Merge branch 'feature/drop-3.7'
This commit is contained in:
12
.github/workflows/build.yml
vendored
12
.github/workflows/build.yml
vendored
@@ -22,8 +22,6 @@ jobs:
|
||||
strategy:
|
||||
matrix:
|
||||
include:
|
||||
- os: ubuntu-18.04
|
||||
python: "3.7"
|
||||
- os: ubuntu-20.04
|
||||
python: "3.8"
|
||||
- os: ubuntu-20.04
|
||||
@@ -33,7 +31,9 @@ jobs:
|
||||
- os: ubuntu-latest
|
||||
python: "3.9"
|
||||
- os: ubuntu-latest
|
||||
python: "pypy-3.8"
|
||||
python: "pypy3.8"
|
||||
- os: ubuntu-latest
|
||||
python: "pypy3.9"
|
||||
- os: ubuntu-latest
|
||||
python: "3.9"
|
||||
tesseract5: true
|
||||
@@ -75,12 +75,6 @@ jobs:
|
||||
unpaper \
|
||||
zlib1g
|
||||
|
||||
- name: Install Ubuntu 18.04 packages
|
||||
if: matrix.os == 'ubuntu-18.04'
|
||||
run: |
|
||||
sudo apt-get install -y --no-install-recommends \
|
||||
libexempi3
|
||||
|
||||
- name: Install Ubuntu 20.04 packages
|
||||
if: matrix.os == 'ubuntu-20.04' || matrix.os == 'ubuntu-latest'
|
||||
run: |
|
||||
|
||||
@@ -28,7 +28,7 @@ repos:
|
||||
rev: v2.37.2
|
||||
hooks:
|
||||
- id: pyupgrade
|
||||
args: ["--py37-plus"]
|
||||
args: ["--py38-plus"]
|
||||
- repo: https://github.com/pre-commit/mirrors-mypy
|
||||
rev: v0.971
|
||||
hooks:
|
||||
|
||||
@@ -17,7 +17,7 @@ formats:
|
||||
|
||||
# Optionally set the version of Python and requirements required to build your docs
|
||||
python:
|
||||
version: "3.7"
|
||||
version: "3.8"
|
||||
install:
|
||||
- method: pip
|
||||
path: .
|
||||
|
||||
@@ -70,7 +70,7 @@ Linux, Windows, macOS and FreeBSD are supported. Docker images are also availabl
|
||||
| macOS (Homebrew) | ``brew install ocrmypdf`` |
|
||||
| macOS (nix) | ``nix-env -i ocrmypdf`` |
|
||||
| LinuxBrew | ``brew install ocrmypdf`` |
|
||||
| FreeBSD | ``pkg install py37-ocrmypdf`` |
|
||||
| FreeBSD | ``pkg install py-ocrmypdf`` |
|
||||
| Conda | ``conda install ocrmypdf`` |
|
||||
| Ubuntu Snap | ``snap install ocrmypdf`` |
|
||||
|
||||
@@ -96,10 +96,7 @@ brew install tesseract-lang
|
||||
|
||||
You can then pass the `-l LANG` argument to OCRmyPDF to give a hint as to what languages it should search for. Multiple languages can be requested.
|
||||
|
||||
OCRmyPDF supports Tesseract 4.0 and the beta versions of Tesseract 5.0. It will
|
||||
automatically use whichever version it finds first on the `PATH` environment
|
||||
variable. On Windows, if `PATH` does not provide a Tesseract binary, we use
|
||||
the highest version number that is installed according to the Windows Registry.
|
||||
OCRmyPDF supports Tesseract 4.1.1+. It will automatically use whichever version it finds first on the `PATH` environment variable. On Windows, if `PATH` does not provide a Tesseract binary, we use the highest version number that is installed according to the Windows Registry.
|
||||
|
||||
## Documentation and support
|
||||
|
||||
|
||||
@@ -72,14 +72,6 @@ OCRmyPDF, use processes.
|
||||
not take at least one of these steps, process semantics will prevent
|
||||
OCRmyPDF from working correctly.
|
||||
|
||||
.. warning::
|
||||
|
||||
On macOS with Python 3.7, you must call
|
||||
:func:`multiprocessing.set_start_method("spawn")`. Without this, multiprocessing
|
||||
will be unstable. From the command line, OCRmyPDF does this automatically,
|
||||
but as an API user you must do this. See Python bpo-33725 for details.
|
||||
Python 3.8+ also resolve this automatically.
|
||||
|
||||
Logging
|
||||
-------
|
||||
|
||||
|
||||
@@ -76,6 +76,8 @@ author = 'James R. Barlow'
|
||||
# The short X.Y version.
|
||||
|
||||
import os
|
||||
from importlib.metadata import version as package_version
|
||||
|
||||
|
||||
on_rtd = os.environ.get('READTHEDOCS') == 'True'
|
||||
|
||||
@@ -96,10 +98,6 @@ if on_rtd:
|
||||
]
|
||||
sys.modules.update((mod_name, Mock()) for mod_name in MOCK_MODULES)
|
||||
|
||||
try:
|
||||
from importlib_metadata import version as package_version
|
||||
except ModuleNotFoundError:
|
||||
from importlib.metadata import version as package_version
|
||||
|
||||
# The full version, including alpha/beta/rc tags.
|
||||
release = package_version('ocrmypdf')
|
||||
|
||||
@@ -283,7 +283,7 @@ argument. (Normally, OCRmyPDF will exit with an error if asked to modify
|
||||
a file with OCR.)
|
||||
|
||||
This may be helpful for users who want to take advantage of accuracy
|
||||
improvements in Tesseract 4.0 for files they previously OCRed with an
|
||||
improvements in Tesseract for files they previously OCRed with an
|
||||
earlier version of Tesseract and OCRmyPDF.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
@@ -44,7 +44,7 @@ install, or install a more recent version than your platform provides, read on.
|
||||
Installing on Linux
|
||||
===================
|
||||
|
||||
Debian and Ubuntu 18.04 or newer
|
||||
Debian and Ubuntu 20.04 or newer
|
||||
--------------------------------
|
||||
|
||||
.. |deb-11| image:: https://repology.org/badge/version-for-repo/debian_11/ocrmypdf.svg
|
||||
@@ -56,9 +56,6 @@ Debian and Ubuntu 18.04 or newer
|
||||
.. |deb-unstable| image:: https://repology.org/badge/version-for-repo/debian_unstable/ocrmypdf.svg
|
||||
:alt: Debian unstable
|
||||
|
||||
.. |ubu-1804| image:: https://repology.org/badge/version-for-repo/ubuntu_18_04/ocrmypdf.svg
|
||||
:alt: Ubuntu 18.04 LTS
|
||||
|
||||
.. |ubu-2004| image:: https://repology.org/badge/version-for-repo/ubuntu_20_04/ocrmypdf.svg
|
||||
:alt: Ubuntu 20.04 LTS
|
||||
|
||||
@@ -72,7 +69,7 @@ Debian and Ubuntu 18.04 or newer
|
||||
+-----------------------------------------------+
|
||||
| |deb-11| |deb-12| |deb-unstable| |
|
||||
+-----------------------------------------------+
|
||||
| |ubu-1804| |ubu-2004| |ubu-2204| |
|
||||
| |ubu-2004| |ubu-2204| |
|
||||
+-----------------------------------------------+
|
||||
|
||||
Users of Debian 9 ("stretch") or later, or Ubuntu 18.04 or later, including users
|
||||
@@ -80,7 +77,7 @@ of Windows Subsystem for Linux, may simply
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
apt-get install ocrmypdf
|
||||
apt install ocrmypdf
|
||||
|
||||
As indicated in the table above, Debian and Ubuntu releases may lag
|
||||
behind the latest version. If the version available for your platform is
|
||||
@@ -198,46 +195,6 @@ To install for the current user only:
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Ubuntu 18.04 LTS
|
||||
----------------
|
||||
|
||||
Ubuntu 18.04 includes ocrmypdf 6.1.2 - you can install that with ``apt``, but
|
||||
it is quite old now. To install a more recent version, uninstall the old version
|
||||
of ocrmypdf, and install the following dependencies:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
sudo apt-get -y remove ocrmypdf
|
||||
sudo apt-get -y update
|
||||
sudo apt-get -y install \
|
||||
ghostscript \
|
||||
icc-profiles-free \
|
||||
libxml2 \
|
||||
pngquant \
|
||||
python3-distutils \
|
||||
python3-pkg-resources \
|
||||
python3-reportlab \
|
||||
qpdf \
|
||||
tesseract-ocr \
|
||||
zlib1g \
|
||||
unpaper
|
||||
|
||||
We will need a newer version of ``pip`` then was available for Ubuntu 18.04:
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
wget https://bootstrap.pypa.io/get-pip.py && python3 get-pip.py
|
||||
|
||||
Then install the most recent ocrmypdf for the local user and set the
|
||||
user's ``PATH`` to check for the user's Python packages.
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
export PATH=$HOME/.local/bin:$PATH
|
||||
python3 -m pip install --user ocrmypdf
|
||||
|
||||
To add JBIG2 encoding, see :ref:`jbig2`.
|
||||
|
||||
Arch Linux (AUR)
|
||||
----------------
|
||||
|
||||
@@ -417,8 +374,8 @@ Native Windows
|
||||
|
||||
You must install the following for Windows:
|
||||
|
||||
* Python 3.7 (64-bit) or later
|
||||
* Tesseract 4.0 or later
|
||||
* Python 3.8 (64-bit) or later
|
||||
* Tesseract 4.1.1 or later
|
||||
* Ghostscript 9.50 or later
|
||||
|
||||
Using the `Chocolatey <https://chocolatey.org/>`_ package manager, install the
|
||||
@@ -481,7 +438,7 @@ Cygwin64
|
||||
|
||||
First install the following prerequisite Cygwin packages using ``setup-x86_64.exe``::
|
||||
|
||||
python37 (or later)
|
||||
python38 (or later)
|
||||
python3?-devel
|
||||
python3?-pip
|
||||
python3?-lxml
|
||||
@@ -616,9 +573,9 @@ manager. ``pip`` cannot provide them.
|
||||
|
||||
The following versions are required:
|
||||
|
||||
- Python 3.7 or newer
|
||||
- Ghostscript 9.23 or newer
|
||||
- Tesseract 4.0.0 or newer
|
||||
- Python 3.8 or newer
|
||||
- Ghostscript 9.50 or newer
|
||||
- Tesseract 4.1.1 or newer
|
||||
- jbig2enc 0.29 or newer
|
||||
- pngquant 2.5 or newer
|
||||
- unpaper 6.1
|
||||
@@ -649,7 +606,7 @@ unfortunately, the ``pip install`` command cannot satisfy all of them.
|
||||
Installing HEAD revision from sources
|
||||
=====================================
|
||||
|
||||
If you have ``git`` and Python 3.7 or newer installed, you can install
|
||||
If you have ``git`` and Python 3.8 or newer installed, you can install
|
||||
from source. When the ``pip`` installer runs, it will alert you if
|
||||
dependencies are missing.
|
||||
|
||||
|
||||
@@ -190,8 +190,7 @@ Ghostscript also imposes some limitations:
|
||||
behavior can be suppressed by setting ``--pdfa-image-compression`` to
|
||||
``jpeg`` or ``lossless`` to set all images to one type or the other.
|
||||
Ghostscript has no option to maintain the input image's format.
|
||||
(Ghostscript 9.25+ can copy JPEG images without transcoding them;
|
||||
earlier versions will transcode.)
|
||||
(Modern Ghostscript can copy JPEG images without transcoding them.)
|
||||
- Ghostscript's PDF/A conversion removes any XMP metadata that is not
|
||||
one of the standard XMP metadata namespaces for PDFs. In particular,
|
||||
PRISM Metadata is removed.
|
||||
|
||||
@@ -2,17 +2,101 @@
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
[build-system]
|
||||
requires = [
|
||||
"setuptools >= 52",
|
||||
"setuptools >= 61",
|
||||
"setuptools_scm[toml] >= 7.0.5",
|
||||
"wheel"
|
||||
]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "ocrmypdf"
|
||||
dynamic = ["version"]
|
||||
description = "OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched"
|
||||
readme = "README.md"
|
||||
license = {text = "MPL-2.0"}
|
||||
requires-python = ">=3.8"
|
||||
dependencies = [
|
||||
"Pillow>=8.2.0",
|
||||
"coloredlogs>=14.0",
|
||||
"deprecation>=2.1.0",
|
||||
"img2pdf>=0.3.0", # pure Python
|
||||
"packaging>=20",
|
||||
"pdfminer.six!=20200720,>=20191110",
|
||||
"pikepdf!=5.0.0,>=4.0.0",
|
||||
"pluggy>=0.13.0",
|
||||
"reportlab>=3.5.66",
|
||||
"tqdm>=4",
|
||||
"importlib-resources>=5;python_version<'3.9'", # until Python 3.9
|
||||
"typing-extensions>=4;python_version<'3.10'",
|
||||
]
|
||||
authors = [{name = "James R. Barlow", email="james@purplerock.ca"}]
|
||||
classifiers = [
|
||||
"Development Status :: 5 - Production/Stable",
|
||||
"Environment :: Console",
|
||||
"Intended Audience :: End Users/Desktop",
|
||||
"Intended Audience :: Science/Research",
|
||||
"Intended Audience :: System Administrators",
|
||||
"License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)",
|
||||
"Operating System :: MacOS :: MacOS X",
|
||||
"Operating System :: Microsoft :: Windows :: Windows 10",
|
||||
"Operating System :: POSIX",
|
||||
"Operating System :: POSIX :: BSD",
|
||||
"Operating System :: POSIX :: Linux",
|
||||
"Programming Language :: Python :: 3",
|
||||
"Programming Language :: Python :: 3 :: Only",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Programming Language :: Python :: 3.9",
|
||||
"Programming Language :: Python :: 3.10",
|
||||
"Topic :: Scientific/Engineering :: Image Recognition",
|
||||
"Topic :: Text Processing :: Indexing",
|
||||
"Topic :: Text Processing :: Linguistic",
|
||||
]
|
||||
keywords = [
|
||||
"PDF",
|
||||
"OCR",
|
||||
"optical character recognition",
|
||||
"PDF/A",
|
||||
"scanning",
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Documentation = "https://ocrmypdf.readthedocs.io/"
|
||||
Source = "https://github.com/ocrmypdf/OCRmyPDF"
|
||||
Tracker = "https://github.com/ocrmypdf/OCRmyPDF/issues"
|
||||
|
||||
[project.optional-dependencies]
|
||||
docs = ["sphinx", "sphinx-issues", "sphinx-rtd-theme"]
|
||||
extended_test = ["PyMuPDF==1.19.1"]
|
||||
test = [
|
||||
"coverage[toml]>=5",
|
||||
"pytest>=6.0.0",
|
||||
"pytest-cov>=2.11.1",
|
||||
"pytest-xdist>=2.2.0",
|
||||
"python-xmp-toolkit==2.0.1", # also requires apt-get install libexempi3
|
||||
"types-Pillow",
|
||||
"types-humanfriendly",
|
||||
]
|
||||
watcher = ["watchdog>=1.0.2"]
|
||||
webservice = ["Flask>=1"]
|
||||
|
||||
[project.scripts]
|
||||
ocrmypdf = "ocrmypdf.__main__:run"
|
||||
|
||||
[tool.setuptools.package-data]
|
||||
ocrmypdf = ["data/sRGB.icc", "py.typed"]
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
where = ["src"]
|
||||
namespaces = false
|
||||
|
||||
[tool.setuptools_scm]
|
||||
|
||||
[tool.distutils.bdist_wheel]
|
||||
python-tag = "py38"
|
||||
|
||||
[tool.black]
|
||||
line-length = 88
|
||||
target-version = ["py37", "py38"]
|
||||
target-version = ["py38", "py39", "py310", "py311"]
|
||||
skip-string-normalization = true
|
||||
include = '\.pyi?$'
|
||||
exclude = '''
|
||||
@@ -96,8 +180,7 @@ module = [
|
||||
'pdfminer.*',
|
||||
'reportlab.*',
|
||||
'fitz',
|
||||
'libxmp.utils',
|
||||
'importlib_metadata'
|
||||
'libxmp.utils'
|
||||
]
|
||||
ignore_missing_imports = true
|
||||
|
||||
|
||||
116
setup.cfg
116
setup.cfg
@@ -1,116 +0,0 @@
|
||||
[metadata]
|
||||
name = ocrmypdf
|
||||
description = OCRmyPDF adds an OCR text layer to scanned PDF files, allowing them to be searched
|
||||
long_description = file: README.md
|
||||
long_description_content_type = text/markdown
|
||||
url = https://github.com/ocrmypdf/OCRmyPDF
|
||||
author = James R. Barlow
|
||||
author_email = james@purplerock.ca
|
||||
license = MPL-2.0
|
||||
license_file = LICENSE
|
||||
license_files =
|
||||
LICENSE
|
||||
classifiers =
|
||||
Development Status :: 5 - Production/Stable
|
||||
Environment :: Console
|
||||
Intended Audience :: End Users/Desktop
|
||||
Intended Audience :: Science/Research
|
||||
Intended Audience :: System Administrators
|
||||
License :: OSI Approved :: Mozilla Public License 2.0 (MPL 2.0)
|
||||
Operating System :: MacOS :: MacOS X
|
||||
Operating System :: Microsoft :: Windows :: Windows 10
|
||||
Operating System :: POSIX
|
||||
Operating System :: POSIX :: BSD
|
||||
Operating System :: POSIX :: Linux
|
||||
Programming Language :: Python :: 3
|
||||
Programming Language :: Python :: 3 :: Only
|
||||
Programming Language :: Python :: 3.7
|
||||
Programming Language :: Python :: 3.8
|
||||
Programming Language :: Python :: 3.9
|
||||
Programming Language :: Python :: 3.10
|
||||
Topic :: Scientific/Engineering :: Image Recognition
|
||||
Topic :: Text Processing :: Indexing
|
||||
Topic :: Text Processing :: Linguistic
|
||||
keywords =
|
||||
PDF
|
||||
OCR
|
||||
optical character recognition
|
||||
PDF/A
|
||||
scanning
|
||||
project_urls =
|
||||
Documentation = https://ocrmypdf.readthedocs.io/
|
||||
Source = https://github.com/ocrmypdf/OCRmyPDF
|
||||
Tracker = https://github.com/ocrmypdf/OCRmyPDF/issues
|
||||
|
||||
[options]
|
||||
packages = find:
|
||||
install_requires =
|
||||
Pillow>=8.2.0
|
||||
coloredlogs>=14.0 # strictly optional
|
||||
img2pdf>=0.3.0 # pure Python
|
||||
packaging>=20
|
||||
pdfminer.six!=20200720,>=20191110
|
||||
pikepdf!=5.0.0,>=4.0.0
|
||||
pluggy>=0.13.0
|
||||
reportlab>=3.5.66
|
||||
tqdm>=4
|
||||
importlib-metadata>=4;python_version<'3.8' # until Python 3.8
|
||||
importlib-resources>=5;python_version<'3.9' # until Python 3.9
|
||||
typing-extensions>=4;python_version<'3.10'
|
||||
python_requires = >=3.7
|
||||
include_package_data = True
|
||||
package_dir =
|
||||
=src
|
||||
platforms = any
|
||||
setup_requires =
|
||||
setuptools-scm
|
||||
setuptools-scm-git-archive
|
||||
zip_safe = False
|
||||
|
||||
[options.packages.find]
|
||||
where = src
|
||||
|
||||
[options.entry_points]
|
||||
console_scripts =
|
||||
ocrmypdf = ocrmypdf.__main__:run
|
||||
|
||||
[options.extras_require]
|
||||
docs =
|
||||
sphinx
|
||||
sphinx-issues
|
||||
sphinx-rtd-theme
|
||||
extended_test =
|
||||
PyMuPDF==1.19.1
|
||||
test =
|
||||
coverage[toml]>=5
|
||||
pytest>=6.0.0
|
||||
pytest-cov>=2.11.1
|
||||
pytest-xdist>=2.2.0
|
||||
python-xmp-toolkit==2.0.1 # also requires apt-get install libexempi3
|
||||
types-Pillow
|
||||
types-humanfriendly
|
||||
watcher =
|
||||
watchdog>=1.0.2
|
||||
webservice =
|
||||
Flask>=1
|
||||
|
||||
[options.package_data]
|
||||
ocrmypdf =
|
||||
data/sRGB.icc
|
||||
py.typed
|
||||
|
||||
[bdist_wheel]
|
||||
python-tag = py37
|
||||
|
||||
[aliases]
|
||||
test = pytest
|
||||
|
||||
[check-manifest]
|
||||
ignore =
|
||||
.github
|
||||
|
||||
[flake8]
|
||||
ignore = D203,F401,W503,E501,E203,F841
|
||||
exclude = .git,__pycache__,docs/conf.py,build,dist,.venv,.venvpp,.eggs,tmp,src/ocrmypdf/lib/
|
||||
max-complexity = 10
|
||||
max-line-length = 100
|
||||
@@ -17,11 +17,11 @@
|
||||
|
||||
- Check README.md
|
||||
|
||||
- Check setup.py
|
||||
- Check pyproject.toml
|
||||
|
||||
- Are classifiers up to date?
|
||||
- Is `python_requires` correct?
|
||||
- Python 3.6 is EOL on December 2021-12. Could drop support then.
|
||||
- Is it time to drop support for older Pythons?
|
||||
- Can we tighten any `install_requires` dependencies?
|
||||
|
||||
- Search for old version shims we can remove
|
||||
|
||||
@@ -21,7 +21,6 @@ from ocrmypdf.exceptions import (
|
||||
InputFileError,
|
||||
MissingDependencyError,
|
||||
OutputFileAccessError,
|
||||
PdfMergeFailedError,
|
||||
PriorOcrFoundError,
|
||||
SubprocessOutputError,
|
||||
TesseractConfigError,
|
||||
|
||||
@@ -71,6 +71,4 @@ def run(args=None):
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if sys.platform == 'darwin' and sys.version_info < (3, 8):
|
||||
set_start_method('spawn') # see python bpo-33725
|
||||
sys.exit(run())
|
||||
|
||||
@@ -47,21 +47,6 @@ def version():
|
||||
return get_version(GS)
|
||||
|
||||
|
||||
def jpeg_passthrough_available() -> bool:
|
||||
"""Returns True if the installed version of Ghostscript supports JPEG passthru
|
||||
|
||||
Prior to 9.23, Ghostscript decoded and re-encoded JPEGs internally. In 9.23
|
||||
it gained the ability to keep JPEGs unmodified. However, the 9.23
|
||||
implementation was buggy and would deletes the last two bytes of images in
|
||||
some cases, as reported here.
|
||||
https://bugs.ghostscript.com/show_bug.cgi?id=699216
|
||||
|
||||
The issue was fixed for 9.24, hence that is the first version we consider
|
||||
the feature available. (Ghostscript 9.24 has its own problems is blacklisted.)
|
||||
"""
|
||||
return version() >= '9.24'
|
||||
|
||||
|
||||
def _gs_error_reported(stream) -> bool:
|
||||
match = re.search(r'error', stream, flags=re.IGNORECASE)
|
||||
return bool(match)
|
||||
@@ -201,19 +186,8 @@ def generate_pdfa(
|
||||
]
|
||||
|
||||
strategy = 'LeaveColorUnchanged'
|
||||
# Older versions of Ghostscript expect a leading slash in
|
||||
# sColorConversionStrategy, newer ones should not have it. See Ghostscript
|
||||
# git commit fe1c025d.
|
||||
gs_version = version()
|
||||
strategy = ('/' + strategy) if gs_version < '9.19' else strategy
|
||||
|
||||
if gs_version == '9.23':
|
||||
# 9.23: added JPEG passthrough as a new feature, but with a bug that
|
||||
# incorrectly formats some images. Fixed as of 9.24. So we disable this
|
||||
# feature for 9.23.
|
||||
# https://bugs.ghostscript.com/show_bug.cgi?id=699216
|
||||
compression_args.append('-dPassThroughJPEGImages=false')
|
||||
elif gs_version == '9.56.0':
|
||||
if gs_version == '9.56.0':
|
||||
# 9.56.0 breaks our OCR, should be fixed in 9.56.1
|
||||
# https://bugs.ghostscript.com/show_bug.cgi?id=705187
|
||||
compression_args.append('-dNEWPDF=false')
|
||||
|
||||
@@ -33,7 +33,7 @@ HOCR_TEMPLATE = """<?xml version="1.0" encoding="UTF-8"?>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html;charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-system' content='tesseract 4.1.1' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word ocrp_wconf'/>
|
||||
</head>
|
||||
<body>
|
||||
@@ -114,15 +114,6 @@ def version() -> str:
|
||||
return get_version('tesseract', regex=r'tesseract\s(.+)')
|
||||
|
||||
|
||||
def has_user_words() -> bool:
|
||||
"""Does Tesseract have --user-words capability?
|
||||
|
||||
Not available in 4.0, but available in 4.1. Also available in 3.x, but
|
||||
we no longer support 3.x.
|
||||
"""
|
||||
return version() >= '4.1'
|
||||
|
||||
|
||||
def has_thresholding() -> bool:
|
||||
"""Does Tesseract have -c thresholding method capability?"""
|
||||
return version() >= '5.0'
|
||||
@@ -244,7 +235,7 @@ def get_deskew(
|
||||
|
||||
def tesseract_log_output(stream: bytes) -> None:
|
||||
tlog = TesseractLoggerAdapter(
|
||||
log, extra=log.extra if hasattr(log, 'extra') else None
|
||||
log, extra=log.extra if hasattr(log, 'extra') else None # type: ignore
|
||||
)
|
||||
|
||||
if not stream:
|
||||
|
||||
@@ -1,12 +1,10 @@
|
||||
# SPDX-FileCopyrightText: 2022 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
"""Interface to unpaper executable"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
@@ -22,6 +20,10 @@ from PIL import Image
|
||||
from ocrmypdf.exceptions import MissingDependencyError, SubprocessOutputError
|
||||
from ocrmypdf.subprocess import get_version, run
|
||||
|
||||
# unpaper documentation:
|
||||
# https://github.com/Flameeyes/unpaper/blob/master/doc/basic-concepts.md
|
||||
|
||||
|
||||
if sys.version_info >= (3, 10):
|
||||
from tempfile import TemporaryDirectory
|
||||
else:
|
||||
|
||||
@@ -207,7 +207,7 @@ def exec_page_sync(page_context: PageContext) -> PageResult:
|
||||
filtered_image = page_context.plugin_manager.hook.filter_page_image(
|
||||
page=page_context, image_filename=visible_image_out
|
||||
)
|
||||
if filtered_image:
|
||||
if filtered_image is not None: # None if no hook is present
|
||||
visible_image_out = filtered_image
|
||||
pdf_page_from_image_out = create_pdf_page_from_image(
|
||||
visible_image_out, page_context, orientation_correction
|
||||
|
||||
@@ -134,7 +134,7 @@ def check_options_preprocessing(options: Namespace) -> None:
|
||||
package='unpaper',
|
||||
version_checker=unpaper.version,
|
||||
need_version='6.1',
|
||||
required_for=['--clean, --clean-final'],
|
||||
required_for="--clean, --clean-final", # Problem arguments
|
||||
)
|
||||
try:
|
||||
if options.unpaper_args:
|
||||
@@ -221,7 +221,7 @@ def check_options_metadata(options: Namespace) -> None:
|
||||
def check_options_pillow(options: Namespace) -> None:
|
||||
PIL.Image.MAX_IMAGE_PIXELS = int(options.max_image_mpixels * 1_000_000)
|
||||
if PIL.Image.MAX_IMAGE_PIXELS == 0:
|
||||
PIL.Image.MAX_IMAGE_PIXELS = None
|
||||
PIL.Image.MAX_IMAGE_PIXELS = None # type: ignore
|
||||
|
||||
|
||||
def _check_plugin_invariant_options(options: Namespace) -> None:
|
||||
|
||||
@@ -8,10 +8,7 @@ OCRmyPDF uses setuptools_scm to derive version from git tags.
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
try:
|
||||
from importlib.metadata import version as _package_version
|
||||
except ImportError:
|
||||
from importlib_metadata import version as _package_version # type: ignore
|
||||
from importlib.metadata import version as _package_version
|
||||
|
||||
PROGRAM_NAME = 'ocrmypdf'
|
||||
|
||||
|
||||
@@ -15,6 +15,9 @@ from pathlib import Path
|
||||
from typing import AnyStr, BinaryIO, Iterable, Union
|
||||
from warnings import warn
|
||||
|
||||
import coloredlogs
|
||||
from humanfriendly.terminal import enable_ansi_support
|
||||
|
||||
from ocrmypdf._logging import PageNumberFilter, TqdmConsole
|
||||
from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._sync import run_pipeline
|
||||
@@ -22,15 +25,6 @@ from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import ArgumentParser, get_parser
|
||||
from ocrmypdf.helpers import is_iterable_notstr
|
||||
|
||||
try:
|
||||
import coloredlogs
|
||||
except ModuleNotFoundError:
|
||||
coloredlogs = None # pylint: disable=invalid-name
|
||||
|
||||
if coloredlogs:
|
||||
from humanfriendly.terminal import enable_ansi_support
|
||||
|
||||
|
||||
StrPath = Union[Path, AnyStr]
|
||||
PathOrIO = Union[BinaryIO, StrPath]
|
||||
|
||||
@@ -121,7 +115,7 @@ def configure_logging(
|
||||
|
||||
use_colors = progress_bar_friendly
|
||||
formatter = None
|
||||
if coloredlogs and use_colors:
|
||||
if use_colors:
|
||||
use_colors = enable_ansi_support()
|
||||
if use_colors:
|
||||
use_colors = coloredlogs.terminal_supports_colors()
|
||||
@@ -284,8 +278,6 @@ def ocr( # pylint: disable=unused-argument
|
||||
``"-"``, some final validation steps are not performed (we do not read
|
||||
back the stream after it is written).
|
||||
Raises:
|
||||
ocrmypdf.PdfMergeFailedError: If the input PDF is malformed, preventing merging
|
||||
with the OCR layer.
|
||||
ocrmypdf.MissingDependencyError: If a required dependency program is missing or
|
||||
was not found on PATH.
|
||||
ocrmypdf.UnsupportedImageFormatError: If the input file type was an image that
|
||||
|
||||
@@ -21,37 +21,19 @@ def check_options(options):
|
||||
program='gs',
|
||||
package='ghostscript',
|
||||
version_checker=ghostscript.version,
|
||||
need_version='9.15', # limited by Travis CI / Ubuntu 14.04 backports
|
||||
need_version='9.50', # Ubuntu 20.04's version
|
||||
)
|
||||
gs_version = ghostscript.version()
|
||||
if gs_version in ('9.24', '9.51'):
|
||||
if gs_version in ('9.51',):
|
||||
raise MissingDependencyError(
|
||||
f"Ghostscript {gs_version} contains serious regressions and is not "
|
||||
"supported. Please upgrade to a newer version, or downgrade to the "
|
||||
"previous version."
|
||||
)
|
||||
|
||||
# We have these constraints to check for.
|
||||
# 1. Ghostscript < 9.20 mangles multibyte Unicode
|
||||
# 2. hocr doesn't work on non-Latin languages (so don't select it)
|
||||
is_latin = options.languages.issubset(HOCR_OK_LANGS)
|
||||
if gs_version < '9.20' and options.output_type != 'pdf' and not is_latin:
|
||||
# https://bugs.ghostscript.com/show_bug.cgi?id=696874
|
||||
# Ghostscript < 9.20 fails to encode multibyte characters properly
|
||||
log.warning(
|
||||
f"The installed version of Ghostscript ({gs_version}) does not work "
|
||||
"correctly with the OCR languages you specified. Use --output-type pdf or "
|
||||
"upgrade to Ghostscript 9.20 or later to avoid this issue."
|
||||
)
|
||||
|
||||
if options.output_type == 'pdfa':
|
||||
options.output_type = 'pdfa-2'
|
||||
|
||||
if options.output_type == 'pdfa-3' and ghostscript.version() < '9.19':
|
||||
raise MissingDependencyError(
|
||||
"--output-type pdfa-3 requires Ghostscript 9.19 or later"
|
||||
)
|
||||
|
||||
|
||||
@hookimpl
|
||||
def rasterize_pdf_page(
|
||||
|
||||
@@ -43,7 +43,7 @@ def add_options(parser):
|
||||
metavar='MODE',
|
||||
choices=range(0, 4),
|
||||
help=(
|
||||
"Set Tesseract 4.0+ OCR engine mode: "
|
||||
"Set Tesseract 4+ OCR engine mode: "
|
||||
"0 - original Tesseract only; "
|
||||
"1 - neural nets LSTM only; "
|
||||
"2 - Tesseract + LSTM; "
|
||||
@@ -93,7 +93,7 @@ def check_options(options):
|
||||
program='tesseract',
|
||||
package={'linux': 'tesseract-ocr'},
|
||||
version_checker=tesseract.version,
|
||||
need_version='4.0.0-beta.1', # using backport for Travis CI
|
||||
need_version='4.1.1', # Ubuntu 20.04 version
|
||||
version_parser=tesseract.TesseractVersion,
|
||||
)
|
||||
|
||||
@@ -101,11 +101,6 @@ def check_options(options):
|
||||
if options.pdf_renderer == 'auto':
|
||||
options.pdf_renderer = 'sandwich'
|
||||
|
||||
if not tesseract.has_user_words() and (options.user_words or options.user_patterns):
|
||||
log.warning(
|
||||
"Tesseract 4.0 (which you have installed) ignores --user-words and "
|
||||
"--user-patterns, so these arguments have no effect."
|
||||
)
|
||||
if not tesseract.has_thresholding() and options.tesseract_thresholding != 0:
|
||||
log.warning(
|
||||
"The installed version of Tesseract does not support changes to its "
|
||||
|
||||
@@ -47,26 +47,6 @@ class BadArgsError(ExitCodeException):
|
||||
exit_code = ExitCode.bad_args
|
||||
|
||||
|
||||
class PdfMergeFailedError(ExitCodeException): # deprecated
|
||||
"""An intermediate PDF can't be merged.
|
||||
|
||||
No longer in use.
|
||||
"""
|
||||
|
||||
exit_code = ExitCode.input_file
|
||||
message = dedent(
|
||||
'''\
|
||||
Failed to merge PDF image layer with OCR layer
|
||||
|
||||
Usually this happens because the input PDF file is malformed and
|
||||
ocrmypdf cannot correct the problem on its own.
|
||||
|
||||
Try using
|
||||
ocrmypdf --pdf-renderer sandwich [..other args..]
|
||||
'''
|
||||
)
|
||||
|
||||
|
||||
class MissingDependencyError(ExitCodeException):
|
||||
"""A third-party dependency is missing."""
|
||||
|
||||
|
||||
@@ -12,7 +12,6 @@ import shutil
|
||||
import warnings
|
||||
from collections.abc import Iterable
|
||||
from contextlib import suppress
|
||||
from functools import wraps
|
||||
from io import StringIO
|
||||
from math import isclose, isfinite
|
||||
from pathlib import Path
|
||||
@@ -291,20 +290,3 @@ def pikepdf_enable_mmap():
|
||||
# Fix is not in pybind11 2.6.0
|
||||
# log.debug("pikepdf mmap disabled")
|
||||
return
|
||||
|
||||
|
||||
def deprecated(func):
|
||||
"""Warn that function is deprecated."""
|
||||
|
||||
@wraps(func)
|
||||
def new_func(*args, **kwargs):
|
||||
warnings.simplefilter('always', DeprecationWarning) # turn off filter
|
||||
warnings.warn(
|
||||
f"Call to deprecated function {func.__name__}.",
|
||||
category=DeprecationWarning,
|
||||
stacklevel=2,
|
||||
)
|
||||
warnings.simplefilter('default', DeprecationWarning) # reset filter
|
||||
return func(*args, **kwargs)
|
||||
|
||||
return new_func
|
||||
|
||||
@@ -14,10 +14,11 @@ import re
|
||||
import warnings
|
||||
from math import atan, cos, sin
|
||||
from pathlib import Path
|
||||
from typing import Any, NamedTuple, Optional, Tuple, Union
|
||||
from typing import Any, NamedTuple
|
||||
from xml.etree import ElementTree
|
||||
|
||||
with warnings.catch_warnings():
|
||||
# reportlab uses deprecated load_module
|
||||
warnings.filterwarnings(
|
||||
'ignore', category=DeprecationWarning, message=r".*load_module.*"
|
||||
)
|
||||
|
||||
@@ -278,10 +278,6 @@ def filter_page_image(page: PageContext, image_filename: Path) -> Path:
|
||||
will occur. The return value should be a path to a file in the same folder
|
||||
as ``image_filename``.
|
||||
|
||||
Implementation detail: If the value returned is falsy, OCRmyPDF will ignore
|
||||
the return value and assume the input file was unmodified. This is deprecated.
|
||||
To leave the image unmodified, ``image_filename`` should be returned.
|
||||
|
||||
Note:
|
||||
This hook will be called from child processes. Modifying global state
|
||||
will not affect the main process or other child processes.
|
||||
|
||||
@@ -314,7 +314,7 @@ def check_external_program(
|
||||
program: The name of the program to test.
|
||||
package: The name of a software package that typically supplies this program.
|
||||
Usually the same as program.
|
||||
version_check: A callable without arguments that retrieves the installed
|
||||
version_checker: A callable without arguments that retrieves the installed
|
||||
version of program.
|
||||
need_version: The minimum required version.
|
||||
required_for: The name of an argument or feature that requires this program.
|
||||
@@ -325,10 +325,7 @@ def check_external_program(
|
||||
"""
|
||||
|
||||
try:
|
||||
if callable(version_checker):
|
||||
found_version = version_checker()
|
||||
else: # deprecated
|
||||
found_version = version_checker
|
||||
found_version = version_checker()
|
||||
except (CalledProcessError, FileNotFoundError) as e:
|
||||
_error_missing_program(program, package, required_for, recommended)
|
||||
if not recommended:
|
||||
|
||||
@@ -171,11 +171,6 @@ SHIMS = [
|
||||
def fix_windows_args(program: str, args, env):
|
||||
"""Adjust our desired program and command line arguments for use on Windows"""
|
||||
|
||||
if sys.version_info < (3, 8):
|
||||
# bpo-33617 - Windows needs manual Path -> str conversion
|
||||
args = [os.fspath(arg) for arg in args]
|
||||
program = os.fspath(program)
|
||||
|
||||
# If we are running a .py on Windows, ensure we call it with this Python
|
||||
# (to support test suite shims)
|
||||
if program.lower().endswith('.py'):
|
||||
|
||||
@@ -9,13 +9,13 @@ from ocrmypdf import hookimpl
|
||||
from ocrmypdf.builtin_plugins import ghostscript
|
||||
from ocrmypdf.subprocess import run_polling_stderr
|
||||
|
||||
elision_warning = """GPL Ghostscript 9.20: Setting Overprint Mode to 1
|
||||
ELISION_WARNING = """GPL Ghostscript 9.50: Setting Overprint Mode to 1
|
||||
not permitted in PDF/A-2, overprint mode not set"""
|
||||
|
||||
|
||||
def run_append_stderr(*args, **kwargs):
|
||||
proc = run_polling_stderr(*args, **kwargs)
|
||||
proc.stderr += '\n' + elision_warning + '\n'
|
||||
proc.stderr += '\n' + ELISION_WARNING + '\n'
|
||||
return proc
|
||||
|
||||
|
||||
|
||||
@@ -21,7 +21,7 @@ were produced.
|
||||
|
||||
Certain operations are not cached and routed to Tesseract OCR directly.
|
||||
|
||||
Assumes Tesseract 4.0.0-alpha or higher.
|
||||
Assumes Tesseract 4+.
|
||||
|
||||
"""
|
||||
|
||||
|
||||
@@ -27,7 +27,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-system' content='tesseract 4.1.1' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
</head>
|
||||
<body>
|
||||
@@ -46,7 +46,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
class FixedRotateNoopOcrEngine(OcrEngine):
|
||||
@staticmethod
|
||||
def version():
|
||||
return '4.0.0'
|
||||
return '4.1.1'
|
||||
|
||||
@staticmethod
|
||||
def creator_tag(options):
|
||||
|
||||
@@ -25,7 +25,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
<head>
|
||||
<title></title>
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<meta name='ocr-system' content='tesseract 4.0.0' />
|
||||
<meta name='ocr-system' content='tesseract 4.1.1' />
|
||||
<meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par ocr_line ocrx_word'/>
|
||||
</head>
|
||||
<body>
|
||||
@@ -44,7 +44,7 @@ HOCR_TEMPLATE = '''<?xml version="1.0" encoding="UTF-8"?>
|
||||
class NoopOcrEngine(OcrEngine):
|
||||
@staticmethod
|
||||
def version():
|
||||
return '4.0.0'
|
||||
return '4.1.1'
|
||||
|
||||
@staticmethod
|
||||
def creator_tag(options):
|
||||
|
||||
@@ -54,15 +54,6 @@ def test_no_cpu_count(monkeypatch):
|
||||
assert invoked, "Patched function called during test"
|
||||
|
||||
|
||||
def test_deprecated():
|
||||
@helpers.deprecated
|
||||
def old_function():
|
||||
return 42
|
||||
|
||||
with pytest.deprecated_call():
|
||||
assert old_function() == 42
|
||||
|
||||
|
||||
skipif_docker = pytest.mark.skipif(running_in_docker(), reason="fails on Docker")
|
||||
|
||||
|
||||
|
||||
@@ -545,7 +545,6 @@ def test_tesseract_config_invalid(renderer, resources, invalid_tess_config, outp
|
||||
assert p.returncode == ExitCode.invalid_config
|
||||
|
||||
|
||||
@pytest.mark.skipif(not tesseract.has_user_words(), reason='not functional until 4.1.0')
|
||||
def test_user_words_ocr(resources, outdir):
|
||||
# Does not actually test if --user-words causes output to differ
|
||||
word_list = outdir / 'wordlist.txt'
|
||||
@@ -722,11 +721,9 @@ def test_compression_changed(ocrmypdf_exec, resources, image, compression, outpd
|
||||
if compression == "jpeg":
|
||||
assert pdfimage.enc == Encoding.jpeg
|
||||
else:
|
||||
if ghostscript.jpeg_passthrough_available():
|
||||
# Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be
|
||||
# copied without transcoding - so report
|
||||
if image.endswith('jpg'):
|
||||
assert pdfimage.enc == Encoding.jpeg
|
||||
if image.endswith('jpg'):
|
||||
# Ghostscript JPEG passthrough - no issue
|
||||
assert pdfimage.enc == Encoding.jpeg
|
||||
else:
|
||||
assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000)
|
||||
|
||||
|
||||
@@ -28,9 +28,6 @@ except ImportError:
|
||||
fitz = None
|
||||
|
||||
|
||||
pytestmark = pytest.mark.filterwarnings('ignore:.*XMLParser.*:DeprecationWarning')
|
||||
|
||||
|
||||
@pytest.mark.parametrize("output_type", ['pdfa', 'pdf'])
|
||||
def test_preserve_docinfo(output_type, resources, outpdf):
|
||||
pdf_before = pikepdf.open(resources / 'graph.pdf')
|
||||
@@ -174,7 +171,12 @@ def test_creation_date_preserved(output_type, resources, infile, outpdf):
|
||||
def libxmp_file_to_dict():
|
||||
try:
|
||||
with warnings.catch_warnings():
|
||||
warnings.simplefilter("ignore", DeprecationWarning)
|
||||
# libxmp imports distutils.Version, which is deprecated
|
||||
warnings.filterwarnings(
|
||||
"ignore",
|
||||
category=DeprecationWarning,
|
||||
message=r".*distutils Version classes are deprecated.*",
|
||||
)
|
||||
from libxmp.utils import (
|
||||
file_to_dict, # pylint: disable=import-outside-toplevel
|
||||
)
|
||||
|
||||
@@ -51,10 +51,6 @@ def test_stdout(ocrmypdf_exec, resources, outpdf):
|
||||
assert check_pdf(output_file)
|
||||
|
||||
|
||||
@pytest.mark.xfail(
|
||||
os.name == 'nt' and sys.version_info < (3, 8),
|
||||
reason="Windows does not like this; not sure how to fix",
|
||||
)
|
||||
def test_dev_null(resources):
|
||||
if 'COV_CORE_DATAFILE' in os.environ:
|
||||
pytest.skip("Coverage uses stdout")
|
||||
|
||||
@@ -48,22 +48,6 @@ def test_hocr_notlatin_warning(caplog):
|
||||
assert 'PDF renderer is known to cause' in caplog.text
|
||||
|
||||
|
||||
def test_old_ghostscript(caplog):
|
||||
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.19'), patch(
|
||||
'ocrmypdf._exec.tesseract.get_languages', return_value={'eng', 'chi_sim'}
|
||||
):
|
||||
vd.check_options(*make_opts_pm(language='chi_sim', output_type='pdfa'))
|
||||
assert 'does not work correctly' in caplog.text
|
||||
|
||||
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.18'):
|
||||
with pytest.raises(MissingDependencyError):
|
||||
vd.check_options(*make_opts_pm(output_type='pdfa-3'))
|
||||
|
||||
with patch('ocrmypdf._exec.ghostscript.version', return_value='9.24'):
|
||||
with pytest.raises(MissingDependencyError):
|
||||
vd.check_options(*make_opts_pm())
|
||||
|
||||
|
||||
def test_old_tesseract_error():
|
||||
with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'):
|
||||
with pytest.raises(MissingDependencyError):
|
||||
@@ -103,22 +87,6 @@ def test_optimizing(caplog):
|
||||
assert 'will be ignored because' in caplog.text
|
||||
|
||||
|
||||
def test_user_words(caplog):
|
||||
with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=False):
|
||||
vd.check_options(*make_opts_pm(user_words='foo'))
|
||||
assert (
|
||||
'Tesseract 4.0 (which you have installed) ignores --user-words'
|
||||
in caplog.text
|
||||
)
|
||||
caplog.clear()
|
||||
with patch('ocrmypdf._exec.tesseract.has_user_words', return_value=True):
|
||||
vd.check_options(*make_opts_pm(user_patterns='foo'))
|
||||
assert (
|
||||
'Tesseract 4.0 (which you have installed) ignores --user-words'
|
||||
not in caplog.text
|
||||
)
|
||||
|
||||
|
||||
def test_pillow_options():
|
||||
vd.check_options_pillow(make_opts(max_image_mpixels=0))
|
||||
|
||||
@@ -229,37 +197,38 @@ def test_version_comparison():
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: '4.0.0-beta.1',
|
||||
need_version='4.0.0',
|
||||
need_version='4.1.1',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
vd.check_external_program(
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: 'v5.0.0-alpha.20200201',
|
||||
need_version='4.0.0',
|
||||
need_version='4.1.1',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
vd.check_external_program(
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: '5.0.0-rc1.20211030',
|
||||
need_version='4.0.0',
|
||||
need_version='4.1.1',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
vd.check_external_program(
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: 'v4.0.0.20181030', # Some Windows builds use this format
|
||||
need_version='4.0.0',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
vd.check_external_program(
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: '4.1.1-rc2-25-g9707',
|
||||
need_version='4.0.0',
|
||||
version_checker=lambda: 'v4.1.1.20181030', # Some Windows builds use this format
|
||||
need_version='4.1.1',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
with pytest.raises(MissingDependencyError):
|
||||
vd.check_external_program(
|
||||
program="tesseract",
|
||||
package="tesseract",
|
||||
version_checker=lambda: '4.1.1-rc2-25-g9707',
|
||||
need_version='4.1.1',
|
||||
version_parser=TesseractVersion,
|
||||
)
|
||||
with pytest.raises(MissingDependencyError):
|
||||
vd.check_external_program(
|
||||
program="dummy_fails",
|
||||
|
||||
Reference in New Issue
Block a user