Update develop with master changes

We’re well out of the “trivial updates” zone
This commit is contained in:
James R. Barlow
2017-05-11 22:54:27 -07:00
31 changed files with 857 additions and 204 deletions

1
.gitignore vendored
View File

@@ -6,6 +6,7 @@ pyvenv.cfg
tasks.py
.bash_history
.ruffus_history.sqlite
.idea/
# Package building
*.egg-info/

View File

@@ -7,6 +7,12 @@ cache:
- tests/cache
- $HOME/Library/Caches/Homebrew
env:
global:
- secure: "oyX5xesoHD7qcDXKIxMyGZPi+H/WxcvfFkaviEmq84K1DDyHk48+9e92IKgrw8/lcTADnEo/AgVKfnhCPflFimk1xTkgaK4sUg1WLI2YjmaHcwl5SlBHa2rN3uGBwy1hyP92qyv/mMc9R59NtRJ8u76lbn6eN9wi7lkFWdE6BTw=" # DOCKERHUB_OCRMYPDF_TOKEN
- secure: "WlyII8YLsiUUyLtEA563GvEZmbneDb/T8q/P1uNbyQ2ps1U82tH0nSUV2CspSMxOFtZzPHCrRvnAmuTYKshBj+GNnBb1J9FKQmFwF+4NPeqsFdUkQ1NeeCmfIRShuNC3Otg2GGwj4Zssdg+QnVy43t2L11qizzfY+lY+MVzAYcM=" # DOCKERHUB_OCRMYPDF_TESS4_TOKEN
- secure: "hsf6MT+n2x3OiDM2fQyJZdV0/PWYmv81LdVqC6cfnHBE/8N3DloJRqQ7WfO14TxhiK9PEC7MpyCj0lSabUHEO7gSH6Vks6I1asoSkt8S9/bSMlhT4hei+pwVpeGEiU5xHVATNjY+D919VC3IFvc3XmjT74h/2SLhaZ+jhEmDggM=" # HOMEBREW_OCRMYPDF_TOKEN
matrix:
include:
- os: linux
@@ -24,9 +30,12 @@ matrix:
before_cache:
- rm -f $HOME/.cache/pip/log/debug.log
before_install:
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then bash .travis/linux_before_install.sh ; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then bash .travis/osx_before_install.sh ; fi
before_install: |
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
bash .travis/linux_before_install.sh
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
bash .travis/osx_before_install.sh
fi
install:
- pip3 install .
@@ -34,23 +43,53 @@ install:
script:
- mv ocrmypdf dont_import_this_ocrmypdf
- pytest
- pytest -n auto
- mv dont_import_this_ocrmypdf ocrmypdf
after_success:
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then bash .travis/osx_brew.sh ; fi
# See https://www.appneta.com/blog/pypi-deployment-with-travis-ci/ for
# steps to set up testpypi deploy for untagged builds if desired
deploy:
provider: pypi
# release for main pypi
# 3.6 is considered the build leader and does the deploy, otherwise there is
# a race and all versions will try to deploy
# OTOH if we ever need separate binary wheels then each version needs its
# own deploy
- provider: pypi
user: ocrmypdf-travis
password:
secure: DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo=
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
distributions: "sdist bdist_wheel"
on:
branch: master
tags: true
condition: $TRAVIS_PYTHON_VERSION == "3.6" && $TRAVIS_OS_NAME == "linux"
skip_upload_docs: true
# test pypi
- provider: pypi
server: https://testpypi.python.org/pypi
user: ocrmypdf-travis
password:
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
distributions: "sdist"
on:
branch: develop
tags: false
condition: $TRAVIS_OS_NAME == "osx"
skip_upload_docs: true
# null deploy for osx
# we really just want to run after_deploy *after* pypi upload is done, but
# after_deploy only runs if a given box deployed
- provider: script
script: .travis/null_deploy.sh
on:
branch: master
tags: false
condition: $TRAVIS_OS_NAME == "osx"
after_deploy: |
if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
bash .travis/osx_brew.sh
elif [[ "$TRAVIS_PYTHON_VERSION" == "3.6" && "$TRAVIS_OS_NAME" == "linux" ]]; then
curl -H "Content-Type: application/json" --data '{"build": true}' -X POST https://registry.hub.docker.com/u/jbarlow83/ocrmypdf/trigger/$DOCKERHUB_OCRMYPDF_TOKEN/
curl -H "Content-Type: application/json" --data '{"build": true}' -X POST https://registry.hub.docker.com/u/jbarlow83/ocrmypdf-tess4/trigger/$DOCKERHUB_OCRMYPDF_TESS4_TOKEN/
fi

View File

@@ -13,7 +13,6 @@ class Ocrmypdf < Formula
depends_on :python3
depends_on "pkg-config" => :build
depends_on "zlib"
depends_on "libffi"
depends_on "tesseract"
depends_on "ghostscript"

3
.travis/null_deploy.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/usr/bin/env bash
# No-op deploy step: performs no work and always exits successfully.
# .travis.yml references this file as the "script" deploy provider so that a
# deploy is registered for the build, which in turn lets after_deploy run.
exit 0

View File

@@ -4,5 +4,17 @@ set -x
pip3 install homebrew-pypi-poet
python3 .travis/autobrew.py
brew audit ocrmypdf.rb
cat ocrmypdf.rb
brew audit ocrmypdf.rb
# Important: disable debug output so token is hidden
set +x
git clone https://$HOMEBREW_OCRMYPDF_TOKEN@github.com/jbarlow83/homebrew-ocrmypdf.git
set -x
pushd homebrew-ocrmypdf
cp ../ocrmypdf.rb Formula/ocrmypdf.rb
git add Formula/ocrmypdf.rb
git commit -m "homebrew-ocrmypdf: automatic release $TRAVIS_BUILD_NUMBER $TRAVIS_TAG"
git push origin master
popd

View File

@@ -35,7 +35,7 @@ Main features
- Supports more than `100 languages <https://github.com/tesseract-ocr/tessdata>`_ recognized by Tesseract
- Battle-tested on thousands of PDFs, a test suite and continuous integration
For details: please consult the `release notes <RELEASE_NOTES.rst>`_.
For details: please consult the `documentation <https://ocrmypdf.readthedocs.io/en/latest/>`_.
Motivation
----------

View File

@@ -28,6 +28,13 @@ Add an OCR layer and output a standard PDF
ocrmypdf --output-type pdf input.pdf output.pdf
Create a PDF/A with all color and grayscale images converted to JPEG
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
.. code-block:: bash
ocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf
Modify a file in place
""""""""""""""""""""""

View File

@@ -11,14 +11,18 @@ be searched.
PDFs are the best format for scanned documents. Unfortunately, PDFs can be difficult to work with. OCRmyPDF makes it easy to apply image processing and OCR to existing PDFs.
Contents:
.. toctree::
:maxdepth: 2
:maxdepth: 1
introduction
release_notes
installation
languages
.. toctree::
:caption: Usage
:maxdepth: 2
cookbook
advanced
batch
@@ -26,7 +30,6 @@ Contents:
errors
Indices and tables
==================

View File

@@ -53,6 +53,20 @@ By default, OCRmyPDF will convert the file to a PDF/A. This behavior can be dis
Depending on the settings selected, OCRmyPDF may "graft" the OCR layer into the existing PDF, or reconstruct a visually equivalent new PDF.
Why you shouldn't do this manually
----------------------------------
There are two routes to manually applying OCR to an existing PDF, both of which destroy information in the original PDF.
1. Rasterize each page as an image, OCR the images, and combine the output into a PDF. This preserves the appearance of each page, but resamples all images (possibly losing quality, increasing file size, introducing compression artifacts, etc.)
2. Extract each image, OCR, and combine the output into a PDF. This loses the context in which images are used in the PDF, meaning that cropping, rotation and scaling of pages may be lost. Some scanned PDFs use multiple images segmented into black and white, grayscale and color regions, with stencil masks to prevent overlap, as this can enhance the appearance of a file while reducing file size. Clearly, reassembling these images will not be easy. This also loses any text or vector art on any pages in a PDF with both scanned and pure digital content.
In the case of a PDF that is nothing other than a container of images (no rotation, scaling, cropping, one image per page), the second approach can be lossless.
OCRmyPDF uses several strategies depending on input options and the input PDF itself, but generally speaking it rasterizes a page for OCR and then grafts the OCR back onto the original. As such it can handle complex PDFs and still preserve their contents as much as possible.
Limitations
-----------
@@ -64,6 +78,7 @@ OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these l
* If a document contains languages outside of those given in the ``-l LANG`` arguments, results may be poor.
* It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns and join text across the columns.
* Poor quality scans may produce poor quality OCR. Garbage in, garbage out.
* PDFs that use transparent layers are not currently checked in the test suite, so they may not work correctly.
OCRmyPDF is also limited by the PDF specification:
@@ -72,5 +87,18 @@ OCRmyPDF is also limited by the PDF specification:
Ghostscript also imposes some limitations:
* PDFs containing JBIG2-encoded content will be converted to CCITT Group4 encoding, which has lower compression ratios, if Ghostscript PDF/A is enabled.
* PDFs containing JPEG 2000-encoded content will be converted to JPEG encoding, which may introduce compression artifacts, if Ghostscript PDF/A is enabled.
* Ghostscript may transcode grayscale and color images, either lossy to lossless or lossless to lossy, based on an internal algorithm. This behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format.
OCRmyPDF is currently not designed to be used as a Python API; it is designed to be run as a command line tool. ``import ocrmypdf`` currently attempts to process the command line on ``sys.argv`` at import time so it has side effects that will interfere with its use as a package. The API it presents should not be considered stable.
OCRmyPDF is currently not designed to be used as a Python API; it is designed to be run as a command line tool. ``import ocrmypdf`` currently attempts to process the command line on ``sys.argv`` at import time so it has side effects that will interfere with its use as a package. The API it presents should not be considered stable.
Similar programs
----------------
To the author's knowledge, OCRmyPDF is the most feature-rich and thoroughly tested command line OCR PDF conversion tool. If it does not meet your needs, contributions and suggestions are welcome. Alternatively, consider one of these similar open source programs:
* pdf2pdfocr
* pdfsandwich
* pypdfocr
* pdfbeads

View File

@@ -1,4 +1,4 @@
RELEASE NOTES
Release notes
=============
OCRmyPDF uses `semantic versioning <http://semver.org/>`_.
@@ -11,15 +11,39 @@ next
- Remove OCRmyPDF.sh script
v4.5.3
======
v4.5.6
------
- Added a workaround for an issue where Ghostscript 9.21 and probably earlier versions would fail with the error message "VMerror -25", due to a Ghostscript bug in XMP metadata handling
- Fixed issue #156, 'NoneType' object has no attribute 'getObject' on pages with no optional /Contents record. This should resolve all issues related to pages with no /Contents record.
- Fixed issue #158, ocrmypdf now stops and terminates if Ghostscript fails on an intermediate step, as it is not possible to proceed.
- Fixed issue #160, exception thrown on certain invalid arguments instead of error message
v4.5.5
------
- Automated update of macOS homebrew tap
- Fixed issue #154, KeyError '/Contents' when searching for text on blank pages that have no /Contents record. Note: incomplete fix for this issue.
v4.5.4
------
- Fix ``--skip-big`` raising an exception if a page contains no images (#152) (thanks to @TomRaz)
- Fix an issue where pages with no images might trigger "cannot write mode P as JPEG" (#151)
v4.5.3
------
- Added a workaround for an issue where Ghostscript 9.21 and probably earlier versions would fail with the error message "VMerror -25", due to a Ghostscript bug in XMP metadata handling
- High Unicode characters (U+10000 and up) are no longer accepted for setting metadata on the command line, as Ghostscript may not handle them correctly.
- Fixed an issue where the ``tess4`` renderer would duplicate content onto output pages if tesseract failed or timed out
- Fixed ``tess4`` renderer not recognized when lossless reconstruction is possible
v4.5.2
======
------
- Fix issue #147. ``--pdf-renderer tess4 --clean`` will produce an oversized page containing the original image in the bottom left corner, due to loss of DPI information.
- Make "using Tesseract 4.0" warning less ominous
@@ -27,13 +51,13 @@ v4.5.2
v4.5.1
======
------
- Fix issue #137, proportions of images with a non-square pixel aspect ratio would be distorted in output for ``--force-ocr`` and some other combinations of flags
v4.5
====
----
- Exotic PDFs containing "Form XObjects" are now supported (issue #134; PDF reference manual 8.10), and images they contain are taken into account when determining the resolution for rasterizing
- The Tesseract 4 Docker image no longer includes all languages, because it took so long to build that something would tend to fail
@@ -41,7 +65,7 @@ v4.5
v4.4.2
======
------
- The Docker images (ocrmypdf, ocrmypdf-polyglot, ocrmypdf-tess4) are now based on Ubuntu 16.10 instead of Debian stretch
@@ -52,7 +76,7 @@ v4.4.2
v4.4.1
======
------
- To prevent a `TIFF output error <https://github.com/python-pillow/Pillow/issues/2206>`_ caused by img2pdf >= 0.2.1 and Pillow <= 3.4.2, dependencies have been tightened
- The Tesseract 4.00 simultaneous process limit was increased from 1 to 2, since it was observed that 1 lowers performance
@@ -61,7 +85,7 @@ v4.4.1
- Tweaks to setup.py to deal with issues in the v4.4 release
v4.4
====
----
- Tesseract 4.00 is now supported on an experimental basis.
@@ -79,32 +103,32 @@ v4.4
v4.3.5
======
------
- Update documentation to confirm Python 3.6.0 compatibility. No code changes were needed, so many earlier versions are likely supported.
v4.3.4
======
------
- Fixed "decimal.InvalidOperation: quantize result has too many digits" for high DPI images
v4.3.3
======
------
- Fixed PDF/A creation with Ghostscript 9.20 properly
- Fixed an exception on inline stencil masks with a missing optional parameter
v4.3.2
======
------
- Fixed a PDF/A creation issue with Ghostscript 9.20 (note: this fix did not actually work)
v4.3.1
======
------
- Fixed an issue where pages produced by the "hocr" renderer after a Tesseract timeout would be rotated incorrectly if the input page was rotated with a /Rotate marker
- Fixed a file handle leak in LeptonicaErrorTrap that would cause a "too many open files" error for files around a hundred pages long when ``--deskew`` or ``--remove-background`` or other Leptonica based image processing features were in use, depending on the system value of ``ulimit -n``
@@ -115,7 +139,7 @@ v4.3.1
v4.3
====
----
- New feature ``--remove-background`` to detect and erase the background of color and grayscale images
- Better documentation
@@ -126,20 +150,20 @@ v4.3
+ Some output validation is disabled in this mode
v4.2.5
======
------
- Fixed an issue (#100) with PDFs that omit the optional /BitsPerComponent parameter on images
- Removed non-free file milk.pdf
v4.2.4
======
------
- Fixed an error (#90) caused by PDFs that use stencil masks properly
- Fixed handling of PDFs that try to draw images or stencil masks without properly setting up the graphics state (such images are now ignored for the purposes of calculating DPI)
v4.2.3
======
------
- Fixed an issue with PDFs that store page rotation (/Rotate) in an indirect object
- Integrated a few fixes to simplify downstream packaging (Debian)
@@ -153,20 +177,20 @@ v4.2.3
v4.2.2
======
------
- Improvements to documentation
v4.2.1
======
------
- Fixed an issue where PDF pages that contained stencil masks would report an incorrect DPI and cause Ghostscript to abort
- Implemented stdin streaming
v4.2
====
----
- ocrmypdf will now try to convert single image files to PDFs if they are provided as input (#15)
@@ -199,13 +223,13 @@ v4.2
- Ghostscript now runs in "safer" mode where possible
v4.1.4
======
------
- Bug fix: monochrome images with an ICC profile attached were incorrectly converted to full color images if lossless reconstruction was not possible due to other settings; consequence was increased file size for these images
v4.1.3
======
------
- More helpful error message for PDFs with version 4 security handler
- Update usage instructions for Windows/Docker users
@@ -214,14 +238,14 @@ v4.1.3
v4.1.2
======
------
- Replace IEC sRGB ICC profile with Debian's sRGB (from icc-profiles-free) which is more compatible with the MIT license
- More helpful error message for an error related to certain types of malformed PDFs
v4.1
====
----
- ``--rotate-pages`` now only rotates pages when reasonably confident in the orientation. This behavior can be adjusted with the new argument ``--rotate-pages-threshold``
- Fixed problems in error checking if ``unpaper`` is uninstalled or missing at run-time
@@ -229,20 +253,20 @@ v4.1
v4.0.7
======
------
- Minor correction to Ghostscript output settings
v4.0.6
======
------
- Update install instructions
- Provide a sRGB profile instead of using Ghostscript's
v4.0.5
======
------
- Remove some verbose debug messages from v4.0.4
- Fixed temporary that wasn't being deleted
@@ -250,22 +274,22 @@ v4.0.5
- Inline images are now checked during DPI calculation instead of rejecting the image
v4.0.4
======
------
Released with verbose debug message turned on. Do not use. Skip to v4.0.5.
v4.0.3
======
------
New features
------------
^^^^^^^^^^^^
- Page orientations detected are now reported in a summary comment
Fixes
-----
^^^^^
- Show stack trace if unexpected errors occur
- Treat "too few characters" error message from Tesseract as a reason to skip that page rather than
@@ -274,10 +298,10 @@ Fixes
v4.0.2
======
------
Fixes
-----
^^^^^
- Fixed compatibility with Tesseract 3.04.01 release, particularly its different way of outputting
orientation information
@@ -286,19 +310,19 @@ Fixes
v4.0.1
======
------
Fixes
-----
^^^^^
- Fixed a KeyError if tesseract fails to find page orientation information
v4.0
====
----
New features
------------
^^^^^^^^^^^^
- Automatic page rotation (``-r``) is now available. It ignores any prior rotation information
on PDFs and sets rotation based on the dominant orientation of detectable text. This feature is
@@ -308,7 +332,7 @@ New features
Fixes
-----
^^^^^
- Fixed an issue where lossless reconstruction could cause some pages to appear incorrectly
if the page was rotated by the user in Acrobat after being scanned (specifically if it had a /Rotate tag)
@@ -317,7 +341,7 @@ Fixes
Changes
-------
^^^^^^^
- Logging output is now much easier to read
- ``--deskew`` is now performed by Leptonica instead of unpaper (#25)
@@ -330,20 +354,20 @@ Changes
v3.2.1
======
------
Changes
-------
^^^^^^^
- Fixed issue #47 "convert() got an unexpected keyword argument 'dpi'" by upgrading to img2pdf 0.2
- Tweaked the Dockerfiles
v3.2
====
----
New features
------------
^^^^^^^^^^^^
- Lossless reconstruction: when possible, OCRmyPDF will inject text layers without
otherwise manipulating the content and layout of a PDF page. For example, a PDF containing a mix
@@ -355,25 +379,25 @@ New features
for the polyglots among us. It is much larger.
Changes
-------
^^^^^^^
- JPEG transcoding quality is now 95 instead of the default 75. Bigger file sizes for less degradation.
v3.1.1
======
------
Changes
-------
^^^^^^^
- Fixed bug that caused incorrect page size and DPI calculations on documents with mixed page sizes
v3.1
====
----
Changes
-------
^^^^^^^
- Default output format is now PDF/A-2b instead of PDF/A-1b
- Python 3.5 and macOS El Capitan are now supported platforms - no changes were
@@ -388,10 +412,10 @@ Changes
- Set up Travis CI automatic integration testing
v3.0
====
----
New features
------------
^^^^^^^^^^^^
- Easier installation with a Docker container or Python's ``pip`` package manager
- Eliminated many external dependencies, so it's easier to setup
@@ -414,7 +438,7 @@ New features
- Multiple images on the same PDF page are now supported
Changes
-------
^^^^^^^
- New, robust rewrite in Python 3.4+ with ruffus_ pipelines
- Now uses Ghostscript 9.14's improved color conversion model to preserve PDF colors
@@ -452,7 +476,7 @@ Changes
.. _JHOVE: http://jhove.sourceforge.net/
Release candidates
------------------
^^^^^^^^^^^^^^^^^^
- rc9:
@@ -520,12 +544,12 @@ where ``settings.txt`` contains *one argument per line*, for example:
Fixes
-----
^^^^^
- Handling of filenames containing spaces: fixed
Notes and known issues
----------------------
^^^^^^^^^^^^^^^^^^^^^^
- Some dependencies may work with lower versions than tested, so try
overriding dependencies if they are "in the way" to see if they work.
@@ -538,7 +562,7 @@ Notes and known issues
v2.2-stable (2014-09-29)
========================
------------------------
OCRmyPDF versions 1 and 2 were implemented as shell scripts. OCRmyPDF 3.0+ is a fork that gradually replaced all shell scripts with Python while maintaining the existing command line arguments. No one is maintaining old versions.

View File

@@ -1,4 +1,4 @@
PDF Security Issues
PDF security issues
===================
OCRmyPDF should only be used on PDFs you trust. It is not designed to protect you against malware.

View File

@@ -46,6 +46,10 @@ def complain(message):
print(*textwrap.wrap(message), file=sys.stderr)
# Hack to help debugger context find /usr/local/bin
if 'IDE_PROJECT_ROOTS' in os.environ:
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
if tesseract.version() < MINIMUM_TESS_VERSION:
complain(
"Please install tesseract {0} or newer "
@@ -56,14 +60,9 @@ if tesseract.version() < MINIMUM_TESS_VERSION:
# -------------
# Parser
parser = cmdline.get_argparse(
parser = argparse.ArgumentParser(
prog=PROGRAM_NAME,
version=VERSION,
fromfile_prefix_chars='@',
ignored_args=[
'touch_files_only', 'recreate_database', 'checksum_file_name',
'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
'forced_tasks', 'target_tasks', 'use_threads', 'jobs', 'log_file'],
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""\
Generates a searchable PDF or PDF/A from a regular PDF.
@@ -115,24 +114,22 @@ Online documentation is located at:
""")
parser.add_argument(
'input_file',
'input_file', metavar="input_pdf_or_image",
help="PDF file containing the images to be OCRed (or '-' to read from "
"standard input)")
parser.add_argument(
'output_file',
help="output searchable PDF file (or '-' to write to standard output)")
'output_file', metavar="output_pdf",
help="Output searchable PDF file (or '-' to write to standard output). "
"Existing files will be overwritten. If same as input file, the "
"input file will be updated only if processing is successful.")
parser.add_argument(
'-l', '--language', action='append',
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
"all language packs installed in your system). To specify multiple "
"languages, join them with '+' or issue this argument once for each "
"language.")
parser.add_argument(
'-j', '--jobs', metavar='N', type=int,
help="Use up to N CPU cores simultaneously (default: use all)")
"all language packs installed in your system). Use -l eng+deu for "
"multiple languages.")
parser.add_argument(
'--image-dpi', metavar='DPI', type=int,
help="for input image instead of PDF, use this DPI instead of file's")
help="For input image instead of PDF, use this DPI instead of file's.")
parser.add_argument(
'--output-type', choices=['pdfa', 'pdf'], default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
@@ -141,47 +138,74 @@ parser.add_argument(
"also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible.")
# --sidecar takes an optional FILE. When the flag is given without a value,
# argparse stores the `const` below; when the flag is absent it stores None.
# The null string '\0' is used as that "no filename supplied" sentinel because
# it is the only character that is invalid in file paths on all platforms.
# Note that bool('\0') is True in Python, so the sentinel must be detected
# with an explicit `== '\0'` comparison rather than a truthiness test.
parser.add_argument(
    '--sidecar', nargs='?', const='\0', default=None, metavar='FILE',
    help="Generate sidecar text files that contain the same text recognized "
         "by Tesseract. This may be useful for building an OCR text database. "
         "If FILE is omitted, the sidecar file will be named "
         "{output_file}.txt. "
         "If FILE is set to '-', the sidecar is written to stdout (a "
         "convenient way to preview OCR quality). The output file and sidecar "
         "may not both use stdout at the same time.")
parser.add_argument(
'--version', action='version', version=VERSION,
help="Print program version and exit")
jobcontrol = parser.add_argument_group(
"Job control options")
jobcontrol.add_argument(
'-j', '--jobs', metavar='N', type=int,
help="Use up to N CPU cores simultaneously (default: use all).")
jobcontrol.add_argument(
'-q', '--quiet', action='store_true', help="Suppress INFO messages")
jobcontrol.add_argument(
'-v', '--verbose', const="+", default=[], nargs='?', action="append",
help="Print more verbose messages for each additional verbose level")
metadata = parser.add_argument_group(
"Metadata options",
"Set output PDF/A metadata (default: use input document's metadata)")
"Set output PDF/A metadata (default: copy input document's metadata)")
metadata.add_argument(
'--title', type=str,
help="set document title (place multiple words in quotes)")
help="Set document title (place multiple words in quotes)")
metadata.add_argument(
'--author', type=str,
help="set document author")
help="Set document author")
metadata.add_argument(
'--subject', type=str,
help="set document subject description")
help="Set document subject description")
metadata.add_argument(
'--keywords', type=str,
help="set document keywords")
help="Set document keywords")
preprocessing = parser.add_argument_group(
"Image preprocessing options",
"Options to improve the quality of the final PDF and OCR")
preprocessing.add_argument(
'-r', '--rotate-pages', action='store_true',
help="automatically rotate pages based on detected text orientation")
help="Automatically rotate pages based on detected text orientation")
preprocessing.add_argument(
'--remove-background', action='store_true',
help="attempt to remove background from gray or color pages, setting it "
help="Attempt to remove background from gray or color pages, setting it "
"to white ")
preprocessing.add_argument(
'-d', '--deskew', action='store_true',
help="deskew each page before performing OCR")
help="Deskew each page before performing OCR")
preprocessing.add_argument(
'-c', '--clean', action='store_true',
help="clean pages from scanning artifacts before performing OCR, and send "
help="Clean pages from scanning artifacts before performing OCR, and send "
"the cleaned page to OCR, but do not include the cleaned page in "
"the output ")
"the output")
preprocessing.add_argument(
'-i', '--clean-final', action='store_true',
help="clean page as above, and incorporate the cleaned image in the final "
"PDF")
help="Clean page as above, and incorporate the cleaned image in the final "
"PDF. Might remove desired content.")
preprocessing.add_argument(
'--oversample', metavar='DPI', type=int, default=0,
help="oversample images to at least the specified DPI, to improve OCR "
help="Oversample images to at least the specified DPI, to improve OCR "
"results slightly")
ocrsettings = parser.add_argument_group(
@@ -189,11 +213,11 @@ ocrsettings = parser.add_argument_group(
"Control how OCR is applied")
ocrsettings.add_argument(
'-f', '--force-ocr', action='store_true',
help="rasterize any fonts or vector objects on each page, apply OCR, and "
help="Rasterize any fonts or vector objects on each page, apply OCR, and "
"save the rastered output (this rewrites the PDF)")
ocrsettings.add_argument(
'-s', '--skip-text', action='store_true',
help="skip OCR on any pages that already contain text, but include the "
help="Skip OCR on any pages that already contain text, but include the "
"page in final output; useful for PDFs that contain a mix of "
"images, text pages, and/or previously OCRed pages")
ocrsettings.add_argument(
@@ -203,23 +227,23 @@ ocrsettings.add_argument(
ocrsettings.add_argument(
'--skip-big', type=float, metavar='MPixels',
help="skip OCR on pages larger than the specified amount of megapixels, "
help="Skip OCR on pages larger than the specified amount of megapixels, "
"but include skipped pages in final output")
advanced = parser.add_argument_group(
"Advanced",
"Advanced options for power users")
"Advanced options to control Tesseract's OCR behavior")
advanced.add_argument(
'--tesseract-config', action='append', metavar='CFG', default=[],
help="additional Tesseract configuration files -- see documentation")
help="Additional Tesseract configuration files -- see documentation")
advanced.add_argument(
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
choices=range(0, 14),
help="set Tesseract page segmentation mode (see tesseract --help)")
help="Set Tesseract page segmentation mode (see tesseract --help)")
advanced.add_argument(
'--tesseract-oem', action='store', type=int, metavar='MODE',
choices=range(0, 4),
help=("set Tesseract 4.0 OCR engine mode: "
help=("Set Tesseract 4.0 OCR engine mode: "
"0 - original Tesseract only; "
"1 - neural nets LSTM only; "
"2 - Tesseract + LSTM; "
@@ -227,7 +251,7 @@ advanced.add_argument(
)
advanced.add_argument(
'--pdf-renderer', choices=['auto', 'tesseract', 'hocr', 'tess4'], default='auto',
help="choose OCR PDF renderer - the default option is to let OCRmyPDF "
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
"choose. The 'tesseract' PDF renderer is more accurate and does a "
"better job on document structure such as recognizing columns. It "
"also does a better job on non-Latin languages. However, it does "
@@ -237,22 +261,36 @@ advanced.add_argument(
"to 'tesseract', requires tesseract 4, and gives superior results.")
advanced.add_argument(
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
help='give up on OCR after the timeout, but copy the preprocessed page '
help='Give up on OCR after the timeout, but copy the preprocessed page '
'into the final output')
advanced.add_argument(
'--rotate-pages-threshold', default=14.0, type=float, metavar='CONFIDENCE',
help="only rotate pages when confidence is above this value (arbitrary "
help="Only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)")
advanced.add_argument(
'--pdfa-image-compression', choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="Specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.")
debugging = parser.add_argument_group(
"Debugging",
"Arguments to help with troubleshooting and debugging")
debugging.add_argument(
'-k', '--keep-temporary-files', action='store_true',
help="keep temporary files (helpful for debugging)")
help="Keep temporary files (helpful for debugging)")
debugging.add_argument(
'-g', '--debug-rendering', action='store_true',
help="render each page twice with debug information on second page")
help="Render each page twice with debug information on second page")
debugging.add_argument(
'--flowchart', type=str,
help="Generate the pipeline execution flowchart")
def check_options_languages(options, _log):
@@ -269,7 +307,7 @@ def check_options_languages(options, _log):
"data for the following requested languages: \n")
for lang in (set(options.language) - tesseract.languages()):
msg += lang + '\n'
raise argparse.ArgumentError(msg)
raise argparse.ArgumentError(None, msg)
def check_options_output(options, log):
@@ -295,13 +333,23 @@ def check_options_output(options, log):
"--pdf-renderer=tesseract.")
lossless_reconstruction = False
if options.pdf_renderer == 'hocr':
if options.pdf_renderer in ('hocr', 'tess4'):
if not any((options.deskew, options.clean_final, options.force_ocr,
options.remove_background)):
lossless_reconstruction = True
options.lossless_reconstruction = lossless_reconstruction
def check_options_sidecar(options, log):
    r"""Fill in the sidecar filename when only a placeholder was supplied.

    If ``options.sidecar`` equals the placeholder ``'\0'`` (presumably set
    when ``--sidecar`` is given without a filename -- confirm against the
    argument parser), it is replaced by ``options.output_file + '.txt'``.
    Any other value is left untouched.

    options -- parsed command line options; ``sidecar`` may be rewritten
    log -- unused here; accepted so all check_options_* share a signature

    Raises argparse.ArgumentError when the placeholder is present but the
    output file is ``'-'`` (stdout), since no filename can be derived.
    """
    if options.sidecar != '\0':
        return

    if options.output_file == '-':
        raise argparse.ArgumentError(
            None,
            "--sidecar filename must be specified when output file is "
            "stdout.")

    options.sidecar = options.output_file + '.txt'
def check_options_preprocessing(options, log):
if any((options.clean, options.clean_final)):
from .exec import unpaper
@@ -325,6 +373,7 @@ def check_options_preprocessing(options, log):
def check_options_ocr_behavior(options, log):
if options.force_ocr and options.skip_text:
raise argparse.ArgumentError(
None,
"Error: --force-ocr and --skip-text are mutually incompatible.")
if options.redo_ocr and (options.skip_text or options.force_ocr):
@@ -350,6 +399,12 @@ def check_options_advanced(options, log):
raise MissingDependencyError(
"--pdf-renderer tess4 requires Tesseract 4.x "
"commit 3d9fb3b or later")
if options.pdfa_image_compression != 'auto' and \
options.output_type != 'pdfa':
log.warning(
"--pdfa-image-compression argument has no effect when "
"--output-type is not 'pdfa'"
)
def check_options_metadata(options, log):
@@ -371,6 +426,7 @@ def check_options(options, log):
check_options_languages(options, log)
check_options_metadata(options, log)
check_options_output(options, log)
check_options_sidecar(options, log)
check_options_preprocessing(options, log)
check_options_ocr_behavior(options, log)
check_options_advanced(options, log)
@@ -389,8 +445,9 @@ def check_options(options, log):
# Logging
def logging_factory(logger_name, listargs):
log_file_name, verbose = listargs
def logging_factory(logger_name, logger_args):
verbose = logger_args['verbose']
quiet = logger_args['quiet']
root_logger = logging.getLogger(logger_name)
root_logger.setLevel(logging.DEBUG)
@@ -400,6 +457,8 @@ def logging_factory(logger_name, listargs):
handler.setFormatter(formatter_)
if verbose:
handler.setLevel(logging.DEBUG)
elif quiet:
handler.setLevel(logging.WARNING)
else:
handler.setLevel(logging.INFO)
root_logger.addHandler(handler)
@@ -425,7 +484,7 @@ def available_cpu_count():
def cleanup_ruffus_error_message(msg):
msg = re.sub(r'\s+', r' ', msg, re.MULTILINE)
msg = re.sub(r'\s+', r' ', msg)
msg = re.sub(r"\((.+?)\)", r'\1', msg)
msg = msg.strip()
return msg
@@ -559,8 +618,10 @@ def run_pipeline():
if not check_closed_streams(options):
return ExitCode.bad_args
logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
_log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
logging_factory, __name__, [None, options.verbose])
logging_factory, __name__, logger_args)
_log.debug('ocrmypdf ' + VERSION)
_log.debug('tesseract ' + tesseract.version())
@@ -600,7 +661,7 @@ def run_pipeline():
return ExitCode.bad_args
elif not is_file_writable(options.output_file):
_log.error(textwrap.dedent("""\
Cutput file location is not writable."""))
Output file location is not writable."""))
return ExitCode.file_access_error
manager = JobContextManager()
@@ -646,7 +707,9 @@ def run_pipeline():
_log.error(e)
return ExitCode.other_error
if options.output_file != '-':
if options.flowchart:
_log.info("Flowchart saved to {}".format(options.flowchart))
elif options.output_file != '-':
if options.output_type == 'pdfa':
pdfa_info = file_claims_pdfa(options.output_file)
if pdfa_info['pass']:

View File

@@ -48,7 +48,7 @@ class InputFileError(ExitCodeException):
class SubprocessOutputError(ExitCodeException):
exit_code = ExitCode.other_error
exit_code = ExitCode.child_process_error
class EncryptedPdfError(ExitCodeException):

View File

@@ -5,8 +5,9 @@ from tempfile import NamedTemporaryFile
from subprocess import run, PIPE, STDOUT, CalledProcessError
from shutil import copy
from functools import lru_cache
import re
from . import get_program
from ..pdfa import SRGB_ICC_PROFILE
from ..exceptions import SubprocessOutputError
@lru_cache(maxsize=1)
@@ -27,6 +28,10 @@ def version():
return version.strip()
def _gs_error_reported(stream):
return re.search(r'error', stream, flags=re.IGNORECASE)
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
pageno=1):
with NamedTemporaryFile(delete=True) as tmp:
@@ -46,7 +51,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
p = run(args_gs, stdout=PIPE, stderr=STDOUT,
universal_newlines=True)
if 'error' in p.stdout.lower():
if _gs_error_reported(p.stdout):
log.error(p.stdout)
else:
log.debug(p.stdout)
@@ -54,10 +59,32 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
if p.returncode == 0:
copy(tmp.name, output_file)
else:
log.error('Ghostscript rendering failed')
log.error('Ghostscript rasterizing failed')
raise SubprocessOutputError()
def generate_pdfa(pdf_pages, output_file, log, threads=1):
def generate_pdfa(pdf_pages, output_file, compression, log, threads=1):
compression_args = []
if compression == 'jpeg':
compression_args = [
"-dAutoFilterColorImages=false",
"-dColorImageFilter=/DCTEncode",
"-dAutoFilterGrayImages=false",
"-dGrayImageFilter=/DCTEncode",
]
elif compression == 'lossless':
compression_args = [
"-dAutoFilterColorImages=false",
"-dColorImageFilter=/FlateEncode",
"-dAutoFilterGrayImages=false",
"-dGrayImageFilter=/FlateEncode",
]
else:
compression_args = [
"-dAutoFilterColorImages=true",
"-dAutoFilterGrayImages=true",
]
with NamedTemporaryFile(delete=True) as gs_pdf:
args_gs = [
get_program("gs"),
@@ -68,7 +95,8 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
"-sDEVICE=pdfwrite",
"-dAutoRotatePages=/None",
"-sColorConversionStrategy=/RGB",
"-sProcessColorModel=DeviceRGB",
"-sProcessColorModel=DeviceRGB"
] + compression_args + [
"-dJPEGQ=95",
"-dPDFA=2",
"-dPDFACompatibilityPolicy=1",
@@ -78,7 +106,7 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
p = run(args_gs, stdout=PIPE, stderr=STDOUT,
universal_newlines=True)
if 'error' in p.stdout.lower():
if _gs_error_reported(p.stdout):
log.error(p.stdout)
elif 'overprint mode not set' in p.stdout:
# Unless someone is going to print PDF/A documents on a
@@ -96,4 +124,5 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
# PDF/A - check PDF/A status elsewhere
copy(gs_pdf.name, output_file)
else:
log.error('Ghostscript PDF/A failed')
log.error('Ghostscript PDF/A rendering failed')
raise SubprocessOutputError()

View File

@@ -11,6 +11,7 @@ from ..helpers import page_number
from . import get_program
from collections import namedtuple
from textwrap import dedent
import PyPDF2 as pypdf
from subprocess import Popen, PIPE, CalledProcessError, \
TimeoutExpired, check_output, STDOUT, DEVNULL
@@ -199,7 +200,7 @@ def page_timedout(log, input_file):
log.warning(prefix + " took too long to OCR - skipping")
def _generate_null_hocr(output_hocr, image):
def _generate_null_hocr(output_hocr, output_sidecar, image):
"""Produce a .hocr file that reports no text detected on a page that is
the same size as the input image."""
from PIL import Image
@@ -209,22 +210,29 @@ def _generate_null_hocr(output_hocr, image):
with open(output_hocr, 'w', encoding="utf-8") as f:
f.write(HOCR_TEMPLATE.format(w, h))
with open(output_sidecar, 'w', encoding='utf-8') as f:
f.write('[skipped page]')
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
def generate_hocr(input_file, output_files, language: list, engine_mode,
tessconfig: list,
timeout: float, pagesegmode: int, log):
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
prefix = os.path.splitext(output_hocr)[0]
args_tesseract = tess_base_args(language, engine_mode)
if pagesegmode is not None:
args_tesseract.extend([psm(), str(pagesegmode)])
# Reminder: test suite tesseract spoofers will break after any changes
# to the number or order of parameters here
args_tesseract.extend([
input_file,
badxml,
prefix,
'txt',
'hocr'
] + tessconfig)
try:
@@ -237,25 +245,30 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
# Temporary workaround to hocrTransform not being able to function if
# it does not have a valid hOCR file.
page_timedout(log, input_file)
_generate_null_hocr(output_hocr, input_file)
_generate_null_hocr(output_hocr, output_sidecar, input_file)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_file)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
_generate_null_hocr(output_hocr, input_file)
_generate_null_hocr(output_hocr, output_sidecar, input_file)
return
raise e from e
else:
tesseract_log_output(log, stdout, input_file)
if os.path.exists(badxml + '.html'):
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
shutil.move(badxml + '.html', badxml)
elif os.path.exists(badxml + '.hocr'):
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
shutil.move(badxml + '.hocr', badxml)
# Tesseract 3.02 appends suffix ".html" instead of ".hocr". For
# consistency rename its output to .hocr
if os.path.exists(prefix + '.html'):
shutil.move(prefix + '.html', prefix + '.tmp')
elif os.path.exists(prefix + '.hocr'):
shutil.move(prefix + '.hocr', prefix + '.tmp')
# The sidecar text file will get the suffix .txt; rename it to
# whatever caller wants it named
if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_sidecar)
# Tesseract 3.03 inserts source filename into hocr file without
# escaping it, creating invalid XML and breaking the parser.
@@ -264,7 +277,7 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
regex_nested_single_quotes = re.compile(
r"""title='image "([^"]*)";""")
with open(badxml, mode='r', encoding='utf-8') as f_in, \
with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \
open(output_hocr, mode='w', encoding='utf-8') as f_out:
for line in f_in:
line = regex_nested_single_quotes.sub(
@@ -272,14 +285,36 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
f_out.write(line)
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
engine_mode, text_only: bool,
def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
    """Produce placeholder outputs for a page that is not being OCRed.

    text_only -- if true, write a blank PDF page instead of linking skip_pdf
    skip_pdf -- existing PDF standing in for the page
    output_pdf -- path of the PDF to create
    output_text -- path of the text file to create; always receives the
                   marker string '[skipped page]'
    """
    with open(output_text, 'w') as marker:
        marker.write('[skipped page]')

    if text_only:
        # For text only we must create a blank page with dimensions identical
        # to the skip page because this is equivalent to a page with no text
        first_page = pypdf.PdfFileReader(skip_pdf).pages[0]
        with open(output_pdf, 'wb') as out:
            blank_pdf = pypdf.PdfFileWriter()
            media_box = first_page.mediaBox
            blank_pdf.addBlankPage(
                media_box.getWidth(), media_box.getHeight())
            blank_pdf.write(out)
    else:
        # Otherwise the skip page itself is the output
        os.symlink(skip_pdf, output_pdf)
def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
language: list, engine_mode, text_only: bool,
tessconfig: list, timeout: float, pagesegmode: int, log):
'''Use Tesseract to render a PDF.
input_image -- image to analyze
skip_pdf -- if we time out, use this file as output
output_pdf -- file to generate
output_text -- OCR text file
language -- list of languages to consider
engine_mode -- engine mode argument for tess v4
text_only -- enable tesseract text only mode?
@@ -296,10 +331,15 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
if text_only:
args_tesseract.extend(['-c', 'textonly_pdf=1'])
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
# Reminder: test suite tesseract spoofers will break after any changes
# to the number or order of parameters here
args_tesseract.extend([
input_image,
os.path.splitext(output_pdf)[0], # Tesseract appends suffix
'pdf'
prefix,
'txt',
'pdf',
] + tessconfig)
try:
@@ -307,16 +347,18 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
stdout = check_output(
args_tesseract, close_fds=True, stderr=STDOUT,
universal_newlines=True, timeout=timeout)
if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_text)
except TimeoutExpired:
page_timedout(log, input_image)
shutil.copy(skip_pdf, output_pdf)
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image)
if 'read_params_file: parameter not found' in e.output:
raise TesseractConfigError() from e
if 'Image too large' in e.output:
shutil.copy(skip_pdf, output_pdf)
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
return
raise e from e
else:

View File

@@ -423,7 +423,7 @@ def _find_images(pdf, container, shorthand=None):
"""
if container.get('/Type') == '/Page':
if container.get('/Type') == '/Page' and '/Contents' in container:
# For a /Page the content stream is attached to the page's /Contents
page = container
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
@@ -457,6 +457,9 @@ def _find_images(pdf, container, shorthand=None):
def _page_has_text(pdf, page):
if not '/Contents' in page:
return False
# Simple test
text = page.extractText()
if text.strip() != '':

View File

@@ -247,7 +247,7 @@ def is_ocr_required(pageinfo, log, options):
"skipping all processing on this page"))
ocr_required = False
if ocr_required and options.skip_big:
if ocr_required and options.skip_big and pageinfo['images']:
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
if pixel_count > (options.skip_big * 1000000):
ocr_required = False
@@ -393,15 +393,16 @@ def rasterize_with_ghostscript(
pageinfo = get_pageinfo(input_file, context)
device = 'png16m' # 24-bit
if all(image['comp'] == 1 for image in pageinfo['images']):
if all(image['bpc'] == 1 for image in pageinfo['images']):
device = 'pngmono'
elif all(image['bpc'] > 1 and image['color'] == 'index'
for image in pageinfo['images']):
device = 'png256'
elif all(image['bpc'] > 1 and image['color'] == 'gray'
for image in pageinfo['images']):
device = 'pnggray'
if pageinfo['images']:
if all(image['comp'] == 1 for image in pageinfo['images']):
if all(image['bpc'] == 1 for image in pageinfo['images']):
device = 'pngmono'
elif all(image['bpc'] > 1 and image['color'] == 'index'
for image in pageinfo['images']):
device = 'png256'
elif all(image['bpc'] > 1 and image['color'] == 'gray'
for image in pageinfo['images']):
device = 'pnggray'
log.debug("Rasterize {0} with {1}".format(
os.path.basename(input_file), device))
@@ -482,13 +483,13 @@ def select_ocr_image(
def ocr_tesseract_hocr(
input_file,
output_file,
output_files,
log,
context):
options = context.get_options()
tesseract.generate_hocr(
input_file=input_file,
output_hocr=output_file,
output_files=output_files,
language=options.language,
engine_mode=options.tesseract_oem,
tessconfig=options.tesseract_config,
@@ -517,7 +518,10 @@ def select_visible_page_image(
image = next(ii for ii in infiles if ii.endswith(image_suffix))
pageinfo = get_pageinfo(image, context)
if all(orig_image['enc'] == 'jpeg' for orig_image in pageinfo['images']):
if pageinfo['images'] and \
all(im['enc'] == 'jpeg' for im in pageinfo['images']):
log.debug('{:4d}: JPEG input -> JPEG output'.format(
page_number(image)))
# If all images were JPEGs originally, produce a JPEG as output
im = Image.open(image)
@@ -575,12 +579,12 @@ def select_image_layer(
def render_hocr_page(
input_file,
infiles,
output_file,
log,
context):
options = context.get_options()
hocr = input_file
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
pageinfo = get_pageinfo(hocr, context)
dpi = get_page_square_dpi(pageinfo, options)
@@ -589,14 +593,22 @@ def render_hocr_page(
showBoundingboxes=False, invisibleText=True)
def flatten_groups(groups):
    """Yield the members of ``groups``, expanding nested groups by one level.

    An element for which ``is_iterable_notstr`` is true (presumably any
    iterable other than a string -- confirm against that helper) has each of
    its members yielded in turn; every other element is yielded unchanged.
    """
    for entry in groups:
        if not is_iterable_notstr(entry):
            yield entry
            continue
        for member in entry:
            yield member
def render_hocr_debug_page(
infiles,
output_file,
log,
context):
options = context.get_options()
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
image = next(ii for ii in infiles if ii.endswith('.image'))
hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))
pageinfo = get_pageinfo(image, context)
dpi = get_page_square_dpi(pageinfo, options)
@@ -611,8 +623,10 @@ def combine_layers(
output_file,
log,
context):
text = next(ii for ii in infiles if ii.endswith('.text.pdf'))
image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf'))
text = next(ii for ii in flatten_groups(infiles)
if ii.endswith('.text.pdf'))
image = next(ii for ii in flatten_groups(infiles)
if ii.endswith('.image-layer.pdf'))
pdf_text = pypdf.PdfFileReader(open(text, "rb"))
pdf_image = pypdf.PdfFileReader(open(image, "rb"))
@@ -678,21 +692,27 @@ def combine_layers(
def ocr_tesseract_and_render_pdf(
infiles,
output_file,
outfiles,
log,
context):
options = context.get_options()
input_image = next((ii for ii in infiles if ii.endswith('.image')), '')
input_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
if not input_image:
# Skipping this page
re_symlink(input_pdf, output_file, log)
re_symlink(input_pdf, output_pdf, log)
with open(output_text, 'w') as f:
f.write('[skipped page]')
return
tesseract.generate_pdf(
input_image=input_image,
skip_pdf=input_pdf,
output_pdf=output_file,
output_pdf=output_pdf,
output_text=output_text,
language=options.language,
engine_mode=options.tesseract_oem,
text_only=False,
@@ -704,19 +724,23 @@ def ocr_tesseract_and_render_pdf(
def ocr_tesseract_textonly_pdf(
infiles,
output_file,
outfiles,
log,
context):
options = context.get_options()
input_image = next((ii for ii in infiles if ii.endswith('.ocr.png')), '')
if not input_image:
raise ValueError("No image rendered?")
skip_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
tesseract.generate_pdf(
input_image=input_image,
skip_pdf=skip_pdf,
output_pdf=output_file,
output_pdf=output_pdf,
output_text=output_text,
language=options.language,
engine_mode=options.tesseract_oem,
text_only=True,
@@ -773,7 +797,8 @@ def generate_postscript_stub(
def skip_page(
input_file,
output_file,
log):
log,
context):
# The purpose of this step is its filter to forward only the skipped
# files (.skip.oriented.pdf) while disregarding the processed ones
# (.ocr.oriented.pdf). Alternative would be for merge_pages to filter
@@ -782,7 +807,7 @@ def skip_page(
def merge_pages_ghostscript(
input_files,
input_files_groups,
output_file,
log,
context):
@@ -800,17 +825,24 @@ def merge_pages_ghostscript(
key += 1
return key
input_files = (f for f in flatten_groups(input_files_groups)
if not f.endswith('.txt'))
pdf_pages = sorted(input_files, key=input_file_order)
log.debug("Final pages: " + "\n".join(pdf_pages))
ghostscript.generate_pdfa(pdf_pages, output_file, log, options.jobs or 1)
ghostscript.generate_pdfa(
pdf_pages, output_file, options.pdfa_image_compression,
log, options.jobs or 1)
def merge_pages_qpdf(
input_files,
input_files_groups,
output_file,
log,
context):
options = context.get_options()
input_files = list(f for f in flatten_groups(input_files_groups)
if not f.endswith('.txt'))
metadata_file = next(
(ii for ii in input_files if ii.endswith('.repaired.pdf')))
input_files.remove(metadata_file)
@@ -844,6 +876,40 @@ def merge_pages_qpdf(
qpdf.merge(pdf_pages, output_file)
def merge_sidecars(
        input_files_groups,
        output_file,
        log,
        context):
    """Concatenate the per-page text files into one sidecar text output.

    input_files_groups -- possibly nested collection of input filenames;
                          only names ending in '.txt' are used
    output_file -- destination path, or '-' to write to standard output
    log -- unused here
    context -- job context; ``get_pdfinfo()`` supplies one entry per page

    Pages are separated by a form feed character. A page with no text file
    gets the placeholder '[OCR skipped on page N]' instead.
    """
    pdfinfo = context.get_pdfinfo()

    # One slot per page of the document; a slot stays None when no text file
    # was produced for that page
    txt_files = [None] * len(pdfinfo)
    for infile in flatten_groups(input_files_groups):
        if infile.endswith('.txt'):
            # NOTE(review): page_number() presumably returns the 1-based page
            # number encoded in the filename -- confirm against helpers
            txt_files[page_number(infile) - 1] = infile

    def write_pages(stream):
        # Loop variable deliberately not called page_number, to avoid
        # shadowing the helper function used above
        for index, txt_file in enumerate(txt_files):
            if index != 0:
                stream.write('\f')  # Form feed between pages
            if txt_file:
                # Read as UTF-8 explicitly, matching the encoding used to
                # write the merged file below, rather than relying on the
                # locale's default encoding
                with open(txt_file, 'r', encoding='utf-8') as in_:
                    stream.write(in_.read())
            else:
                stream.write('[OCR skipped on page {}]'.format(index + 1))

    if output_file == '-':
        write_pages(sys.stdout)
        sys.stdout.flush()
    else:
        with open(output_file, 'w', encoding='utf-8') as out:
            write_pages(out)
def copy_final(
input_files,
output_file,
@@ -948,7 +1014,7 @@ def build_pipeline(options, work_folder, log, context):
task_func=ocr_tesseract_hocr,
input=task_select_ocr_image,
filter=suffix(".ocr.png"),
output=".hocr",
output=[".hocr", ".txt"],
extras=[log, context])
task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
@@ -980,8 +1046,8 @@ def build_pipeline(options, work_folder, log, context):
task_render_hocr_page = main_pipeline.transform(
task_func=render_hocr_page,
input=task_ocr_tesseract_hocr,
filter=suffix('.hocr'),
output='.text.pdf',
filter=regex(r".*/(\d{6})(?:\.hocr)"),
output=os.path.join(work_folder, r'\1.text.pdf'),
extras=[log, context])
task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
@@ -1001,7 +1067,8 @@ def build_pipeline(options, work_folder, log, context):
task_func=ocr_tesseract_textonly_pdf,
input=[task_select_ocr_image, task_orient_page],
filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
output=os.path.join(work_folder, r'\1.text.pdf'),
output=[os.path.join(work_folder, r'\1.text.pdf'),
os.path.join(work_folder, r'\1.text.txt')],
extras=[log, context])
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
@@ -1024,7 +1091,8 @@ def build_pipeline(options, work_folder, log, context):
task_func=ocr_tesseract_and_render_pdf,
input=[task_select_visible_page_image, task_orient_page],
filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
output=os.path.join(work_folder, r'\1.rendered.pdf'),
output=[os.path.join(work_folder, r'\1.rendered.pdf'),
os.path.join(work_folder, r'\1.rendered.txt')],
extras=[log, context])
task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
@@ -1048,7 +1116,7 @@ def build_pipeline(options, work_folder, log, context):
filter=suffix('.skip.oriented.pdf'),
output='.done.pdf',
output_dir=work_folder,
extras=[log])
extras=[log, context])
# Merge pages
task_merge_pages_ghostscript = main_pipeline.merge(
@@ -1073,6 +1141,15 @@ def build_pipeline(options, work_folder, log, context):
extras=[log, context])
task_merge_pages_qpdf.active_if(options.output_type == 'pdf')
task_merge_sidecars = main_pipeline.merge(
task_func=merge_sidecars,
input=[task_ocr_tesseract_hocr,
task_ocr_tesseract_and_render_pdf,
task_ocr_tesseract_textonly_pdf],
output=options.sidecar,
extras=[log, context])
task_merge_sidecars.active_if(options.sidecar)
# Finalize
task_copy_final = main_pipeline.merge(
task_func=copy_final,

View File

@@ -11,3 +11,4 @@ ignore =
[tool:pytest]
norecursedirs = lib .pc .git output cache resources
testpaths = tests
addopts = -n auto

View File

@@ -1,2 +1,3 @@
pytest >= 3.0
pytest-helpers-namespace
pytest-helpers-namespace
pytest-xdist

View File

@@ -34,6 +34,9 @@ In some cases they were converted from one image format to another without other
* - typewriter.png, 2400dpi.pdf
- `Wikimedia: Triumph typewrtier text Linzensoep`_
* Creative Commons BY-SA 2.5
* - baiona.png
- `Wikimedia: Baionako udalerri mugakideak`_
- Creative Commons BY-SA 4.0
Files generated for this project
@@ -85,6 +88,9 @@ under the terms of the license in LICENSE.rst.
* - overlay.pdf
- @maxandersen
- PDF file generated by PDFPen pro that triggered content stream parse errors
* - no_contents.pdf
- @jbarlow83
- synthetic PDF with a blank page that has no /Contents entry
Assemblies
==========
@@ -115,4 +121,6 @@ These test resources are assemblies from other previously mentioned files, relea
.. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux
.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png

BIN
tests/resources/baiona.png Normal file
View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

Binary file not shown.

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import sys
import os
"""Replicate Ghostscript raster failure while allowing rendering"""
def real_ghostscript(argv):
    """Replace the current process with ``gs``, forwarding ``argv[1:]``.

    argv -- full argument list of this script; element 0 is swapped for 'gs'
    """
    forwarded = argv[1:]
    os.execvp("gs", ['gs'] + forwarded)
    return  # Not reachable
def main():
    """Act as a Ghostscript stand-in whose rasterizing calls always fail.

    Behavior depends on ``sys.argv``:
    * '--version' present: print a fixed version and a SPOOFED marker, then
      exit with status 0.
    * '-sDEVICE=pdfwrite' present: hand over to the real ``gs``.
    * anything else: print an error message and exit with status 1.
    """
    if '--version' in sys.argv:
        print('9.20')
        # __file__ is the module attribute holding this script's path; the
        # previous name __filename__ does not exist and raised NameError
        print('SPOOFED: ' + os.path.basename(__file__))
        sys.exit(0)

    # For any rendering calls (device == pdfwrite) call real ghostscript
    if '-sDEVICE=pdfwrite' in sys.argv:
        real_ghostscript(sys.argv)
        return

    # Fail
    print("ERROR: Ghost story archive not found")
    sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
import sys
import os
"""Replicate Ghostscript render failure while allowing rasterizing"""
def real_ghostscript(argv):
    """Replace the current process with ``gs``, forwarding ``argv[1:]``.

    argv -- full argument list of this script; element 0 is swapped for 'gs'
    """
    forwarded = argv[1:]
    os.execvp("gs", ['gs'] + forwarded)
    return  # Not reachable
def main():
    """Act as a Ghostscript stand-in whose PDF rendering calls always fail.

    Behavior depends on ``sys.argv``:
    * '--version' present: print a fixed version and a SPOOFED marker, then
      exit with status 0.
    * '-sDEVICE=pdfwrite' absent: hand over to the real ``gs``.
    * otherwise (a pdfwrite call): print an error and exit with status 1.
    """
    if '--version' in sys.argv:
        print('9.20')
        # __file__ is the module attribute holding this script's path; the
        # previous name __filename__ does not exist and raised NameError
        print('SPOOFED: ' + os.path.basename(__file__))
        sys.exit(0)

    # For any rasterize calls (device != pdfwrite) call real ghostscript
    if '-sDEVICE=pdfwrite' not in sys.argv:
        real_ghostscript(sys.argv)
        return

    # Fail
    print("ERROR: Casper is not a friendly ghost")
    sys.exit(1)
if __name__ == '__main__':
main()

View File

@@ -36,6 +36,10 @@ def real_tesseract():
def main():
operation = sys.argv[-1]
sidecar = False
if sys.argv[-2] == 'txt':
sidecar = True
# For any unexpected operation, defer to the real tesseract binary
# Currently this includes all use of "--tesseract-config"
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
@@ -92,16 +96,22 @@ def main():
return
if operation == 'stdout':
# tesseract [--options] ... input stdout
input_file = sys.argv[-2]
output_file = 'stdout'
sidecar_file = ''
else:
input_file = sys.argv[-3]
output_file = sys.argv[-2]
# tesseract [--options] ... input output txt hocr|pdf
input_file = sys.argv[-4]
output_file = sys.argv[-3]
sidecar_file = sys.argv[-3]
if operation == 'hocr':
output_file += '.hocr'
sidecar_file += '.txt'
elif operation == 'pdf':
output_file += '.pdf'
sidecar_file += '.txt'
with open(input_file, 'rb') as f:
m.update(f.read())
@@ -112,6 +122,8 @@ def main():
print("Tesseract cache hit", file=sys.stderr)
if operation != 'stdout':
shutil.copy(cache_name, output_file)
if sidecar:
shutil.copy(cache_name + '.sidecar', sidecar_file)
# Replicate output
with open(cache_name + '.stdout', 'rb') as f:
@@ -149,6 +161,8 @@ def main():
shutil.copy(output_file, cache_name)
else:
print("Could not find output file", file=sys.stderr)
if sidecar and os.path.exists(sidecar_file):
shutil.copy(sidecar_file, cache_name + '.sidecar')
else:
open(cache_name, 'w').close()

View File

@@ -53,18 +53,22 @@ def main():
print('List of available languages (1):\neng', file=sys.stderr)
sys.exit(0)
elif sys.argv[-1] == 'hocr':
inputf = sys.argv[-3]
output = sys.argv[-2]
inputf = sys.argv[-4]
output = sys.argv[-3]
with Image.open(inputf) as im, \
open(output + '.hocr', 'w', encoding='utf-8') as f:
w, h = im.size
f.write(HOCR_TEMPLATE.format(str(w), str(h)))
with open(output + '.txt', 'w') as f:
f.write('')
elif sys.argv[-1] == 'pdf':
inputf = sys.argv[-3]
output = sys.argv[-2]
inputf = sys.argv[-4]
output = sys.argv[-3]
pdf_bytes = img2pdf.convert([inputf], dpi=300)
with open(output + '.pdf', 'wb') as f:
f.write(pdf_bytes)
with open(output + '.txt', 'w') as f:
f.write('')
elif sys.argv[-1] == 'stdout':
inputf = sys.argv[-2]
print("""Orientation: 0

View File

@@ -51,6 +51,16 @@ def spoof_no_tess_pdfa_warning():
return spoof(tesseract='tesseract_noop.py', gs='gs_feature_elision.py')
@pytest.fixture
def spoof_no_tess_gs_render_fail():
    """Spoofed setup: no-op tesseract plus the gs_render_failure.py stand-in."""
    replacements = {
        'tesseract': 'tesseract_noop.py',
        'gs': 'gs_render_failure.py',
    }
    return spoof(**replacements)
@pytest.fixture
def spoof_no_tess_gs_raster_fail():
    """Spoofed setup: no-op tesseract plus the gs_raster_failure.py stand-in."""
    replacements = {
        'tesseract': 'tesseract_noop.py',
        'gs': 'gs_raster_failure.py',
    }
    return spoof(**replacements)
@pytest.fixture
def spoof_qpdf_always_error():
return spoof(qpdf='qpdf_dummy_return2.py')
@@ -137,14 +147,18 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
resources, outdir):
outfile = outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer)
check_ocrmypdf(
resources / pdf,
outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer),
outfile,
'-dc',
'-v', '1',
'--output-type', output_type,
'--sidecar',
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
assert outfile.with_suffix('.pdf.txt').exists()
@pytest.mark.parametrize("output_type", [
'pdfa', 'pdf'
@@ -738,6 +752,11 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
"Checks for a Decimal quantize error with high DPI, etc"
check_ocrmypdf(resources / '2400dpi.pdf', outpdf,
env=spoof_tesseract_cache)
pdfinfo = pdf_get_all_pageinfo(outpdf)
image = pdfinfo[0]['images'][0]
assert image['dpi_w'] == image['dpi_h']
assert image['dpi_w'] == 2400
def test_overlay(spoof_tesseract_noop, resources, outpdf):
@@ -835,3 +854,138 @@ def test_pagesize_consistency(renderer, resources, outpdf):
assert isclose(before_dims[0], after_dims[0])
assert isclose(before_dims[1], after_dims[1])
def test_skip_big_with_no_images(spoof_tesseract_noop, resources, outpdf):
    """Run with --skip-big and --force-ocr on blank.pdf; it must succeed."""
    source_pdf = resources / 'blank.pdf'
    cli_args = ['--skip-big', '5', '--force-ocr']
    check_ocrmypdf(source_pdf, outpdf, *cli_args, env=spoof_tesseract_noop)
def test_gs_render_failure(spoof_no_tess_gs_render_fail, resources, outpdf):
    """A failing Ghostscript render must yield the child_process_error code."""
    source_pdf = resources / 'blank.pdf'
    result = run_ocrmypdf(
        source_pdf, outpdf, env=spoof_no_tess_gs_render_fail)
    p, out, err = result
    print(err)
    assert p.returncode == ExitCode.child_process_error
def test_gs_raster_failure(spoof_no_tess_gs_raster_fail, resources, outpdf):
    """A failing Ghostscript raster must yield the child_process_error code."""
    source_pdf = resources / 'ccitt.pdf'
    result = run_ocrmypdf(
        source_pdf, outpdf, env=spoof_no_tess_gs_raster_fail)
    p, out, err = result
    print(err)
    assert p.returncode == ExitCode.child_process_error
def test_no_contents(spoof_tesseract_noop, resources, outpdf):
    """Run with --force-ocr on no_contents.pdf; it must succeed."""
    source_pdf = resources / 'no_contents.pdf'
    check_ocrmypdf(
        source_pdf, outpdf,
        '--force-ocr',
        env=spoof_tesseract_noop)
@pytest.mark.parametrize('image', [
    'baiona.png',
    'baiona_gray.png',
    'congress.jpg'
])
def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
                               resources, image, outpdf):
    """With --output-type pdf the image codec and colorspace must not change.

    The image is piped to ocrmypdf on stdin. PNG inputs must not come out as
    JPEG, JPEG inputs must stay JPEG, and RGB/gray inputs keep their
    colorspace.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the pixel mode is needed later; read it and close the image file
    # immediately instead of leaving the handle open for the whole test
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150', '--output-type', 'pdf', '-', output_file]
        p = Popen(
            p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
            stdin=input_stream, env=spoof_tesseract_noop)
        out, err = p.communicate()

    # Include captured stderr so a failing run explains itself
    assert p.returncode == ExitCode.ok, err

    pdfinfo = pdf_get_all_pageinfo(output_file)

    pdfimage = pdfinfo[0]['images'][0]

    if input_file.endswith('.png'):
        assert pdfimage['enc'] != 'jpeg', \
            "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage['enc'] == 'jpeg', \
            "Lossy compression changed to lossless!"
    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
@pytest.mark.parametrize('image,compression', [
    ('baiona.png', 'jpeg'),
    ('baiona_gray.png', 'lossless'),
    ('congress.jpg', 'lossless')
])
def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
                             resources, image, compression, outpdf):
    """--pdfa-image-compression must force the requested image codec.

    The image is piped to ocrmypdf on stdin with --output-type pdfa. The
    first image of the output must use the requested encoding, and RGB/gray
    inputs must keep their colorspace.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the pixel mode is needed later; read it and close the image file
    # immediately instead of leaving the handle open for the whole test
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150', '--output-type', 'pdfa',
            '--pdfa-image-compression', compression,
            '-', output_file]
        p = Popen(
            p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
            stdin=input_stream, env=spoof_tesseract_noop)
        out, err = p.communicate()

    # Include captured stderr so a failing run explains itself
    assert p.returncode == ExitCode.ok, err

    pdfinfo = pdf_get_all_pageinfo(output_file)

    pdfimage = pdfinfo[0]['images'][0]

    if compression == 'jpeg':
        assert pdfimage['enc'] == 'jpeg'
    elif compression == 'lossless':
        assert pdfimage['enc'] == 'image'

    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check that the sidecar text has one form-feed-separated part per page.

    Runs ocrmypdf on multipage.pdf with ``--skip-text`` and ``--sidecar``,
    then compares the number of form feeds in the sidecar file with the
    page count that pdf_get_all_pageinfo reports for the input PDF.
    """
    # str() keeps this working whether the fixture yields a str or a path
    # object, matching how sibling tests treat outpdf
    sidecar = str(outpdf) + '.txt'
    check_ocrmypdf(
        resources / 'multipage.pdf', outpdf,
        '--skip-text',
        '--sidecar', sidecar,
        env=spoof_tesseract_cache)

    pdfinfo = pdf_get_all_pageinfo(str(resources / 'multipage.pdf'))
    num_pages = len(pdfinfo)

    # Read raw bytes: only form feeds are counted, so the result must not
    # depend on (or fail because of) the platform's default text encoding
    with open(sidecar, 'rb') as f:
        ocr_bytes = f.read()

    # There should be a formfeed between each pair of pages, so the count of
    # formfeeds is the page count less one
    assert ocr_bytes.count(b'\f') == num_pages - 1, \
        "Sidecar page count does not match PDF page count"

View File

@@ -107,3 +107,11 @@ def test_form_xobject(resources):
pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
pdfimage = pdfinfo[0]['images'][0]
assert pdfimage['width'] == 50
def test_no_contents(resources):
    """Check that the first page of no_contents.pdf reports no images and
    no text."""
    filename = resources / 'no_contents.pdf'
    pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
    first_page = pdfinfo[0]
    assert len(first_page['images']) == 0
    # Truthiness test instead of comparing against False with ==
    assert not first_page['has_text']

View File

@@ -6,11 +6,42 @@ from ocrmypdf.exceptions import ExitCode
from ocrmypdf.exec import tesseract
from ocrmypdf import pageinfo
import sys
import os
import PyPDF2 as pypdf
spoof = pytest.helpers.spoof
def tess4_possible_location():
    """Return where tesseract 4 may be found.

    The first non-empty value among the OCRMYPDF_TESS4 and
    OCRMYPDF_TESSERACT environment variables wins; if neither is set the
    plain command name 'tesseract' (resolved via PATH) is returned.
    """
    for variable in ('OCRMYPDF_TESS4', 'OCRMYPDF_TESSERACT'):
        candidate = os.environ.get(variable)
        if candidate:
            return candidate
    return 'tesseract'
@pytest.fixture
def ensure_tess4():
    """Fixture returning ``spoof()`` with ``tesseract`` set to the candidate
    tesseract 4 location; the tests in this file pass it as ``env``."""
    tess4_location = tess4_possible_location()
    return spoof(tesseract=tess4_location)
def tess4_available():
    """Check if a tesseract 4 binary is available, even if it's not the
    official "tesseract" on PATH.

    OCRMYPDF_TESSERACT is temporarily pointed at the candidate location
    while the version and textonly_pdf checks run, then put back to its
    previous state.

    Returns the result of ``tesseract.v4() and tesseract.has_textonly_pdf()``.
    """
    key = 'OCRMYPDF_TESSERACT'
    previous = os.environ.get(key)
    try:
        os.environ[key] = tess4_possible_location()
        return tesseract.v4() and tesseract.has_textonly_pdf()
    finally:
        # Restore the single key in place. Rebinding os.environ to a plain
        # dict copy would detach it from the real process environment and
        # leave the temporary override visible to child processes.
        if previous is None:
            os.environ.pop(key, None)
        else:
            os.environ[key] = previous
# Skip all tests in this file if not tesseract 4. tess4_available() also
# considers a binary named through the environment, so the check is not
# limited to the "tesseract" found on PATH.
pytestmark = pytest.mark.skipif(
    not tess4_available(),
    reason="tesseract 4.0 with textonly_pdf feature required")
check_ocrmypdf = pytest.helpers.check_ocrmypdf
@@ -18,14 +49,16 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof
def test_textonly_pdf(ensure_tess4, resources, outdir):
    """Run ocrmypdf on linn.pdf with the tess4 PDF renderer and a sidecar
    file, using the environment supplied by ``ensure_tess4``."""
    check_ocrmypdf(
        resources / 'linn.pdf',
        outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4',
        # Keep the sidecar inside the per-test output directory rather
        # than dropping a stray file in the current working directory
        '--sidecar', str(outdir / 'linn_textonly.txt'),
        env=ensure_tess4)
@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
def test_pagesize_consistency_tess4(resources, outpdf):
def test_pagesize_consistency_tess4(ensure_tess4, resources, outpdf):
from math import isclose
infile = resources / 'linn.pdf'
@@ -35,9 +68,47 @@ def test_pagesize_consistency_tess4(resources, outpdf):
check_ocrmypdf(
infile,
outpdf, '--pdf-renderer', 'tess4',
'--clean', '--deskew', '--remove-background', '--clean-final')
'--clean', '--deskew', '--remove-background', '--clean-final',
env=ensure_tess4)
after_dims = pytest.helpers.first_page_dimensions(outpdf)
assert isclose(before_dims[0], after_dims[0])
assert isclose(before_dims[1], after_dims[1])
@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(
        ensure_tess4, resources, basename, outdir):
    """Check the output of a tess4 --force-ocr run with a zero tesseract
    timeout: every output page must hold exactly one image and keep the
    width of the corresponding input page.

    NOTE(review): presumably the zero timeout makes OCR give up on every
    page so they are all "skipped" -- confirm.
    """
    infile = resources / basename
    outpdf = outdir / basename

    tess4_args = [
        '--pdf-renderer', 'tess4', '--force-ocr',
        '--tesseract-timeout', '0',
    ]
    check_ocrmypdf(infile, outpdf, *tess4_args, env=ensure_tess4)

    pages_before = pageinfo.pdf_get_all_pageinfo(str(infile))
    pages_after = pageinfo.pdf_get_all_pageinfo(str(outpdf))

    for page_after in pages_after:
        assert len(page_after['images']) == 1, "skipped page was replicated"

    for n, page_before in enumerate(pages_before):
        assert pages_after[n]['width_inches'] == page_before['width_inches']
def test_content_preservation(ensure_tess4, resources, outpdf):
    """Run the tess4 renderer on masks.pdf with a zero tesseract timeout
    and check that the first output page still holds more than one image."""
    infile = resources / 'masks.pdf'

    tess4_args = ['--pdf-renderer', 'tess4', '--tesseract-timeout', '0']
    check_ocrmypdf(infile, outpdf, *tess4_args, env=ensure_tess4)

    info = pageinfo.pdf_get_all_pageinfo(str(outpdf))
    page = info[0]
    image_count = len(page['images'])
    assert image_count > 1, "masked were rasterized"