mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 03:58:06 -04:00
Update develop with master changes
We’re well out of the “trivial updates” zone
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -6,6 +6,7 @@ pyvenv.cfg
|
||||
tasks.py
|
||||
.bash_history
|
||||
.ruffus_history.sqlite
|
||||
.idea/
|
||||
|
||||
# Package building
|
||||
*.egg-info/
|
||||
|
||||
63
.travis.yml
63
.travis.yml
@@ -7,6 +7,12 @@ cache:
|
||||
- tests/cache
|
||||
- $HOME/Library/Caches/Homebrew
|
||||
|
||||
env:
|
||||
global:
|
||||
- secure: "oyX5xesoHD7qcDXKIxMyGZPi+H/WxcvfFkaviEmq84K1DDyHk48+9e92IKgrw8/lcTADnEo/AgVKfnhCPflFimk1xTkgaK4sUg1WLI2YjmaHcwl5SlBHa2rN3uGBwy1hyP92qyv/mMc9R59NtRJ8u76lbn6eN9wi7lkFWdE6BTw=" # DOCKERHUB_OCRMYPDF_TOKEN
|
||||
- secure: "WlyII8YLsiUUyLtEA563GvEZmbneDb/T8q/P1uNbyQ2ps1U82tH0nSUV2CspSMxOFtZzPHCrRvnAmuTYKshBj+GNnBb1J9FKQmFwF+4NPeqsFdUkQ1NeeCmfIRShuNC3Otg2GGwj4Zssdg+QnVy43t2L11qizzfY+lY+MVzAYcM=" # DOCKERHUB_OCRMYPDF_TESS4_TOKEN
|
||||
- secure: "hsf6MT+n2x3OiDM2fQyJZdV0/PWYmv81LdVqC6cfnHBE/8N3DloJRqQ7WfO14TxhiK9PEC7MpyCj0lSabUHEO7gSH6Vks6I1asoSkt8S9/bSMlhT4hei+pwVpeGEiU5xHVATNjY+D919VC3IFvc3XmjT74h/2SLhaZ+jhEmDggM=" # HOMEBREW_OCRMYPDF_TOKEN
|
||||
|
||||
matrix:
|
||||
include:
|
||||
- os: linux
|
||||
@@ -24,9 +30,12 @@ matrix:
|
||||
before_cache:
|
||||
- rm -f $HOME/.cache/pip/log/debug.log
|
||||
|
||||
before_install:
|
||||
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then bash .travis/linux_before_install.sh ; fi
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then bash .travis/osx_before_install.sh ; fi
|
||||
before_install: |
|
||||
if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
|
||||
bash .travis/linux_before_install.sh
|
||||
elif [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
|
||||
bash .travis/osx_before_install.sh
|
||||
fi
|
||||
|
||||
install:
|
||||
- pip3 install .
|
||||
@@ -34,23 +43,53 @@ install:
|
||||
|
||||
script:
|
||||
- mv ocrmypdf dont_import_this_ocrmypdf
|
||||
- pytest
|
||||
- pytest -n auto
|
||||
- mv dont_import_this_ocrmypdf ocrmypdf
|
||||
|
||||
after_success:
|
||||
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then bash .travis/osx_brew.sh ; fi
|
||||
|
||||
# See https://www.appneta.com/blog/pypi-deployment-with-travis-ci/ for
|
||||
# steps to set up testpypi deploy for untagged builds if desired
|
||||
|
||||
deploy:
|
||||
provider: pypi
|
||||
# release for main pypi
|
||||
# 3.6 is considered the build leader and does the deploy, otherwise there is
|
||||
# a race and all versions will try to deploy
|
||||
# OTOH if we ever need separate binary wheels then each version needs its
|
||||
# own deploy
|
||||
- provider: pypi
|
||||
user: ocrmypdf-travis
|
||||
password:
|
||||
secure: DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo=
|
||||
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
|
||||
distributions: "sdist bdist_wheel"
|
||||
on:
|
||||
branch: master
|
||||
tags: true
|
||||
condition: $TRAVIS_PYTHON_VERSION == "3.6" && $TRAVIS_OS_NAME == "linux"
|
||||
skip_upload_docs: true
|
||||
|
||||
# test pypi
|
||||
- provider: pypi
|
||||
server: https://testpypi.python.org/pypi
|
||||
user: ocrmypdf-travis
|
||||
password:
|
||||
secure: "DTFOmmNL6olA0+yXvp4u9jXZlZeqrJsJ0526jzqf4a3gZ6jnGTq5UI6WzRsslSyoMMfXKtHQebqHM6ogSgCZinyZ3ufHJo8fn9brxbEc2gsiWkbj5o3bGwdWMT1vNNE7XW0VCpw87rZ1EEwjl4FJHFudMlPR1yfU5+uq0k0PACo="
|
||||
distributions: "sdist"
|
||||
on:
|
||||
branch: develop
|
||||
tags: false
|
||||
condition: $TRAVIS_OS_NAME == "osx"
|
||||
skip_upload_docs: true
|
||||
|
||||
# null deploy for osx
|
||||
# we really just want to run after_deploy *after* pypi upload is done, but
|
||||
# after_deploy only runs if a given box deployed
|
||||
- provider: script
|
||||
script: .travis/null_deploy.sh
|
||||
on:
|
||||
branch: master
|
||||
tags: false
|
||||
condition: $TRAVIS_OS_NAME == "osx"
|
||||
|
||||
after_deploy: |
|
||||
if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
|
||||
bash .travis/osx_brew.sh
|
||||
elif [[ "$TRAVIS_PYTHON_VERSION" == "3.6" && "$TRAVIS_OS_NAME" == "linux" ]]; then
|
||||
curl -H "Content-Type: application/json" --data '{"build": true}' -X POST https://registry.hub.docker.com/u/jbarlow83/ocrmypdf/trigger/$DOCKERHUB_OCRMYPDF_TOKEN/
|
||||
curl -H "Content-Type: application/json" --data '{"build": true}' -X POST https://registry.hub.docker.com/u/jbarlow83/ocrmypdf-tess4/trigger/$DOCKERHUB_OCRMYPDF_TESS4_TOKEN/
|
||||
fi
|
||||
@@ -13,7 +13,6 @@ class Ocrmypdf < Formula
|
||||
|
||||
depends_on :python3
|
||||
depends_on "pkg-config" => :build
|
||||
depends_on "zlib"
|
||||
depends_on "libffi"
|
||||
depends_on "tesseract"
|
||||
depends_on "ghostscript"
|
||||
|
||||
3
.travis/null_deploy.sh
Executable file
3
.travis/null_deploy.sh
Executable file
@@ -0,0 +1,3 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
exit 0
|
||||
@@ -4,5 +4,17 @@ set -x
|
||||
|
||||
pip3 install homebrew-pypi-poet
|
||||
python3 .travis/autobrew.py
|
||||
brew audit ocrmypdf.rb
|
||||
cat ocrmypdf.rb
|
||||
brew audit ocrmypdf.rb
|
||||
|
||||
# Important: disable debug output so token is hidden
|
||||
set +x
|
||||
git clone https://$HOMEBREW_OCRMYPDF_TOKEN@github.com/jbarlow83/homebrew-ocrmypdf.git
|
||||
set -x
|
||||
|
||||
pushd homebrew-ocrmypdf
|
||||
cp ../ocrmypdf.rb Formula/ocrmypdf.rb
|
||||
git add Formula/ocrmypdf.rb
|
||||
git commit -m "homebrew-ocrmypdf: automatic release $TRAVIS_BUILD_NUMBER $TRAVIS_TAG"
|
||||
git push origin master
|
||||
popd
|
||||
|
||||
@@ -35,7 +35,7 @@ Main features
|
||||
- Supports more than `100 languages <https://github.com/tesseract-ocr/tessdata>`_ recognized by Tesseract
|
||||
- Battle-tested on thousands of PDFs, a test suite and continuous integration
|
||||
|
||||
For details: please consult the `release notes <RELEASE_NOTES.rst>`_.
|
||||
For details: please consult the `documentation <https://ocrmypdf.readthedocs.io/en/latest/>`_.
|
||||
|
||||
Motivation
|
||||
----------
|
||||
|
||||
@@ -28,6 +28,13 @@ Add an OCR layer and output a standard PDF
|
||||
|
||||
ocrmypdf --output-type pdf input.pdf output.pdf
|
||||
|
||||
Create a PDF/A with all color and grayscale images converted to JPEG
|
||||
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
|
||||
|
||||
.. code-block:: bash
|
||||
|
||||
ocrmypdf --output-type pdfa --pdfa-image-compression jpeg input.pdf output.pdf
|
||||
|
||||
Modify a file in place
|
||||
""""""""""""""""""""""
|
||||
|
||||
|
||||
@@ -11,14 +11,18 @@ be searched.
|
||||
|
||||
PDFs are the best format for scanned documents. Unfortunately, PDFs can be difficult to work with. OCRmyPDF makes it easy to apply image processing and OCR to existing PDFs.
|
||||
|
||||
Contents:
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
:maxdepth: 1
|
||||
|
||||
introduction
|
||||
release_notes
|
||||
installation
|
||||
languages
|
||||
|
||||
.. toctree::
|
||||
:caption: Usage
|
||||
:maxdepth: 2
|
||||
|
||||
cookbook
|
||||
advanced
|
||||
batch
|
||||
@@ -26,7 +30,6 @@ Contents:
|
||||
errors
|
||||
|
||||
|
||||
|
||||
Indices and tables
|
||||
==================
|
||||
|
||||
|
||||
@@ -53,6 +53,20 @@ By default, OCRmyPDF will convert the file to a PDF/A. This behavior can be dis
|
||||
Depending on the settings selected, OCRmyPDF may "graft" the OCR layer into the existing PDF, or reconstruct a visually equivalent new PDF.
|
||||
|
||||
|
||||
Why you shouldn't do this manually
|
||||
----------------------------------
|
||||
|
||||
There are two routes to manually applying OCR to an existing PDF, both of which destroy information in the original PDF.
|
||||
|
||||
1. Rasterize each page as an image, OCR the images, and combine the output into a PDF. This preserves the appearance of each page, but resamples all images (possibly losing quality, increasing file size, introducing compression artifacts, etc.)
|
||||
|
||||
2. Extract each image, OCR, and combine the output into a PDF. This loses the context in which images are used in the PDF, meaning that cropping, rotation and scaling of pages may be lost. Some scanned PDFs use multiple images segmented into black and white, grayscale and color regions, with stencil masks to prevent overlap, as this can enhance the appearance of a file while reducing file size. Clearly, reassembling these images will not be easy. This also loses any text or vector art on any pages in a PDF with both scanned and pure digital content.
|
||||
|
||||
In the case of a PDF that is nothing other than a container of images (no rotation, scaling, cropping, one image per page), the second approach can be lossless.
|
||||
|
||||
OCRmyPDF uses several strategies depending on input options and the input PDF itself, but generally speaking it rasterizes a page for OCR and then grafts the OCR back onto the original. As such it can handle complex PDFs and still preserve their contents as much as possible.
|
||||
|
||||
|
||||
Limitations
|
||||
-----------
|
||||
|
||||
@@ -64,6 +78,7 @@ OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these l
|
||||
* If a document contains languages outside of those given in the ``-l LANG`` arguments, results may be poor.
|
||||
* It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns and join text across the columns.
|
||||
* Poor quality scans may produce poor quality OCR. Garbage in, garbage out.
|
||||
* PDFs that use transparent layers are not currently checked in the test suite, so they may not work correctly.
|
||||
|
||||
OCRmyPDF is also limited by the PDF specification:
|
||||
|
||||
@@ -72,5 +87,18 @@ OCRmyPDF is also limited by the PDF specification:
|
||||
Ghostscript also imposes some limitations:
|
||||
|
||||
* PDFs containing JBIG2-encoded content will be converted to CCITT Group4 encoding, which has lower compression ratios, if Ghostscript PDF/A is enabled.
|
||||
* PDFs containing JPEG 2000-encoded content will be converted to JPEG encoding, which may introduce compression artifacts, if Ghostscript PDF/A is enabled.
|
||||
* Ghostscript may transcode grayscale and color images, either lossy to lossless or lossless to lossy, based on an internal algorithm. This behavior can be suppressed by setting ``--pdfa-image-compression`` to ``jpeg`` or ``lossless`` to set all images to one type or the other. Ghostscript has no option to maintain the input image's format.
|
||||
|
||||
OCRmyPDF is currently not designed to be used as a Python API; it is designed to be run as a command line tool. ``import ocrmypdf`` currently attempts to process the command line on ``sys.argv`` at import time so it has side effects that will interfere with its use as a package. The API it presents should not be considered stable.
|
||||
OCRmyPDF is currently not designed to be used as a Python API; it is designed to be run as a command line tool. ``import ocrmypdf`` currently attempts to process the command line on ``sys.argv`` at import time so it has side effects that will interfere with its use as a package. The API it presents should not be considered stable.
|
||||
|
||||
|
||||
Similar programs
|
||||
----------------
|
||||
|
||||
To the author's knowledge, OCRmyPDF is the most feature-rich and thoroughly tested command line OCR PDF conversion tool. If it does not meet your needs, contributions and suggestions are welcome. Alternatively, consider one of these similar open source programs:
|
||||
|
||||
* pdf2pdfocr
|
||||
* pdfsandwich
|
||||
* pypdfocr
|
||||
* pdfbeads
|
||||
@@ -1,4 +1,4 @@
|
||||
RELEASE NOTES
|
||||
Release notes
|
||||
=============
|
||||
|
||||
OCRmyPDF uses `semantic versioning <http://semver.org/>`_.
|
||||
@@ -11,15 +11,39 @@ next
|
||||
- Remove OCRmyPDF.sh script
|
||||
|
||||
|
||||
v4.5.3
|
||||
======
|
||||
v4.5.6
|
||||
------
|
||||
|
||||
- Added a workaround for Ghostscript 9.21 and probably earlier versions, which would fail with the error message "VMerror -25", due to a Ghostscript bug in XMP metadata handling
|
||||
- Fixed issue #156, 'NoneType' object has no attribute 'getObject' on pages with no optional /Contents record. This should resolve all issues related to pages with no /Contents record.
|
||||
- Fixed issue #158, ocrmypdf now stops and terminates if Ghostscript fails on an intermediate step, as it is not possible to proceed.
|
||||
- Fixed issue #160, exception thrown on certain invalid arguments instead of error message
|
||||
|
||||
|
||||
v4.5.5
|
||||
------
|
||||
|
||||
- Automated update of macOS homebrew tap
|
||||
- Fixed issue #154, KeyError '/Contents' when searching for text on blank pages that have no /Contents record. Note: incomplete fix for this issue.
|
||||
|
||||
|
||||
v4.5.4
|
||||
------
|
||||
|
||||
- Fix ``--skip-big`` raising an exception if a page contains no images (#152) (thanks to @TomRaz)
|
||||
- Fix an issue where pages with no images might trigger "cannot write mode P as JPEG" (#151)
|
||||
|
||||
|
||||
v4.5.3
|
||||
------
|
||||
|
||||
- Added a workaround for Ghostscript 9.21 and probably earlier versions, which would fail with the error message "VMerror -25", due to a Ghostscript bug in XMP metadata handling
|
||||
- High Unicode characters (U+10000 and up) are no longer accepted for setting metadata on the command line, as Ghostscript may not handle them correctly.
|
||||
- Fixed an issue where the ``tess4`` renderer would duplicate content onto output pages if tesseract failed or timed out
|
||||
- Fixed ``tess4`` renderer not recognized when lossless reconstruction is possible
|
||||
|
||||
|
||||
v4.5.2
|
||||
======
|
||||
------
|
||||
|
||||
- Fix issue #147. ``--pdf-renderer tess4 --clean`` will produce an oversized page containing the original image in the bottom left corner, due to loss of DPI information.
|
||||
- Make "using Tesseract 4.0" warning less ominous
|
||||
@@ -27,13 +51,13 @@ v4.5.2
|
||||
|
||||
|
||||
v4.5.1
|
||||
======
|
||||
------
|
||||
|
||||
- Fix issue #137, proportions of images with a non-square pixel aspect ratio would be distorted in output for ``--force-ocr`` and some other combinations of flags
|
||||
|
||||
|
||||
v4.5
|
||||
====
|
||||
----
|
||||
|
||||
- Exotic PDFs containing "Form XObjects" are now supported (issue #134; PDF reference manual 8.10), and images they contain are taken into account when determining the resolution for rasterizing
|
||||
- The Tesseract 4 Docker image no longer includes all languages, because it took so long to build something would tend to fail
|
||||
@@ -41,7 +65,7 @@ v4.5
|
||||
|
||||
|
||||
v4.4.2
|
||||
======
|
||||
------
|
||||
|
||||
- The Docker images (ocrmypdf, ocrmypdf-polyglot, ocrmypdf-tess4) are now based on Ubuntu 16.10 instead of Debian stretch
|
||||
|
||||
@@ -52,7 +76,7 @@ v4.4.2
|
||||
|
||||
|
||||
v4.4.1
|
||||
======
|
||||
------
|
||||
|
||||
- To prevent a `TIFF output error <https://github.com/python-pillow/Pillow/issues/2206>`_ caused by img2pdf >= 0.2.1 and Pillow <= 3.4.2, dependencies have been tightened
|
||||
- The Tesseract 4.00 simultaneous process limit was increased from 1 to 2, since it was observed that 1 lowers performance
|
||||
@@ -61,7 +85,7 @@ v4.4.1
|
||||
- Tweaks to setup.py to deal with issues in the v4.4 release
|
||||
|
||||
v4.4
|
||||
====
|
||||
----
|
||||
|
||||
- Tesseract 4.00 is now supported on an experimental basis.
|
||||
|
||||
@@ -79,32 +103,32 @@ v4.4
|
||||
|
||||
|
||||
v4.3.5
|
||||
======
|
||||
------
|
||||
|
||||
- Update documentation to confirm Python 3.6.0 compatibility. No code changes were needed, so many earlier versions are likely supported.
|
||||
|
||||
|
||||
v4.3.4
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed "decimal.InvalidOperation: quantize result has too many digits" for high DPI images
|
||||
|
||||
|
||||
v4.3.3
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed PDF/A creation with Ghostscript 9.20 properly
|
||||
- Fixed an exception on inline stencil masks with a missing optional parameter
|
||||
|
||||
|
||||
v4.3.2
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed a PDF/A creation issue with Ghostscript 9.20 (note: this fix did not actually work)
|
||||
|
||||
|
||||
v4.3.1
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed an issue where pages produced by the "hocr" renderer after a Tesseract timeout would be rotated incorrectly if the input page was rotated with a /Rotate marker
|
||||
- Fixed a file handle leak in LeptonicaErrorTrap that would cause a "too many open files" error for files around a hundred pages long when ``--deskew`` or ``--remove-background`` or other Leptonica based image processing features were in use, depending on the system value of ``ulimit -n``
|
||||
@@ -115,7 +139,7 @@ v4.3.1
|
||||
|
||||
|
||||
v4.3
|
||||
====
|
||||
----
|
||||
|
||||
- New feature ``--remove-background`` to detect and erase the background of color and grayscale images
|
||||
- Better documentation
|
||||
@@ -126,20 +150,20 @@ v4.3
|
||||
+ Some output validation is disabled in this mode
|
||||
|
||||
v4.2.5
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed an issue (#100) with PDFs that omit the optional /BitsPerComponent parameter on images
|
||||
- Removed non-free file milk.pdf
|
||||
|
||||
|
||||
v4.2.4
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed an error (#90) caused by PDFs that use stencil masks properly
|
||||
- Fixed handling of PDFs that try to draw images or stencil masks without properly setting up the graphics state (such images are now ignored for the purposes of calculating DPI)
|
||||
|
||||
v4.2.3
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed an issue with PDFs that store page rotation (/Rotate) in an indirect object
|
||||
- Integrated a few fixes to simplify downstream packaging (Debian)
|
||||
@@ -153,20 +177,20 @@ v4.2.3
|
||||
|
||||
|
||||
v4.2.2
|
||||
======
|
||||
------
|
||||
|
||||
- Improvements to documentation
|
||||
|
||||
|
||||
v4.2.1
|
||||
======
|
||||
------
|
||||
|
||||
- Fixed an issue where PDF pages that contained stencil masks would report an incorrect DPI and cause Ghostscript to abort
|
||||
- Implemented stdin streaming
|
||||
|
||||
|
||||
v4.2
|
||||
====
|
||||
----
|
||||
|
||||
- ocrmypdf will now try to convert single image files to PDFs if they are provided as input (#15)
|
||||
|
||||
@@ -199,13 +223,13 @@ v4.2
|
||||
- Ghostscript now runs in "safer" mode where possible
|
||||
|
||||
v4.1.4
|
||||
======
|
||||
------
|
||||
|
||||
- Bug fix: monochrome images with an ICC profile attached were incorrectly converted to full color images if lossless reconstruction was not possible due to other settings; consequence was increased file size for these images
|
||||
|
||||
|
||||
v4.1.3
|
||||
======
|
||||
------
|
||||
|
||||
- More helpful error message for PDFs with version 4 security handler
|
||||
- Update usage instructions for Windows/Docker users
|
||||
@@ -214,14 +238,14 @@ v4.1.3
|
||||
|
||||
|
||||
v4.1.2
|
||||
======
|
||||
------
|
||||
|
||||
- Replace IEC sRGB ICC profile with Debian's sRGB (from icc-profiles-free) which is more compatible with the MIT license
|
||||
- More helpful error message for an error related to certain types of malformed PDFs
|
||||
|
||||
|
||||
v4.1
|
||||
====
|
||||
----
|
||||
|
||||
- ``--rotate-pages`` now only rotates pages when reasonably confident in the orientation. This behavior can be adjusted with the new argument ``--rotate-pages-threshold``
|
||||
- Fixed problems in error checking if ``unpaper`` is uninstalled or missing at run-time
|
||||
@@ -229,20 +253,20 @@ v4.1
|
||||
|
||||
|
||||
v4.0.7
|
||||
======
|
||||
------
|
||||
|
||||
- Minor correction to Ghostscript output settings
|
||||
|
||||
|
||||
v4.0.6
|
||||
======
|
||||
------
|
||||
|
||||
- Update install instructions
|
||||
- Provide a sRGB profile instead of using Ghostscript's
|
||||
|
||||
|
||||
v4.0.5
|
||||
======
|
||||
------
|
||||
|
||||
- Remove some verbose debug messages from v4.0.4
|
||||
- Fixed temporary that wasn't being deleted
|
||||
@@ -250,22 +274,22 @@ v4.0.5
|
||||
- Inline images are now checked during DPI calculation instead of rejecting the image
|
||||
|
||||
v4.0.4
|
||||
======
|
||||
------
|
||||
|
||||
Released with verbose debug message turned on. Do not use. Skip to v4.0.5.
|
||||
|
||||
|
||||
v4.0.3
|
||||
======
|
||||
------
|
||||
|
||||
New features
|
||||
------------
|
||||
^^^^^^^^^^^^
|
||||
|
||||
- Page orientations detected are now reported in a summary comment
|
||||
|
||||
|
||||
Fixes
|
||||
-----
|
||||
^^^^^
|
||||
|
||||
- Show stack trace if unexpected errors occur
|
||||
- Treat "too few characters" error message from Tesseract as a reason to skip that page rather than
|
||||
@@ -274,10 +298,10 @@ Fixes
|
||||
|
||||
|
||||
v4.0.2
|
||||
======
|
||||
------
|
||||
|
||||
Fixes
|
||||
-----
|
||||
^^^^^
|
||||
|
||||
- Fixed compatibility with Tesseract 3.04.01 release, particularly its different way of outputting
|
||||
orientation information
|
||||
@@ -286,19 +310,19 @@ Fixes
|
||||
|
||||
|
||||
v4.0.1
|
||||
======
|
||||
------
|
||||
|
||||
Fixes
|
||||
-----
|
||||
^^^^^
|
||||
|
||||
- Fixed a KeyError if tesseract fails to find page orientation information
|
||||
|
||||
|
||||
v4.0
|
||||
====
|
||||
----
|
||||
|
||||
New features
|
||||
------------
|
||||
^^^^^^^^^^^^
|
||||
|
||||
- Automatic page rotation (``-r``) is now available. It ignores any prior rotation information
|
||||
on PDFs and sets rotation based on the dominant orientation of detectable text. This feature is
|
||||
@@ -308,7 +332,7 @@ New features
|
||||
|
||||
|
||||
Fixes
|
||||
-----
|
||||
^^^^^
|
||||
|
||||
- Fixed an issue where lossless reconstruction could cause some pages to appear incorrectly
|
||||
if the page was rotated by the user in Acrobat after being scanned (specifically if it has a /Rotate tag)
|
||||
@@ -317,7 +341,7 @@ Fixes
|
||||
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- Logging output is now much easier to read
|
||||
- ``--deskew`` is now performed by Leptonica instead of unpaper (#25)
|
||||
@@ -330,20 +354,20 @@ Changes
|
||||
|
||||
|
||||
v3.2.1
|
||||
======
|
||||
------
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- Fixed issue #47 "convert() got an unexpected keyword argument 'dpi'" by upgrading to img2pdf 0.2
|
||||
- Tweaked the Dockerfiles
|
||||
|
||||
|
||||
v3.2
|
||||
====
|
||||
----
|
||||
|
||||
New features
|
||||
------------
|
||||
^^^^^^^^^^^^
|
||||
|
||||
- Lossless reconstruction: when possible, OCRmyPDF will inject text layers without
|
||||
otherwise manipulating the content and layout of a PDF page. For example, a PDF containing a mix
|
||||
@@ -355,25 +379,25 @@ New features
|
||||
for the polyglots among us. It is much larger.
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- JPEG transcoding quality is now 95 instead of the default 75. Bigger file sizes for less degradation.
|
||||
|
||||
|
||||
|
||||
v3.1.1
|
||||
======
|
||||
------
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- Fixed bug that caused incorrect page size and DPI calculations on documents with mixed page sizes
|
||||
|
||||
v3.1
|
||||
====
|
||||
----
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- Default output format is now PDF/A-2b instead of PDF/A-1b
|
||||
- Python 3.5 and macOS El Capitan are now supported platforms - no changes were
|
||||
@@ -388,10 +412,10 @@ Changes
|
||||
- Set up Travis CI automatic integration testing
|
||||
|
||||
v3.0
|
||||
====
|
||||
----
|
||||
|
||||
New features
|
||||
------------
|
||||
^^^^^^^^^^^^
|
||||
|
||||
- Easier installation with a Docker container or Python's ``pip`` package manager
|
||||
- Eliminated many external dependencies, so it's easier to setup
|
||||
@@ -414,7 +438,7 @@ New features
|
||||
- Multiple images on the same PDF page are now supported
|
||||
|
||||
Changes
|
||||
-------
|
||||
^^^^^^^
|
||||
|
||||
- New, robust rewrite in Python 3.4+ with ruffus_ pipelines
|
||||
- Now uses Ghostscript 9.14's improved color conversion model to preserve PDF colors
|
||||
@@ -452,7 +476,7 @@ Changes
|
||||
.. _JHOVE: http://jhove.sourceforge.net/
|
||||
|
||||
Release candidates
|
||||
------------------
|
||||
^^^^^^^^^^^^^^^^^^
|
||||
|
||||
- rc9:
|
||||
|
||||
@@ -520,12 +544,12 @@ where ``settings.txt`` contains *one argument per line*, for example:
|
||||
|
||||
|
||||
Fixes
|
||||
-----
|
||||
^^^^^
|
||||
|
||||
- Handling of filenames containing spaces: fixed
|
||||
|
||||
Notes and known issues
|
||||
----------------------
|
||||
^^^^^^^^^^^^^^^^^^^^^^
|
||||
|
||||
- Some dependencies may work with lower versions than tested, so try
|
||||
overriding dependencies if they are "in the way" to see if they work.
|
||||
@@ -538,7 +562,7 @@ Notes and known issues
|
||||
|
||||
|
||||
v2.2-stable (2014-09-29)
|
||||
========================
|
||||
------------------------
|
||||
|
||||
OCRmyPDF versions 1 and 2 were implemented as shell scripts. OCRmyPDF 3.0+ is a fork that gradually replaced all shell scripts with Python while maintaining the existing command line arguments. No one is maintaining old versions.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
PDF Security Issues
|
||||
PDF security issues
|
||||
===================
|
||||
|
||||
OCRmyPDF should only be used on PDFs you trust. It is not designed to protect you against malware.
|
||||
|
||||
@@ -46,6 +46,10 @@ def complain(message):
|
||||
print(*textwrap.wrap(message), file=sys.stderr)
|
||||
|
||||
|
||||
# Hack to help debugger context find /usr/local/bin
|
||||
if 'IDE_PROJECT_ROOTS' in os.environ:
|
||||
os.environ['PATH'] = '/usr/local/bin:' + os.environ['PATH']
|
||||
|
||||
if tesseract.version() < MINIMUM_TESS_VERSION:
|
||||
complain(
|
||||
"Please install tesseract {0} or newer "
|
||||
@@ -56,14 +60,9 @@ if tesseract.version() < MINIMUM_TESS_VERSION:
|
||||
# -------------
|
||||
# Parser
|
||||
|
||||
parser = cmdline.get_argparse(
|
||||
parser = argparse.ArgumentParser(
|
||||
prog=PROGRAM_NAME,
|
||||
version=VERSION,
|
||||
fromfile_prefix_chars='@',
|
||||
ignored_args=[
|
||||
'touch_files_only', 'recreate_database', 'checksum_file_name',
|
||||
'key_legend_in_graph', 'draw_graph_horizontally', 'flowchart_format',
|
||||
'forced_tasks', 'target_tasks', 'use_threads', 'jobs', 'log_file'],
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description="""\
|
||||
Generates a searchable PDF or PDF/A from a regular PDF.
|
||||
@@ -115,24 +114,22 @@ Online documentation is located at:
|
||||
""")
|
||||
|
||||
parser.add_argument(
|
||||
'input_file',
|
||||
'input_file', metavar="input_pdf_or_image",
|
||||
help="PDF file containing the images to be OCRed (or '-' to read from "
|
||||
"standard input)")
|
||||
parser.add_argument(
|
||||
'output_file',
|
||||
help="output searchable PDF file (or '-' to write to standard output)")
|
||||
'output_file', metavar="output_pdf",
|
||||
help="Output searchable PDF file (or '-' to write to standard output). "
|
||||
"Existing files will be ovewritten. If same as input file, the "
|
||||
"input file will be updated only if processing is successful.")
|
||||
parser.add_argument(
|
||||
'-l', '--language', action='append',
|
||||
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
|
||||
"all language packs installed in your system). To specify multiple "
|
||||
"languages, join them with '+' or issue this argument once for each "
|
||||
"language.")
|
||||
parser.add_argument(
|
||||
'-j', '--jobs', metavar='N', type=int,
|
||||
help="Use up to N CPU cores simultaneously (default: use all)")
|
||||
"all language packs installed in your system). Use -l eng+deu for "
|
||||
"multiple languages.")
|
||||
parser.add_argument(
|
||||
'--image-dpi', metavar='DPI', type=int,
|
||||
help="for input image instead of PDF, use this DPI instead of file's")
|
||||
help="For input image instead of PDF, use this DPI instead of file's.")
|
||||
parser.add_argument(
|
||||
'--output-type', choices=['pdfa', 'pdf'], default='pdfa',
|
||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||
@@ -141,47 +138,74 @@ parser.add_argument(
|
||||
"also has problems with full Unicode text. 'pdf' attempts to "
|
||||
"preserve file contents as much as possible.")
|
||||
|
||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||
# since that is the only invalid character for filepaths on all platforms
|
||||
# bool('\0') is True in Python
|
||||
parser.add_argument(
|
||||
'--sidecar', nargs='?', const='\0', default=None, metavar='FILE',
|
||||
help="Generate sidecar text files that contain the same text recognized "
|
||||
"by Tesseract. This may be useful for building a OCR text database. "
|
||||
"If FILE is omitted, the sidecar file be named {output_file}.txt "
|
||||
"If FILE is set to '-', the sidecar is written to stdout (a "
|
||||
"convenient way to preview OCR quality). The output file and sidecar "
|
||||
"may not both use stdout at the same time.")
|
||||
|
||||
parser.add_argument(
|
||||
'--version', action='version', version=VERSION,
|
||||
help="Print program version and exit")
|
||||
|
||||
jobcontrol = parser.add_argument_group(
|
||||
"Job control options")
|
||||
jobcontrol.add_argument(
|
||||
'-j', '--jobs', metavar='N', type=int,
|
||||
help="Use up to N CPU cores simultaneously (default: use all).")
|
||||
jobcontrol.add_argument(
|
||||
'-q', '--quiet', action='store_true', help="Suppress INFO messages")
|
||||
jobcontrol.add_argument(
|
||||
'-v', '--verbose', const="+", default=[], nargs='?', action="append",
|
||||
help="Print more verbose messages for each additional verbose level")
|
||||
|
||||
metadata = parser.add_argument_group(
|
||||
"Metadata options",
|
||||
"Set output PDF/A metadata (default: use input document's metadata)")
|
||||
"Set output PDF/A metadata (default: copy input document's metadata)")
|
||||
metadata.add_argument(
|
||||
'--title', type=str,
|
||||
help="set document title (place multiple words in quotes)")
|
||||
help="Set document title (place multiple words in quotes)")
|
||||
metadata.add_argument(
|
||||
'--author', type=str,
|
||||
help="set document author")
|
||||
help="Set document author")
|
||||
metadata.add_argument(
|
||||
'--subject', type=str,
|
||||
help="set document subject description")
|
||||
help="Set document subject description")
|
||||
metadata.add_argument(
|
||||
'--keywords', type=str,
|
||||
help="set document keywords")
|
||||
help="Set document keywords")
|
||||
|
||||
preprocessing = parser.add_argument_group(
|
||||
"Image preprocessing options",
|
||||
"Options to improve the quality of the final PDF and OCR")
|
||||
preprocessing.add_argument(
|
||||
'-r', '--rotate-pages', action='store_true',
|
||||
help="automatically rotate pages based on detected text orientation")
|
||||
help="Automatically rotate pages based on detected text orientation")
|
||||
preprocessing.add_argument(
|
||||
'--remove-background', action='store_true',
|
||||
help="attempt to remove background from gray or color pages, setting it "
|
||||
help="Attempt to remove background from gray or color pages, setting it "
|
||||
"to white ")
|
||||
preprocessing.add_argument(
|
||||
'-d', '--deskew', action='store_true',
|
||||
help="deskew each page before performing OCR")
|
||||
help="Deskew each page before performing OCR")
|
||||
preprocessing.add_argument(
|
||||
'-c', '--clean', action='store_true',
|
||||
help="clean pages from scanning artifacts before performing OCR, and send "
|
||||
help="Clean pages from scanning artifacts before performing OCR, and send "
|
||||
"the cleaned page to OCR, but do not include the cleaned page in "
|
||||
"the output ")
|
||||
"the output")
|
||||
preprocessing.add_argument(
|
||||
'-i', '--clean-final', action='store_true',
|
||||
help="clean page as above, and incorporate the cleaned image in the final "
|
||||
"PDF")
|
||||
help="Clean page as above, and incorporate the cleaned image in the final "
|
||||
"PDF. Might remove desired content.")
|
||||
preprocessing.add_argument(
|
||||
'--oversample', metavar='DPI', type=int, default=0,
|
||||
help="oversample images to at least the specified DPI, to improve OCR "
|
||||
help="Oversample images to at least the specified DPI, to improve OCR "
|
||||
"results slightly")
|
||||
|
||||
ocrsettings = parser.add_argument_group(
|
||||
@@ -189,11 +213,11 @@ ocrsettings = parser.add_argument_group(
|
||||
"Control how OCR is applied")
|
||||
ocrsettings.add_argument(
|
||||
'-f', '--force-ocr', action='store_true',
|
||||
help="rasterize any fonts or vector objects on each page, apply OCR, and "
|
||||
help="Rasterize any fonts or vector objects on each page, apply OCR, and "
|
||||
"save the rastered output (this rewrites the PDF)")
|
||||
ocrsettings.add_argument(
|
||||
'-s', '--skip-text', action='store_true',
|
||||
help="skip OCR on any pages that already contain text, but include the "
|
||||
help="Skip OCR on any pages that already contain text, but include the "
|
||||
"page in final output; useful for PDFs that contain a mix of "
|
||||
"images, text pages, and/or previously OCRed pages")
|
||||
ocrsettings.add_argument(
|
||||
@@ -203,23 +227,23 @@ ocrsettings.add_argument(
|
||||
|
||||
ocrsettings.add_argument(
|
||||
'--skip-big', type=float, metavar='MPixels',
|
||||
help="skip OCR on pages larger than the specified amount of megapixels, "
|
||||
help="Skip OCR on pages larger than the specified amount of megapixels, "
|
||||
"but include skipped pages in final output")
|
||||
|
||||
advanced = parser.add_argument_group(
|
||||
"Advanced",
|
||||
"Advanced options for power users")
|
||||
"Advanced options to control Tesseract's OCR behavior")
|
||||
advanced.add_argument(
|
||||
'--tesseract-config', action='append', metavar='CFG', default=[],
|
||||
help="additional Tesseract configuration files -- see documentation")
|
||||
help="Additional Tesseract configuration files -- see documentation")
|
||||
advanced.add_argument(
|
||||
'--tesseract-pagesegmode', action='store', type=int, metavar='PSM',
|
||||
choices=range(0, 14),
|
||||
help="set Tesseract page segmentation mode (see tesseract --help)")
|
||||
help="Set Tesseract page segmentation mode (see tesseract --help)")
|
||||
advanced.add_argument(
|
||||
'--tesseract-oem', action='store', type=int, metavar='MODE',
|
||||
choices=range(0, 4),
|
||||
help=("set Tesseract 4.0 OCR engine mode: "
|
||||
help=("Set Tesseract 4.0 OCR engine mode: "
|
||||
"0 - original Tesseract only; "
|
||||
"1 - neural nets LSTM only; "
|
||||
"2 - Tesseract + LSTM; "
|
||||
@@ -227,7 +251,7 @@ advanced.add_argument(
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdf-renderer', choices=['auto', 'tesseract', 'hocr', 'tess4'], default='auto',
|
||||
help="choose OCR PDF renderer - the default option is to let OCRmyPDF "
|
||||
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
|
||||
"choose. The 'tesseract' PDF renderer is more accurate and does a "
|
||||
"better job and document structure such as recognizing columns. It "
|
||||
"also does a better job on non-Latin languages. However, it does "
|
||||
@@ -237,22 +261,36 @@ advanced.add_argument(
|
||||
"to 'tesseract', requires tesseract 4, and gives superior results.")
|
||||
advanced.add_argument(
|
||||
'--tesseract-timeout', default=180.0, type=float, metavar='SECONDS',
|
||||
help='give up on OCR after the timeout, but copy the preprocessed page '
|
||||
help='Give up on OCR after the timeout, but copy the preprocessed page '
|
||||
'into the final output')
|
||||
advanced.add_argument(
|
||||
'--rotate-pages-threshold', default=14.0, type=float, metavar='CONFIDENCE',
|
||||
help="only rotate pages when confidence is above this value (arbitrary "
|
||||
help="Only rotate pages when confidence is above this value (arbitrary "
|
||||
"units reported by tesseract)")
|
||||
advanced.add_argument(
|
||||
'--pdfa-image-compression', choices=['auto', 'jpeg', 'lossless'],
|
||||
default='auto',
|
||||
help="Specify how to compress images in the output PDF/A. 'auto' lets "
|
||||
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
|
||||
"JPEG compression. 'lossless' uses PNG-style lossless compression "
|
||||
"for all images. Monochrome images are always compressed using a "
|
||||
"lossless codec. Compression settings "
|
||||
"are applied to all pages, including those for which OCR was "
|
||||
"skipped. Not supported for --output-type=pdf ; that setting "
|
||||
"preserves the original compression of all images.")
|
||||
|
||||
debugging = parser.add_argument_group(
|
||||
"Debugging",
|
||||
"Arguments to help with troubleshooting and debugging")
|
||||
debugging.add_argument(
|
||||
'-k', '--keep-temporary-files', action='store_true',
|
||||
help="keep temporary files (helpful for debugging)")
|
||||
help="Keep temporary files (helpful for debugging)")
|
||||
debugging.add_argument(
|
||||
'-g', '--debug-rendering', action='store_true',
|
||||
help="render each page twice with debug information on second page")
|
||||
help="Render each page twice with debug information on second page")
|
||||
debugging.add_argument(
|
||||
'--flowchart', type=str,
|
||||
help="Generate the pipeline execution flowchart")
|
||||
|
||||
|
||||
def check_options_languages(options, _log):
|
||||
@@ -269,7 +307,7 @@ def check_options_languages(options, _log):
|
||||
"data for the following requested languages: \n")
|
||||
for lang in (set(options.language) - tesseract.languages()):
|
||||
msg += lang + '\n'
|
||||
raise argparse.ArgumentError(msg)
|
||||
raise argparse.ArgumentError(None, msg)
|
||||
|
||||
|
||||
def check_options_output(options, log):
|
||||
@@ -295,13 +333,23 @@ def check_options_output(options, log):
|
||||
"--pdf-renderer=tesseract.")
|
||||
|
||||
lossless_reconstruction = False
|
||||
if options.pdf_renderer == 'hocr':
|
||||
if options.pdf_renderer in ('hocr', 'tess4'):
|
||||
if not any((options.deskew, options.clean_final, options.force_ocr,
|
||||
options.remove_background)):
|
||||
lossless_reconstruction = True
|
||||
options.lossless_reconstruction = lossless_reconstruction
|
||||
|
||||
|
||||
def check_options_sidecar(options, log):
    """Resolve the sidecar text filename when only a placeholder was given.

    When ``options.sidecar`` holds the ``'\\0'`` placeholder, it is replaced
    in place by ``options.output_file + '.txt'``. Any other value is left
    untouched.

    NOTE(review): ``'\\0'`` is presumably the value argparse stores when
    ``--sidecar`` is passed without a filename -- confirm against the
    argument definition.

    Args:
        options: parsed options namespace; ``sidecar`` and ``output_file``
            are read, and ``sidecar`` may be rewritten.
        log: unused here; accepted so the signature matches the other
            ``check_options_*`` functions.

    Raises:
        argparse.ArgumentError: the placeholder was given but the output
            file is ``'-'`` (stdout), so no filename can be derived.
    """
    if options.sidecar != '\0':
        return

    if options.output_file == '-':
        # There is no output path to derive a sidecar name from.
        raise argparse.ArgumentError(
            None,
            "--sidecar filename must be specified when output file is "
            "stdout.")

    options.sidecar = options.output_file + '.txt'
|
||||
|
||||
|
||||
def check_options_preprocessing(options, log):
|
||||
if any((options.clean, options.clean_final)):
|
||||
from .exec import unpaper
|
||||
@@ -325,6 +373,7 @@ def check_options_preprocessing(options, log):
|
||||
def check_options_ocr_behavior(options, log):
|
||||
if options.force_ocr and options.skip_text:
|
||||
raise argparse.ArgumentError(
|
||||
None,
|
||||
"Error: --force-ocr and --skip-text are mutually incompatible.")
|
||||
|
||||
if options.redo_ocr and (options.skip_text or options.force_ocr):
|
||||
@@ -350,6 +399,12 @@ def check_options_advanced(options, log):
|
||||
raise MissingDependencyError(
|
||||
"--pdf-renderer tess4 requires Tesseract 4.x "
|
||||
"commit 3d9fb3b or later")
|
||||
if options.pdfa_image_compression != 'auto' and \
|
||||
options.output_type != 'pdfa':
|
||||
log.warning(
|
||||
"--pdfa-image-compression argument has no effect when "
|
||||
"--output-type is not 'pdfa'"
|
||||
)
|
||||
|
||||
|
||||
def check_options_metadata(options, log):
|
||||
@@ -371,6 +426,7 @@ def check_options(options, log):
|
||||
check_options_languages(options, log)
|
||||
check_options_metadata(options, log)
|
||||
check_options_output(options, log)
|
||||
check_options_sidecar(options, log)
|
||||
check_options_preprocessing(options, log)
|
||||
check_options_ocr_behavior(options, log)
|
||||
check_options_advanced(options, log)
|
||||
@@ -389,8 +445,9 @@ def check_options(options, log):
|
||||
# Logging
|
||||
|
||||
|
||||
def logging_factory(logger_name, listargs):
|
||||
log_file_name, verbose = listargs
|
||||
def logging_factory(logger_name, logger_args):
|
||||
verbose = logger_args['verbose']
|
||||
quiet = logger_args['quiet']
|
||||
|
||||
root_logger = logging.getLogger(logger_name)
|
||||
root_logger.setLevel(logging.DEBUG)
|
||||
@@ -400,6 +457,8 @@ def logging_factory(logger_name, listargs):
|
||||
handler.setFormatter(formatter_)
|
||||
if verbose:
|
||||
handler.setLevel(logging.DEBUG)
|
||||
elif quiet:
|
||||
handler.setLevel(logging.WARNING)
|
||||
else:
|
||||
handler.setLevel(logging.INFO)
|
||||
root_logger.addHandler(handler)
|
||||
@@ -425,7 +484,7 @@ def available_cpu_count():
|
||||
|
||||
|
||||
def cleanup_ruffus_error_message(msg):
    """Flatten an error message onto a single, trimmed line.

    Every run of whitespace (including newlines) is collapsed to one space,
    each ``(...)`` group is replaced by its contents, and leading/trailing
    whitespace is stripped.

    Args:
        msg: the raw message text.

    Returns:
        The cleaned-up message string.
    """
    # Flags must not be passed positionally to re.sub: the fourth positional
    # parameter is ``count``, so re.MULTILINE there would cap the number of
    # replacements instead of acting as a flag. ``\s`` already matches
    # newlines, so no flag is needed.
    msg = re.sub(r'\s+', r' ', msg)
    # Drop the parentheses but keep what they enclose.
    msg = re.sub(r"\((.+?)\)", r'\1', msg)
    return msg.strip()
|
||||
@@ -559,8 +618,10 @@ def run_pipeline():
|
||||
if not check_closed_streams(options):
|
||||
return ExitCode.bad_args
|
||||
|
||||
logger_args = {'verbose': options.verbose, 'quiet': options.quiet}
|
||||
|
||||
_log, _log_mutex = proxy_logger.make_shared_logger_and_proxy(
|
||||
logging_factory, __name__, [None, options.verbose])
|
||||
logging_factory, __name__, logger_args)
|
||||
_log.debug('ocrmypdf ' + VERSION)
|
||||
_log.debug('tesseract ' + tesseract.version())
|
||||
|
||||
@@ -600,7 +661,7 @@ def run_pipeline():
|
||||
return ExitCode.bad_args
|
||||
elif not is_file_writable(options.output_file):
|
||||
_log.error(textwrap.dedent("""\
|
||||
Cutput file location is not writable."""))
|
||||
Output file location is not writable."""))
|
||||
return ExitCode.file_access_error
|
||||
|
||||
manager = JobContextManager()
|
||||
@@ -646,7 +707,9 @@ def run_pipeline():
|
||||
_log.error(e)
|
||||
return ExitCode.other_error
|
||||
|
||||
if options.output_file != '-':
|
||||
if options.flowchart:
|
||||
_log.info("Flowchart saved to {}".format(options.flowchart))
|
||||
elif options.output_file != '-':
|
||||
if options.output_type == 'pdfa':
|
||||
pdfa_info = file_claims_pdfa(options.output_file)
|
||||
if pdfa_info['pass']:
|
||||
|
||||
@@ -48,7 +48,7 @@ class InputFileError(ExitCodeException):
|
||||
|
||||
|
||||
class SubprocessOutputError(ExitCodeException):
|
||||
exit_code = ExitCode.other_error
|
||||
exit_code = ExitCode.child_process_error
|
||||
|
||||
|
||||
class EncryptedPdfError(ExitCodeException):
|
||||
|
||||
@@ -5,8 +5,9 @@ from tempfile import NamedTemporaryFile
|
||||
from subprocess import run, PIPE, STDOUT, CalledProcessError
|
||||
from shutil import copy
|
||||
from functools import lru_cache
|
||||
import re
|
||||
from . import get_program
|
||||
from ..pdfa import SRGB_ICC_PROFILE
|
||||
from ..exceptions import SubprocessOutputError
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
@@ -27,6 +28,10 @@ def version():
|
||||
return version.strip()
|
||||
|
||||
|
||||
def _gs_error_reported(stream):
    """Tell whether captured output mentions the word "error".

    Args:
        stream: text to scan (callers in this module pass the captured
            Ghostscript output).

    Returns:
        The ``re`` match object for the first case-insensitive occurrence of
        ``error``, or ``None`` when there is none. Callers only rely on its
        truthiness.
    """
    return re.search(r'error', stream, flags=re.IGNORECASE)
|
||||
|
||||
|
||||
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
pageno=1):
|
||||
with NamedTemporaryFile(delete=True) as tmp:
|
||||
@@ -46,7 +51,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
|
||||
p = run(args_gs, stdout=PIPE, stderr=STDOUT,
|
||||
universal_newlines=True)
|
||||
if 'error' in p.stdout.lower():
|
||||
if _gs_error_reported(p.stdout):
|
||||
log.error(p.stdout)
|
||||
else:
|
||||
log.debug(p.stdout)
|
||||
@@ -54,10 +59,32 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
if p.returncode == 0:
|
||||
copy(tmp.name, output_file)
|
||||
else:
|
||||
log.error('Ghostscript rendering failed')
|
||||
log.error('Ghostscript rasterizing failed')
|
||||
raise SubprocessOutputError()
|
||||
|
||||
|
||||
def generate_pdfa(pdf_pages, output_file, log, threads=1):
|
||||
def generate_pdfa(pdf_pages, output_file, compression, log, threads=1):
|
||||
compression_args = []
|
||||
if compression == 'jpeg':
|
||||
compression_args = [
|
||||
"-dAutoFilterColorImages=false",
|
||||
"-dColorImageFilter=/DCTEncode",
|
||||
"-dAutoFilterGrayImages=false",
|
||||
"-dGrayImageFilter=/DCTEncode",
|
||||
]
|
||||
elif compression == 'lossless':
|
||||
compression_args = [
|
||||
"-dAutoFilterColorImages=false",
|
||||
"-dColorImageFilter=/FlateEncode",
|
||||
"-dAutoFilterGrayImages=false",
|
||||
"-dGrayImageFilter=/FlateEncode",
|
||||
]
|
||||
else:
|
||||
compression_args = [
|
||||
"-dAutoFilterColorImages=true",
|
||||
"-dAutoFilterGrayImages=true",
|
||||
]
|
||||
|
||||
with NamedTemporaryFile(delete=True) as gs_pdf:
|
||||
args_gs = [
|
||||
get_program("gs"),
|
||||
@@ -68,7 +95,8 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-dAutoRotatePages=/None",
|
||||
"-sColorConversionStrategy=/RGB",
|
||||
"-sProcessColorModel=DeviceRGB",
|
||||
"-sProcessColorModel=DeviceRGB"
|
||||
] + compression_args + [
|
||||
"-dJPEGQ=95",
|
||||
"-dPDFA=2",
|
||||
"-dPDFACompatibilityPolicy=1",
|
||||
@@ -78,7 +106,7 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
|
||||
p = run(args_gs, stdout=PIPE, stderr=STDOUT,
|
||||
universal_newlines=True)
|
||||
|
||||
if 'error' in p.stdout.lower():
|
||||
if _gs_error_reported(p.stdout):
|
||||
log.error(p.stdout)
|
||||
elif 'overprint mode not set' in p.stdout:
|
||||
# Unless someone is going to print PDF/A documents on a
|
||||
@@ -96,4 +124,5 @@ def generate_pdfa(pdf_pages, output_file, log, threads=1):
|
||||
# PDF/A - check PDF/A status elsewhere
|
||||
copy(gs_pdf.name, output_file)
|
||||
else:
|
||||
log.error('Ghostscript PDF/A failed')
|
||||
log.error('Ghostscript PDF/A rendering failed')
|
||||
raise SubprocessOutputError()
|
||||
@@ -11,6 +11,7 @@ from ..helpers import page_number
|
||||
from . import get_program
|
||||
from collections import namedtuple
|
||||
from textwrap import dedent
|
||||
import PyPDF2 as pypdf
|
||||
|
||||
from subprocess import Popen, PIPE, CalledProcessError, \
|
||||
TimeoutExpired, check_output, STDOUT, DEVNULL
|
||||
@@ -199,7 +200,7 @@ def page_timedout(log, input_file):
|
||||
log.warning(prefix + " took too long to OCR - skipping")
|
||||
|
||||
|
||||
def _generate_null_hocr(output_hocr, image):
|
||||
def _generate_null_hocr(output_hocr, output_sidecar, image):
|
||||
"""Produce a .hocr file that reports no text detected on a page that is
|
||||
the same size as the input image."""
|
||||
from PIL import Image
|
||||
@@ -209,22 +210,29 @@ def _generate_null_hocr(output_hocr, image):
|
||||
|
||||
with open(output_hocr, 'w', encoding="utf-8") as f:
|
||||
f.write(HOCR_TEMPLATE.format(w, h))
|
||||
with open(output_sidecar, 'w', encoding='utf-8') as f:
|
||||
f.write('[skipped page]')
|
||||
|
||||
|
||||
def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
def generate_hocr(input_file, output_files, language: list, engine_mode,
|
||||
tessconfig: list,
|
||||
timeout: float, pagesegmode: int, log):
|
||||
|
||||
badxml = os.path.splitext(output_hocr)[0] + '.badxml'
|
||||
output_hocr = next(o for o in output_files if o.endswith('.hocr'))
|
||||
output_sidecar = next(o for o in output_files if o.endswith('.txt'))
|
||||
prefix = os.path.splitext(output_hocr)[0]
|
||||
|
||||
args_tesseract = tess_base_args(language, engine_mode)
|
||||
|
||||
if pagesegmode is not None:
|
||||
args_tesseract.extend([psm(), str(pagesegmode)])
|
||||
|
||||
# Reminder: test suite tesseract spoofers will break after any changes
|
||||
# to the number or order of parameters here
|
||||
args_tesseract.extend([
|
||||
input_file,
|
||||
badxml,
|
||||
prefix,
|
||||
'txt',
|
||||
'hocr'
|
||||
] + tessconfig)
|
||||
try:
|
||||
@@ -237,25 +245,30 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
# Temporary workaround to hocrTransform not being able to function if
|
||||
# it does not have a valid hOCR file.
|
||||
page_timedout(log, input_file)
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
_generate_null_hocr(output_hocr, output_sidecar, input_file)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_file)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
if 'Image too large' in e.output:
|
||||
_generate_null_hocr(output_hocr, input_file)
|
||||
_generate_null_hocr(output_hocr, output_sidecar, input_file)
|
||||
return
|
||||
|
||||
raise e from e
|
||||
else:
|
||||
tesseract_log_output(log, stdout, input_file)
|
||||
|
||||
if os.path.exists(badxml + '.html'):
|
||||
# Tesseract 3.02 appends suffix ".html" on its own (.badxml.html)
|
||||
shutil.move(badxml + '.html', badxml)
|
||||
elif os.path.exists(badxml + '.hocr'):
|
||||
# Tesseract 3.03 appends suffix ".hocr" on its own (.badxml.hocr)
|
||||
shutil.move(badxml + '.hocr', badxml)
|
||||
# Tesseract 3.02 appends suffix ".html" instead of ".hocr". For
|
||||
# consistency rename its output to .hocr
|
||||
if os.path.exists(prefix + '.html'):
|
||||
shutil.move(prefix + '.html', prefix + '.tmp')
|
||||
elif os.path.exists(prefix + '.hocr'):
|
||||
shutil.move(prefix + '.hocr', prefix + '.tmp')
|
||||
|
||||
# The sidecar text file will get the suffix .txt; rename it to
|
||||
# whatever caller wants it named
|
||||
if os.path.exists(prefix + '.txt'):
|
||||
shutil.move(prefix + '.txt', output_sidecar)
|
||||
|
||||
# Tesseract 3.03 inserts source filename into hocr file without
|
||||
# escaping it, creating invalid XML and breaking the parser.
|
||||
@@ -264,7 +277,7 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
|
||||
regex_nested_single_quotes = re.compile(
|
||||
r"""title='image "([^"]*)";""")
|
||||
with open(badxml, mode='r', encoding='utf-8') as f_in, \
|
||||
with open(prefix + '.tmp', mode='r', encoding='utf-8') as f_in, \
|
||||
open(output_hocr, mode='w', encoding='utf-8') as f_out:
|
||||
for line in f_in:
|
||||
line = regex_nested_single_quotes.sub(
|
||||
@@ -272,14 +285,36 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode,
|
||||
f_out.write(line)
|
||||
|
||||
|
||||
def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
engine_mode, text_only: bool,
|
||||
def use_skip_page(text_only, skip_pdf, output_pdf, output_text):
    """Emit placeholder outputs for a page on which OCR was not completed.

    Writes the literal ``[skipped page]`` to ``output_text`` and then
    produces ``output_pdf`` from ``skip_pdf``.

    Args:
        text_only: when false, ``output_pdf`` becomes a symlink to
            ``skip_pdf``; when true, ``output_pdf`` is a new one-page PDF
            containing a blank page with the same media box width and
            height as the first page of ``skip_pdf``.
        skip_pdf: path of the PDF standing in for the page.
        output_pdf: path of the PDF to create.
        output_text: path of the text file to create.
    """
    # Explicit UTF-8 so the placeholder is written the same way as the
    # sidecar produced by _generate_null_hocr, whatever the locale is.
    with open(output_text, 'w', encoding='utf-8') as f:
        f.write('[skipped page]')

    if not text_only:
        os.symlink(skip_pdf, output_pdf)
        return

    # For text only we must create a blank page with dimensions identical
    # to the skip page because this is equivalent to a page with no text
    pdf_in = pypdf.PdfFileReader(skip_pdf)
    page0 = pdf_in.pages[0]

    with open(output_pdf, 'wb') as out:
        pdf_out = pypdf.PdfFileWriter()
        w, h = page0.mediaBox.getWidth(), page0.mediaBox.getHeight()
        pdf_out.addBlankPage(w, h)
        pdf_out.write(out)
|
||||
|
||||
|
||||
def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text,
|
||||
language: list, engine_mode, text_only: bool,
|
||||
tessconfig: list, timeout: float, pagesegmode: int, log):
|
||||
'''Use Tesseract to render a PDF.
|
||||
|
||||
input_image -- image to analyze
|
||||
skip_pdf -- if we time out, use this file as output
|
||||
output_pdf -- file to generate
|
||||
output_text -- OCR text file
|
||||
language -- list of languages to consider
|
||||
engine_mode -- engine mode argument for tess v4
|
||||
text_only -- enable tesseract text only mode?
|
||||
@@ -296,10 +331,15 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
if text_only:
|
||||
args_tesseract.extend(['-c', 'textonly_pdf=1'])
|
||||
|
||||
prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes
|
||||
|
||||
# Reminder: test suite tesseract spoofers will break after any changes
|
||||
# to the number or order of parameters here
|
||||
args_tesseract.extend([
|
||||
input_image,
|
||||
os.path.splitext(output_pdf)[0], # Tesseract appends suffix
|
||||
'pdf'
|
||||
prefix,
|
||||
'txt',
|
||||
'pdf',
|
||||
] + tessconfig)
|
||||
|
||||
try:
|
||||
@@ -307,16 +347,18 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list,
|
||||
stdout = check_output(
|
||||
args_tesseract, close_fds=True, stderr=STDOUT,
|
||||
universal_newlines=True, timeout=timeout)
|
||||
if os.path.exists(prefix + '.txt'):
|
||||
shutil.move(prefix + '.txt', output_text)
|
||||
except TimeoutExpired:
|
||||
page_timedout(log, input_image)
|
||||
shutil.copy(skip_pdf, output_pdf)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_image)
|
||||
if 'read_params_file: parameter not found' in e.output:
|
||||
raise TesseractConfigError() from e
|
||||
|
||||
if 'Image too large' in e.output:
|
||||
shutil.copy(skip_pdf, output_pdf)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
|
||||
return
|
||||
raise e from e
|
||||
else:
|
||||
|
||||
@@ -423,7 +423,7 @@ def _find_images(pdf, container, shorthand=None):
|
||||
|
||||
"""
|
||||
|
||||
if container.get('/Type') == '/Page':
|
||||
if container.get('/Type') == '/Page' and '/Contents' in container:
|
||||
# For a /Page the content stream is attached to the page's /Contents
|
||||
page = container
|
||||
contentstream = pypdf.pdf.ContentStream(page.getContents(), pdf)
|
||||
@@ -457,6 +457,9 @@ def _find_images(pdf, container, shorthand=None):
|
||||
|
||||
|
||||
def _page_has_text(pdf, page):
|
||||
if not '/Contents' in page:
|
||||
return False
|
||||
|
||||
# Simple test
|
||||
text = page.extractText()
|
||||
if text.strip() != '':
|
||||
|
||||
@@ -247,7 +247,7 @@ def is_ocr_required(pageinfo, log, options):
|
||||
"skipping all processing on this page"))
|
||||
ocr_required = False
|
||||
|
||||
if ocr_required and options.skip_big:
|
||||
if ocr_required and options.skip_big and pageinfo['images']:
|
||||
pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
|
||||
if pixel_count > (options.skip_big * 1000000):
|
||||
ocr_required = False
|
||||
@@ -393,15 +393,16 @@ def rasterize_with_ghostscript(
|
||||
pageinfo = get_pageinfo(input_file, context)
|
||||
|
||||
device = 'png16m' # 24-bit
|
||||
if all(image['comp'] == 1 for image in pageinfo['images']):
|
||||
if all(image['bpc'] == 1 for image in pageinfo['images']):
|
||||
device = 'pngmono'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'index'
|
||||
for image in pageinfo['images']):
|
||||
device = 'png256'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'gray'
|
||||
for image in pageinfo['images']):
|
||||
device = 'pnggray'
|
||||
if pageinfo['images']:
|
||||
if all(image['comp'] == 1 for image in pageinfo['images']):
|
||||
if all(image['bpc'] == 1 for image in pageinfo['images']):
|
||||
device = 'pngmono'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'index'
|
||||
for image in pageinfo['images']):
|
||||
device = 'png256'
|
||||
elif all(image['bpc'] > 1 and image['color'] == 'gray'
|
||||
for image in pageinfo['images']):
|
||||
device = 'pnggray'
|
||||
|
||||
log.debug("Rasterize {0} with {1}".format(
|
||||
os.path.basename(input_file), device))
|
||||
@@ -482,13 +483,13 @@ def select_ocr_image(
|
||||
|
||||
def ocr_tesseract_hocr(
|
||||
input_file,
|
||||
output_file,
|
||||
output_files,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
tesseract.generate_hocr(
|
||||
input_file=input_file,
|
||||
output_hocr=output_file,
|
||||
output_files=output_files,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
tessconfig=options.tesseract_config,
|
||||
@@ -517,7 +518,10 @@ def select_visible_page_image(
|
||||
image = next(ii for ii in infiles if ii.endswith(image_suffix))
|
||||
|
||||
pageinfo = get_pageinfo(image, context)
|
||||
if all(orig_image['enc'] == 'jpeg' for orig_image in pageinfo['images']):
|
||||
if pageinfo['images'] and \
|
||||
all(im['enc'] == 'jpeg' for im in pageinfo['images']):
|
||||
log.debug('{:4d}: JPEG input -> JPEG output'.format(
|
||||
page_number(image)))
|
||||
# If all images were JPEGs originally, produce a JPEG as output
|
||||
im = Image.open(image)
|
||||
|
||||
@@ -575,12 +579,12 @@ def select_image_layer(
|
||||
|
||||
|
||||
def render_hocr_page(
|
||||
input_file,
|
||||
infiles,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
hocr = input_file
|
||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||
pageinfo = get_pageinfo(hocr, context)
|
||||
dpi = get_page_square_dpi(pageinfo, options)
|
||||
|
||||
@@ -589,14 +593,22 @@ def render_hocr_page(
|
||||
showBoundingboxes=False, invisibleText=True)
|
||||
|
||||
|
||||
def flatten_groups(groups):
    """Yield the members of ``groups``, expanding nested groups one level.

    An element for which ``is_iterable_notstr`` is true is replaced by its
    own items; any other element is yielded unchanged. Only one level of
    nesting is expanded -- items of a nested group are yielded as-is.

    Args:
        groups: iterable mixing single items and groups of items (the
            callers in this file pass file names and lists of file names).

    Yields:
        Each single item, in the original order.
    """
    for obj in groups:
        if is_iterable_notstr(obj):
            yield from obj
        else:
            yield obj
|
||||
|
||||
|
||||
def render_hocr_debug_page(
|
||||
infiles,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
hocr = next(ii for ii in infiles if ii.endswith('.hocr'))
|
||||
image = next(ii for ii in infiles if ii.endswith('.image'))
|
||||
hocr = next(ii for ii in flatten_groups(infiles) if ii.endswith('.hocr'))
|
||||
image = next(ii for ii in flatten_groups(infiles) if ii.endswith('.image'))
|
||||
|
||||
pageinfo = get_pageinfo(image, context)
|
||||
dpi = get_page_square_dpi(pageinfo, options)
|
||||
@@ -611,8 +623,10 @@ def combine_layers(
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
text = next(ii for ii in infiles if ii.endswith('.text.pdf'))
|
||||
image = next(ii for ii in infiles if ii.endswith('.image-layer.pdf'))
|
||||
text = next(ii for ii in flatten_groups(infiles)
|
||||
if ii.endswith('.text.pdf'))
|
||||
image = next(ii for ii in flatten_groups(infiles)
|
||||
if ii.endswith('.image-layer.pdf'))
|
||||
|
||||
pdf_text = pypdf.PdfFileReader(open(text, "rb"))
|
||||
pdf_image = pypdf.PdfFileReader(open(image, "rb"))
|
||||
@@ -678,21 +692,27 @@ def combine_layers(
|
||||
|
||||
def ocr_tesseract_and_render_pdf(
|
||||
infiles,
|
||||
output_file,
|
||||
outfiles,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
input_image = next((ii for ii in infiles if ii.endswith('.image')), '')
|
||||
input_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
|
||||
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
|
||||
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
|
||||
|
||||
if not input_image:
|
||||
# Skipping this page
|
||||
re_symlink(input_pdf, output_file, log)
|
||||
re_symlink(input_pdf, output_pdf, log)
|
||||
with open(output_text, 'w') as f:
|
||||
f.write('[skipped page]')
|
||||
return
|
||||
|
||||
tesseract.generate_pdf(
|
||||
input_image=input_image,
|
||||
skip_pdf=input_pdf,
|
||||
output_pdf=output_file,
|
||||
output_pdf=output_pdf,
|
||||
output_text=output_text,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
text_only=False,
|
||||
@@ -704,19 +724,23 @@ def ocr_tesseract_and_render_pdf(
|
||||
|
||||
def ocr_tesseract_textonly_pdf(
|
||||
infiles,
|
||||
output_file,
|
||||
outfiles,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
input_image = next((ii for ii in infiles if ii.endswith('.ocr.png')), '')
|
||||
if not input_image:
|
||||
raise ValueError("No image rendered?")
|
||||
|
||||
skip_pdf = next((ii for ii in infiles if ii.endswith('.pdf')))
|
||||
|
||||
output_pdf = next((ii for ii in outfiles if ii.endswith('.pdf')))
|
||||
output_text = next((ii for ii in outfiles if ii.endswith('.txt')))
|
||||
|
||||
tesseract.generate_pdf(
|
||||
input_image=input_image,
|
||||
skip_pdf=skip_pdf,
|
||||
output_pdf=output_file,
|
||||
output_pdf=output_pdf,
|
||||
output_text=output_text,
|
||||
language=options.language,
|
||||
engine_mode=options.tesseract_oem,
|
||||
text_only=True,
|
||||
@@ -773,7 +797,8 @@ def generate_postscript_stub(
|
||||
def skip_page(
|
||||
input_file,
|
||||
output_file,
|
||||
log):
|
||||
log,
|
||||
context):
|
||||
# The purpose of this step is its filter to forward only the skipped
|
||||
# files (.skip.oriented.pdf) while disregarding the processed ones
|
||||
# (.ocr.oriented.pdf). Alternative would be for merge_pages to filter
|
||||
@@ -782,7 +807,7 @@ def skip_page(
|
||||
|
||||
|
||||
def merge_pages_ghostscript(
|
||||
input_files,
|
||||
input_files_groups,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
@@ -800,17 +825,24 @@ def merge_pages_ghostscript(
|
||||
key += 1
|
||||
return key
|
||||
|
||||
input_files = (f for f in flatten_groups(input_files_groups)
|
||||
if not f.endswith('.txt'))
|
||||
pdf_pages = sorted(input_files, key=input_file_order)
|
||||
log.debug("Final pages: " + "\n".join(pdf_pages))
|
||||
ghostscript.generate_pdfa(pdf_pages, output_file, log, options.jobs or 1)
|
||||
ghostscript.generate_pdfa(
|
||||
pdf_pages, output_file, options.pdfa_image_compression,
|
||||
log, options.jobs or 1)
|
||||
|
||||
|
||||
def merge_pages_qpdf(
|
||||
input_files,
|
||||
input_files_groups,
|
||||
output_file,
|
||||
log,
|
||||
context):
|
||||
options = context.get_options()
|
||||
|
||||
input_files = list(f for f in flatten_groups(input_files_groups)
|
||||
if not f.endswith('.txt'))
|
||||
metadata_file = next(
|
||||
(ii for ii in input_files if ii.endswith('.repaired.pdf')))
|
||||
input_files.remove(metadata_file)
|
||||
@@ -844,6 +876,40 @@ def merge_pages_qpdf(
|
||||
qpdf.merge(pdf_pages, output_file)
|
||||
|
||||
|
||||
def merge_sidecars(
        input_files_groups,
        output_file,
        log,
        context):
    """Concatenate the per-page ``.txt`` files into one sidecar text file.

    Each input whose name ends in ``.txt`` is placed in the slot given by
    ``page_number(infile) - 1``. Pages are written in order, separated by a
    form feed; a page with no text file gets the line
    ``[OCR skipped on page N]`` instead.

    Args:
        input_files_groups: file names and/or groups of file names; entries
            not ending in ``.txt`` are ignored.
        output_file: destination path, or ``'-'`` to write to
            ``sys.stdout``.
        log: unused here; part of the common task signature.
        context: job context; ``get_pdfinfo()`` supplies the page count.
    """
    pdfinfo = context.get_pdfinfo()

    # One slot per page of the document; None means "no text for this page".
    txt_files = [None] * len(pdfinfo)

    for infile in flatten_groups(input_files_groups):
        if infile.endswith('.txt'):
            idx = page_number(infile) - 1
            txt_files[idx] = infile

    def write_pages(stream):
        for index, txt_file in enumerate(txt_files):
            if index != 0:
                stream.write('\f')  # Form feed between pages
            if txt_file:
                # Read as UTF-8 explicitly: the merged file is written as
                # UTF-8 below, and relying on the locale default here would
                # fail or corrupt text on non-UTF-8 systems.
                with open(txt_file, 'r', encoding='utf-8') as in_:
                    stream.write(in_.read())
            else:
                stream.write('[OCR skipped on page {}]'.format(index + 1))

    if output_file == '-':
        write_pages(sys.stdout)
        sys.stdout.flush()
    else:
        with open(output_file, 'w', encoding='utf-8') as out:
            write_pages(out)
|
||||
|
||||
|
||||
def copy_final(
|
||||
input_files,
|
||||
output_file,
|
||||
@@ -948,7 +1014,7 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_hocr,
|
||||
input=task_select_ocr_image,
|
||||
filter=suffix(".ocr.png"),
|
||||
output=".hocr",
|
||||
output=[".hocr", ".txt"],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_hocr.graphviz(fillcolor='"#00cc66"')
|
||||
task_ocr_tesseract_hocr.active_if(options.pdf_renderer == 'hocr')
|
||||
@@ -980,8 +1046,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_render_hocr_page = main_pipeline.transform(
|
||||
task_func=render_hocr_page,
|
||||
input=task_ocr_tesseract_hocr,
|
||||
filter=suffix('.hocr'),
|
||||
output='.text.pdf',
|
||||
filter=regex(r".*/(\d{6})(?:\.hocr)"),
|
||||
output=os.path.join(work_folder, r'\1.text.pdf'),
|
||||
extras=[log, context])
|
||||
task_render_hocr_page.graphviz(fillcolor='"#00cc66"')
|
||||
task_render_hocr_page.active_if(options.pdf_renderer == 'hocr')
|
||||
@@ -1001,7 +1067,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_textonly_pdf,
|
||||
input=[task_select_ocr_image, task_orient_page],
|
||||
filter=regex(r".*/(\d{6})(?:\.ocr.png|\.ocr\.oriented\.pdf)"),
|
||||
output=os.path.join(work_folder, r'\1.text.pdf'),
|
||||
output=[os.path.join(work_folder, r'\1.text.pdf'),
|
||||
os.path.join(work_folder, r'\1.text.txt')],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_textonly_pdf.graphviz(fillcolor='"#ff69b4"')
|
||||
task_ocr_tesseract_textonly_pdf.active_if(options.pdf_renderer == 'tess4')
|
||||
@@ -1024,7 +1091,8 @@ def build_pipeline(options, work_folder, log, context):
|
||||
task_func=ocr_tesseract_and_render_pdf,
|
||||
input=[task_select_visible_page_image, task_orient_page],
|
||||
filter=regex(r".*/(\d{6})(?:\.image|\.ocr\.oriented\.pdf)"),
|
||||
output=os.path.join(work_folder, r'\1.rendered.pdf'),
|
||||
output=[os.path.join(work_folder, r'\1.rendered.pdf'),
|
||||
os.path.join(work_folder, r'\1.rendered.txt')],
|
||||
extras=[log, context])
|
||||
task_ocr_tesseract_and_render_pdf.graphviz(fillcolor='"#66ccff"')
|
||||
task_ocr_tesseract_and_render_pdf.active_if(options.pdf_renderer == 'tesseract')
|
||||
@@ -1048,7 +1116,7 @@ def build_pipeline(options, work_folder, log, context):
|
||||
filter=suffix('.skip.oriented.pdf'),
|
||||
output='.done.pdf',
|
||||
output_dir=work_folder,
|
||||
extras=[log])
|
||||
extras=[log, context])
|
||||
|
||||
# Merge pages
|
||||
task_merge_pages_ghostscript = main_pipeline.merge(
|
||||
@@ -1073,6 +1141,15 @@ def build_pipeline(options, work_folder, log, context):
|
||||
extras=[log, context])
|
||||
task_merge_pages_qpdf.active_if(options.output_type == 'pdf')
|
||||
|
||||
task_merge_sidecars = main_pipeline.merge(
|
||||
task_func=merge_sidecars,
|
||||
input=[task_ocr_tesseract_hocr,
|
||||
task_ocr_tesseract_and_render_pdf,
|
||||
task_ocr_tesseract_textonly_pdf],
|
||||
output=options.sidecar,
|
||||
extras=[log, context])
|
||||
task_merge_sidecars.active_if(options.sidecar)
|
||||
|
||||
# Finalize
|
||||
task_copy_final = main_pipeline.merge(
|
||||
task_func=copy_final,
|
||||
|
||||
@@ -11,3 +11,4 @@ ignore =
|
||||
[tool:pytest]
|
||||
norecursedirs = lib .pc .git output cache resources
|
||||
testpaths = tests
|
||||
addopts = -n auto
|
||||
@@ -1,2 +1,3 @@
|
||||
pytest >= 3.0
|
||||
pytest-helpers-namespace
|
||||
pytest-helpers-namespace
|
||||
pytest-xdist
|
||||
@@ -34,6 +34,9 @@ In some cases they were converted from one image format to another without other
|
||||
* - typewriter.png, 2400dpi.pdf
|
||||
- `Wikimedia: Triumph typewrtier text Linzensoep`_
|
||||
* Creative Commons BY-SA 2.5
|
||||
* - baiona.png
|
||||
- `Wikimedia: Baionako udalerri mugakideak`_
|
||||
- Creative Commons BY-SA 4.0
|
||||
|
||||
|
||||
Files generated for this project
|
||||
@@ -85,6 +88,9 @@ under the terms of the license in LICENSE.rst.
|
||||
* - overlay.pdf
|
||||
- @maxandersen
|
||||
- PDF file generated by PDFPen pro that triggered content stream parse errors
|
||||
* - no_contents.pdf
|
||||
- @jbarlow83
|
||||
- synthetic PDF with a blank page that has no /Contents entry
|
||||
|
||||
Assemblies
|
||||
==========
|
||||
@@ -115,4 +121,6 @@ These test resources are assemblies from other previously mentioned files, relea
|
||||
|
||||
.. _`Linux (Wikipedia Article)`: https://de.wikipedia.org/wiki/Linux
|
||||
|
||||
.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
|
||||
.. _`Wikimedia: Triumph typewrtier text Linzensoep`: https://commons.wikimedia.org/wiki/File:Triumph.typewriter_text_Linzensoep.gif
|
||||
|
||||
.. _`Wikimedia: Baionako udalerri mugakideak`: https://commons.wikimedia.org/wiki/File:Baionako_udalerri_mugakideak.png
|
||||
BIN
tests/resources/baiona.png
Normal file
BIN
tests/resources/baiona.png
Normal file
Binary file not shown.
BIN
tests/resources/baiona_gray.png
Normal file
BIN
tests/resources/baiona_gray.png
Normal file
Binary file not shown.
BIN
tests/resources/no_contents.pdf
Normal file
BIN
tests/resources/no_contents.pdf
Normal file
Binary file not shown.
Binary file not shown.
31
tests/spoof/gs_raster_failure.py
Executable file
31
tests/spoof/gs_raster_failure.py
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
|
||||
"""Replicate Ghostscript raster failure while allowing rendering"""
|
||||
|
||||
|
||||
def real_ghostscript(argv):
    """Hand control to the real ``gs`` program.

    Every argument after ``argv[0]`` is forwarded unchanged. ``os.execvp``
    replaces the current process, so a successful call never returns.
    """
    program = "gs"
    os.execvp(program, [program] + argv[1:])
    return  # Not reachable
|
||||
|
||||
|
||||
def main():
    """Spoof Ghostscript so that rasterizing fails while rendering still works.

    * ``--version`` prints a fake version number plus a marker naming this
      script, then exits with status 0.
    * Calls using the ``pdfwrite`` device (rendering) are handed to the real
      Ghostscript.
    * Anything else (rasterizing) prints an error and exits with status 1.
    """
    if '--version' in sys.argv:
        print('9.20')
        # __file__ is the module's own path; the previous __filename__ is not
        # a Python name and raised NameError on every --version query.
        print('SPOOFED: ' + os.path.basename(__file__))
        sys.exit(0)

    # For any rendering calls (device == pdfwrite) call real ghostscript
    if '-sDEVICE=pdfwrite' in sys.argv:
        real_ghostscript(sys.argv)
        return

    # Fail
    print("ERROR: Ghost story archive not found")
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
31
tests/spoof/gs_render_failure.py
Executable file
31
tests/spoof/gs_render_failure.py
Executable file
@@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
|
||||
"""Replicate Ghostscript render failure while allowing rasterizing"""
|
||||
|
||||
|
||||
def real_ghostscript(argv):
    """Hand control to the real ``gs`` program.

    Every argument after ``argv[0]`` is forwarded unchanged. ``os.execvp``
    replaces the current process, so a successful call never returns.
    """
    program = "gs"
    os.execvp(program, [program] + argv[1:])
    return  # Not reachable
|
||||
|
||||
|
||||
def main():
    """Spoof Ghostscript so that rendering fails while rasterizing still works.

    * ``--version`` prints a fake version number plus a marker naming this
      script, then exits with status 0.
    * Calls that do not use the ``pdfwrite`` device (rasterizing) are handed
      to the real Ghostscript.
    * ``pdfwrite`` calls (rendering) print an error and exit with status 1.
    """
    if '--version' in sys.argv:
        print('9.20')
        # __file__ is the module's own path; the previous __filename__ is not
        # a Python name and raised NameError on every --version query.
        print('SPOOFED: ' + os.path.basename(__file__))
        sys.exit(0)

    # For any rasterize calls (device != pdfwrite) call real ghostscript
    if '-sDEVICE=pdfwrite' not in sys.argv:
        real_ghostscript(sys.argv)
        return

    # Fail
    print("ERROR: Casper is not a friendly ghost")
    sys.exit(1)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -36,6 +36,10 @@ def real_tesseract():
|
||||
|
||||
def main():
|
||||
operation = sys.argv[-1]
|
||||
sidecar = False
|
||||
if sys.argv[-2] == 'txt':
|
||||
sidecar = True
|
||||
|
||||
# For any unexpected operation, defer to the real tesseract binary
|
||||
# Currently this includes all use of "--tesseract-config"
|
||||
if operation != 'hocr' and operation != 'pdf' and operation != 'stdout':
|
||||
@@ -92,16 +96,22 @@ def main():
|
||||
return
|
||||
|
||||
if operation == 'stdout':
|
||||
# tesseract [--options] ... input stdout
|
||||
input_file = sys.argv[-2]
|
||||
output_file = 'stdout'
|
||||
sidecar_file = ''
|
||||
else:
|
||||
input_file = sys.argv[-3]
|
||||
output_file = sys.argv[-2]
|
||||
# tesseract [--options] ... input output txt hocr|pdf
|
||||
input_file = sys.argv[-4]
|
||||
output_file = sys.argv[-3]
|
||||
sidecar_file = sys.argv[-3]
|
||||
|
||||
if operation == 'hocr':
|
||||
output_file += '.hocr'
|
||||
sidecar_file += '.txt'
|
||||
elif operation == 'pdf':
|
||||
output_file += '.pdf'
|
||||
sidecar_file += '.txt'
|
||||
|
||||
with open(input_file, 'rb') as f:
|
||||
m.update(f.read())
|
||||
@@ -112,6 +122,8 @@ def main():
|
||||
print("Tesseract cache hit", file=sys.stderr)
|
||||
if operation != 'stdout':
|
||||
shutil.copy(cache_name, output_file)
|
||||
if sidecar:
|
||||
shutil.copy(cache_name + '.sidecar', sidecar_file)
|
||||
|
||||
# Replicate output
|
||||
with open(cache_name + '.stdout', 'rb') as f:
|
||||
@@ -149,6 +161,8 @@ def main():
|
||||
shutil.copy(output_file, cache_name)
|
||||
else:
|
||||
print("Could not find output file", file=sys.stderr)
|
||||
if sidecar and os.path.exists(sidecar_file):
|
||||
shutil.copy(sidecar_file, cache_name + '.sidecar')
|
||||
else:
|
||||
open(cache_name, 'w').close()
|
||||
|
||||
|
||||
@@ -53,18 +53,22 @@ def main():
|
||||
print('List of available languages (1):\neng', file=sys.stderr)
|
||||
sys.exit(0)
|
||||
elif sys.argv[-1] == 'hocr':
|
||||
inputf = sys.argv[-3]
|
||||
output = sys.argv[-2]
|
||||
inputf = sys.argv[-4]
|
||||
output = sys.argv[-3]
|
||||
with Image.open(inputf) as im, \
|
||||
open(output + '.hocr', 'w', encoding='utf-8') as f:
|
||||
w, h = im.size
|
||||
f.write(HOCR_TEMPLATE.format(str(w), str(h)))
|
||||
with open(output + '.txt', 'w') as f:
|
||||
f.write('')
|
||||
elif sys.argv[-1] == 'pdf':
|
||||
inputf = sys.argv[-3]
|
||||
output = sys.argv[-2]
|
||||
inputf = sys.argv[-4]
|
||||
output = sys.argv[-3]
|
||||
pdf_bytes = img2pdf.convert([inputf], dpi=300)
|
||||
with open(output + '.pdf', 'wb') as f:
|
||||
f.write(pdf_bytes)
|
||||
with open(output + '.txt', 'w') as f:
|
||||
f.write('')
|
||||
elif sys.argv[-1] == 'stdout':
|
||||
inputf = sys.argv[-2]
|
||||
print("""Orientation: 0
|
||||
|
||||
@@ -51,6 +51,16 @@ def spoof_no_tess_pdfa_warning():
|
||||
return spoof(tesseract='tesseract_noop.py', gs='gs_feature_elision.py')
|
||||
|
||||
|
||||
@pytest.fixture
def spoof_no_tess_gs_render_fail():
    """Spoofed environment: no-op Tesseract plus gs_render_failure.py as Ghostscript."""
    substitutes = dict(tesseract='tesseract_noop.py',
                       gs='gs_render_failure.py')
    return spoof(**substitutes)
|
||||
|
||||
|
||||
@pytest.fixture
def spoof_no_tess_gs_raster_fail():
    """Spoofed environment: no-op Tesseract plus gs_raster_failure.py as Ghostscript."""
    substitutes = dict(tesseract='tesseract_noop.py',
                       gs='gs_raster_failure.py')
    return spoof(**substitutes)
|
||||
|
||||
|
||||
@pytest.fixture
def spoof_qpdf_always_error():
    """Spoofed environment that substitutes qpdf_dummy_return2.py for qpdf."""
    qpdf_substitute = 'qpdf_dummy_return2.py'
    return spoof(qpdf=qpdf_substitute)
|
||||
@@ -137,14 +147,18 @@ def test_remove_background(spoof_tesseract_noop, resources, outdir):
|
||||
@pytest.mark.parametrize("output_type", ['pdf', 'pdfa'])
|
||||
def test_exotic_image(spoof_tesseract_cache, pdf, renderer, output_type,
|
||||
resources, outdir):
|
||||
outfile = outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer)
|
||||
check_ocrmypdf(
|
||||
resources / pdf,
|
||||
outdir / 'test_{0}_{1}.pdf'.format(pdf, renderer),
|
||||
outfile,
|
||||
'-dc',
|
||||
'-v', '1',
|
||||
'--output-type', output_type,
|
||||
'--sidecar',
|
||||
'--pdf-renderer', renderer, env=spoof_tesseract_cache)
|
||||
|
||||
assert outfile.with_suffix('.pdf.txt').exists()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("output_type", [
|
||||
'pdfa', 'pdf'
|
||||
@@ -738,6 +752,11 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
|
||||
"Checks for a Decimal quantize error with high DPI, etc"
|
||||
check_ocrmypdf(resources / '2400dpi.pdf', outpdf,
|
||||
env=spoof_tesseract_cache)
|
||||
pdfinfo = pdf_get_all_pageinfo(outpdf)
|
||||
|
||||
image = pdfinfo[0]['images'][0]
|
||||
assert image['dpi_w'] == image['dpi_h']
|
||||
assert image['dpi_w'] == 2400
|
||||
|
||||
|
||||
def test_overlay(spoof_tesseract_noop, resources, outpdf):
|
||||
@@ -835,3 +854,138 @@ def test_pagesize_consistency(renderer, resources, outpdf):
|
||||
|
||||
assert isclose(before_dims[0], after_dims[0])
|
||||
assert isclose(before_dims[1], after_dims[1])
|
||||
|
||||
|
||||
def test_skip_big_with_no_images(spoof_tesseract_noop, resources, outpdf):
    """Run blank.pdf through check_ocrmypdf with --skip-big 5 and --force-ocr."""
    infile = resources / 'blank.pdf'
    options = ['--skip-big', '5', '--force-ocr']
    check_ocrmypdf(infile, outpdf, *options, env=spoof_tesseract_noop)
|
||||
|
||||
|
||||
def test_gs_render_failure(spoof_no_tess_gs_render_fail, resources, outpdf):
    """A spoofed Ghostscript render failure must yield child_process_error."""
    infile = resources / 'blank.pdf'
    proc, _stdout, stderr = run_ocrmypdf(
        infile, outpdf, env=spoof_no_tess_gs_render_fail)

    # Emit stderr so it is available when diagnosing a failed run
    print(stderr)
    assert proc.returncode == ExitCode.child_process_error
|
||||
|
||||
|
||||
def test_gs_raster_failure(spoof_no_tess_gs_raster_fail, resources, outpdf):
    """A spoofed Ghostscript raster failure must yield child_process_error."""
    infile = resources / 'ccitt.pdf'
    proc, _stdout, stderr = run_ocrmypdf(
        infile, outpdf, env=spoof_no_tess_gs_raster_fail)

    # Emit stderr so it is available when diagnosing a failed run
    print(stderr)
    assert proc.returncode == ExitCode.child_process_error
|
||||
|
||||
|
||||
def test_no_contents(spoof_tesseract_noop, resources, outpdf):
    """Force OCR on no_contents.pdf using the no-op Tesseract spoof."""
    infile = resources / 'no_contents.pdf'
    check_ocrmypdf(infile, outpdf, '--force-ocr', env=spoof_tesseract_noop)
|
||||
|
||||
|
||||
@pytest.mark.parametrize('image', [
    'baiona.png',
    'baiona_gray.png',
    'congress.jpg'
])
def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,
                               resources, image, outpdf):
    """Check that ``--output-type pdf`` keeps the image's encoding and color.

    The image is piped to ocrmypdf on stdin. A PNG input must not come out
    JPEG-encoded, a JPEG input must stay JPEG, and RGB/gray inputs must keep
    their colorspace.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the color mode is needed later, so read it and close the image
    # immediately instead of leaving the file handle open for the whole test.
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150', '--output-type', 'pdf', '-', output_file]
        p = Popen(
            p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
            stdin=input_stream, env=spoof_tesseract_noop)
        out, err = p.communicate()

    # Attach the captured stderr so a failing run explains itself
    assert p.returncode == ExitCode.ok, err

    pdfinfo = pdf_get_all_pageinfo(output_file)

    pdfimage = pdfinfo[0]['images'][0]

    if input_file.endswith('.png'):
        assert pdfimage['enc'] != 'jpeg', \
            "Lossless compression changed to lossy!"
    elif input_file.endswith('.jpg'):
        assert pdfimage['enc'] == 'jpeg', \
            "Lossy compression changed to lossless!"
    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('image,compression', [
    ('baiona.png', 'jpeg'),
    ('baiona_gray.png', 'lossless'),
    ('congress.jpg', 'lossless')
])
def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,
                             resources, image, compression, outpdf):
    """Check that ``--pdfa-image-compression`` sets the output image encoding.

    The image is piped to ocrmypdf on stdin with ``--output-type pdfa``. The
    resulting image encoding must match the requested compression, and
    RGB/gray inputs must keep their colorspace.
    """
    from PIL import Image

    input_file = str(resources / image)
    output_file = str(outpdf)

    # Only the color mode is needed later, so read it and close the image
    # immediately instead of leaving the file handle open for the whole test.
    with Image.open(input_file) as im:
        mode = im.mode

    # Runs: ocrmypdf - output.pdf < testfile
    with open(input_file, 'rb') as input_stream:
        p_args = ocrmypdf_exec + [
            '--image-dpi', '150', '--output-type', 'pdfa',
            '--pdfa-image-compression', compression,
            '-', output_file]
        p = Popen(
            p_args, close_fds=True, stdout=PIPE, stderr=PIPE,
            stdin=input_stream, env=spoof_tesseract_noop)
        out, err = p.communicate()

    # Attach the captured stderr so a failing run explains itself
    assert p.returncode == ExitCode.ok, err

    pdfinfo = pdf_get_all_pageinfo(output_file)

    pdfimage = pdfinfo[0]['images'][0]

    if compression == 'jpeg':
        assert pdfimage['enc'] == 'jpeg'
    elif compression == 'lossless':
        assert pdfimage['enc'] == 'image'

    if mode.startswith('RGB') or mode.startswith('BGR'):
        assert pdfimage['color'] == 'rgb', \
            "Colorspace changed"
    elif mode.startswith('L'):
        assert pdfimage['color'] == 'gray', \
            "Colorspace changed"
|
||||
|
||||
|
||||
def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf):
    """Check that the sidecar text file contains one section per input page."""
    # str() keeps this working whether the fixture yields a str or a path
    sidecar = str(outpdf) + '.txt'
    check_ocrmypdf(
        resources / 'multipage.pdf', outpdf,
        '--skip-text',
        '--sidecar', sidecar,
        env=spoof_tesseract_cache)

    pdfinfo = pdf_get_all_pageinfo(str(resources / 'multipage.pdf'))
    num_pages = len(pdfinfo)

    # The sidecar is written as UTF-8, so decode it the same way rather than
    # relying on the locale's default encoding.
    with open(sidecar, 'r', encoding='utf-8') as f:
        ocr_text = f.read()

    # There should be a formfeed between each pair of pages, so the count of
    # formfeeds is the page count less one
    assert ocr_text.count('\f') == num_pages - 1, \
        "Sidecar page count does not match PDF page count"
|
||||
|
||||
@@ -107,3 +107,11 @@ def test_form_xobject(resources):
|
||||
pdfinfo = pageinfo.pdf_get_all_pageinfo(str(filename))
|
||||
pdfimage = pdfinfo[0]['images'][0]
|
||||
assert pdfimage['width'] == 50
|
||||
|
||||
|
||||
def test_no_contents(resources):
    """The first page of no_contents.pdf must report no images and no text."""
    pdf_path = str(resources / 'no_contents.pdf')
    first_page = pageinfo.pdf_get_all_pageinfo(pdf_path)[0]

    assert len(first_page['images']) == 0
    assert first_page['has_text'] == False
|
||||
@@ -6,11 +6,42 @@ from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.exec import tesseract
|
||||
from ocrmypdf import pageinfo
|
||||
import sys
|
||||
import os
|
||||
import PyPDF2 as pypdf
|
||||
|
||||
|
||||
spoof = pytest.helpers.spoof
|
||||
|
||||
|
||||
def tess4_possible_location():
    """Return the most likely location of a tesseract 4 binary.

    The environment variables OCRMYPDF_TESS4 and OCRMYPDF_TESSERACT are
    consulted in that order; the first one holding a non-empty value wins.
    If neither is set, fall back to plain 'tesseract' (resolved on PATH).
    """
    for variable in ('OCRMYPDF_TESS4', 'OCRMYPDF_TESSERACT'):
        candidate = os.environ.get(variable)
        if candidate:
            return candidate
    return 'tesseract'
|
||||
|
||||
|
||||
@pytest.fixture
def ensure_tess4():
    """Spoofed environment whose tesseract is tess4_possible_location()."""
    location = tess4_possible_location()
    return spoof(tesseract=location)
|
||||
|
||||
|
||||
def tess4_available():
    """Check if a tesseract 4 binary is available, even if it's not the
    official "tesseract" on PATH

    OCRMYPDF_TESSERACT is pointed at tess4_possible_location() for the
    duration of the check, and its previous state is restored afterwards.
    """
    variable = 'OCRMYPDF_TESSERACT'
    previous = os.environ.get(variable)
    try:
        os.environ[variable] = tess4_possible_location()
        return tesseract.v4() and tesseract.has_textonly_pdf()
    finally:
        # Restore only the key we touched. Rebinding os.environ to a copied
        # dict would swap the live environment mapping for a plain dict and
        # leave the override in the real process environment.
        if previous is None:
            os.environ.pop(variable, None)
        else:
            os.environ[variable] = previous
|
||||
|
||||
|
||||
# Skip all tests in this file if not tesseract 4
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not (tesseract.v4() and tesseract.has_textonly_pdf()),
|
||||
not tess4_available(),
|
||||
reason="tesseract 4.0 with textonly_pdf feature required")
|
||||
|
||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||
@@ -18,14 +49,16 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
|
||||
spoof = pytest.helpers.spoof
|
||||
|
||||
|
||||
def test_textonly_pdf(resources, outdir):
|
||||
def test_textonly_pdf(ensure_tess4, resources, outdir):
|
||||
check_ocrmypdf(
|
||||
resources / 'linn.pdf',
|
||||
outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4')
|
||||
outdir / 'linn_textonly.pdf', '--pdf-renderer', 'tess4',
|
||||
'--sidecar', 'foo',
|
||||
env=ensure_tess4)
|
||||
|
||||
|
||||
@pytest.mark.skipif(sys.version_info < (3, 5), reason="needs math.isclose")
|
||||
def test_pagesize_consistency_tess4(resources, outpdf):
|
||||
def test_pagesize_consistency_tess4(ensure_tess4, resources, outpdf):
|
||||
from math import isclose
|
||||
|
||||
infile = resources / 'linn.pdf'
|
||||
@@ -35,9 +68,47 @@ def test_pagesize_consistency_tess4(resources, outpdf):
|
||||
check_ocrmypdf(
|
||||
infile,
|
||||
outpdf, '--pdf-renderer', 'tess4',
|
||||
'--clean', '--deskew', '--remove-background', '--clean-final')
|
||||
'--clean', '--deskew', '--remove-background', '--clean-final',
|
||||
env=ensure_tess4)
|
||||
|
||||
after_dims = pytest.helpers.first_page_dimensions(outpdf)
|
||||
|
||||
assert isclose(before_dims[0], after_dims[0])
|
||||
assert isclose(before_dims[1], after_dims[1])
|
||||
|
||||
|
||||
@pytest.mark.parametrize('basename', ['graph_ocred.pdf', 'cardinal.pdf'])
def test_skip_pages_does_not_replicate(
        ensure_tess4, resources, basename, outdir):
    """With the tess4 renderer, forced OCR and a zero tesseract timeout, every
    output page must hold exactly one image and keep its input page width.
    """
    infile = resources / basename
    outpdf = outdir / basename

    tess4_args = [
        '--pdf-renderer', 'tess4', '--force-ocr',
        '--tesseract-timeout', '0',
    ]
    check_ocrmypdf(infile, outpdf, *tess4_args, env=ensure_tess4)

    info_in = pageinfo.pdf_get_all_pageinfo(str(infile))
    info = pageinfo.pdf_get_all_pageinfo(str(outpdf))

    for page in info:
        assert len(page['images']) == 1, "skipped page was replicated"

    for n, page_in in enumerate(info_in):
        assert info[n]['width_inches'] == page_in['width_inches']
|
||||
|
||||
|
||||
def test_content_preservation(ensure_tess4, resources, outpdf):
    """With the tess4 renderer and a zero tesseract timeout, the first page of
    masks.pdf must still contain more than one image afterwards.
    """
    infile = resources / 'masks.pdf'
    tess4_args = ['--pdf-renderer', 'tess4', '--tesseract-timeout', '0']

    check_ocrmypdf(infile, outpdf, *tess4_args, env=ensure_tess4)

    first_page = pageinfo.pdf_get_all_pageinfo(str(outpdf))[0]
    assert len(first_page['images']) > 1, "masked were rasterized"
|
||||
Reference in New Issue
Block a user