diff --git a/.travis.yml b/.travis.yml index 71509931..2ba7ba3d 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,9 +1,9 @@ dist: trusty language: python cache: + ccache: true pip: true directories: - - packages - $HOME/Library/Caches/Homebrew env: @@ -50,7 +50,7 @@ before_install: | install: - pip3 install ".$EXTRAS" -- pip3 install -r requirements.txt -r test_requirements.txt +- pip3 install -r test_requirements.txt script: - tesseract --version diff --git a/.travis/linux_before_install.sh b/.travis/linux_before_install.sh index 18730bd4..a4ff1622 100644 --- a/.travis/linux_before_install.sh +++ b/.travis/linux_before_install.sh @@ -8,7 +8,6 @@ sudo add-apt-repository ppa:heyarje/libav-11 -y sudo apt-get update -qq sudo apt-get install -y \ ghostscript \ - qpdf \ poppler-utils \ libavformat56 \ libavcodec56 \ @@ -27,5 +26,17 @@ sudo apt-get install -y --no-install-recommends \ pip install --upgrade pip mkdir -p packages -[ -f packages/unpaper_6.1-1.deb ] || wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O packages/unpaper_6.1-1.deb +wget -q 'https://www.dropbox.com/s/vaq0kbwi6e6au80/unpaper_6.1-1.deb?raw=1' -O packages/unpaper_6.1-1.deb sudo dpkg -i packages/unpaper_6.1-1.deb + +if [ ! -f /usr/local/bin/qpdf ]; then + export QPDF_RELEASE='https://github.com/qpdf/qpdf/releases/download/release-qpdf-8.0.2/qpdf-8.0.2.tar.gz' + mkdir qpdf + wget -q $QPDF_RELEASE -O - | tar xz -C qpdf --strip-components=1 + cd qpdf/ + export PATH="/usr/local/opt/ccache/libexec:$PATH" + ./configure --prefix=/usr + make -j 2 + sudo make install + cd .. +fi diff --git a/docs/advanced.rst b/docs/advanced.rst index 349a953e..5f592079 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -136,4 +136,4 @@ The ``tesseract`` renderer creates a PDF with the image and text layers precompo If a PDF created with this renderer using Tesseract versions older than 3.05.00 is then passed through Ghostscript's pdfwrite feature, the OCR text *may* be corrupted. 
The ``--output-type=pdfa`` argument will produce a warning in this situation. -*This renderer is deprecated and will be removed whenever support for older versions of Tesseract is dropped.* +*This renderer is deprecated and will be removed whenever support for older versions of Tesseract is dropped.* \ No newline at end of file diff --git a/docs/cookbook.rst b/docs/cookbook.rst index 8d8cffd3..a3c4dbef 100644 --- a/docs/cookbook.rst +++ b/docs/cookbook.rst @@ -55,6 +55,8 @@ OCR will attempt to automatic correct the rotation of each page. This can help f You can increase (decrease) the parameter ``--rotate-pages-threshold`` to make page rotation more (less) aggressive. +If the page is "just a little off horizontal", like a crooked picture, then you want ``--deskew``. ``--rotate-pages`` is for when the cardinal angle is wrong. + OCR languages other than English """""""""""""""""""""""""""""""" @@ -81,15 +83,28 @@ This produces a file named "output.pdf" and a companion text file named "output. OCR images, not PDFs -------------------- -Use a program like `img2pdf `_ to convert your images to PDFs, and then pipe the results to run ocrmypdf: +If you are starting with images, you can just use Tesseract 3.04 or later directly to convert images to PDFs: + +.. code-block:: bash + + tesseract my-image.jpg output-prefix pdf + +.. code-block:: bash + + # When there are multiple images + tesseract text-file-containing-list-of-image-filenames.txt output-prefix pdf + +Tesseract's PDF output is quite good – OCRmyPDF uses it internally by default. However, OCRmyPDF has many features not available in Tesseract like image processing, metadata control, and PDF/A generation. + +Use a program like `img2pdf `_ to convert your images to PDFs, and then pipe the results to run ocrmypdf. The ``-`` tells ocrmypdf to read standard input. .. 
code-block:: bash img2pdf my-images*.jpg | ocrmypdf - myfile.pdf -``img2pdf`` also has features to control the position of images on a page, if desired. +``img2pdf`` is recommended because it does an excellent job at generating PDFs without transcoding images. -For convenience, OCRmyPDF can convert single images to PDFs on its own. If the resolution (dots per inch, DPI) of an image is not set or is incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54 cm, 1 dpi = 0.39 dpcm). +For convenience, OCRmyPDF can also convert single images to PDFs on its own. If the resolution (dots per inch, DPI) of an image is not set or is incorrect, it can be overridden with ``--image-dpi``. (As 1 inch is 2.54 cm, 1 dpi = 0.39 dpcm). .. code-block:: bash @@ -101,11 +116,6 @@ If you have multiple images, you must use ``img2pdf`` to convert the images to P ImageMagick ``convert`` can also convert a group of images to PDF, but in the author's experience it takes a long time, transcodes unnecessarily and gives poor results. -You can also use Tesseract 3.04+ directly to convert single page images or multi-page TIFFs to PDF: - -.. code-block:: bash - - tesseract my-image.jpg output-prefix pdf Image processing ---------------- diff --git a/docs/installation.rst b/docs/installation.rst index 0443acae..58da52e3 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -1,12 +1,9 @@ Installation ============ -OCRmyPDF requires Python 3.5 (or newer) and Tesseract 3.04 (or newer). - -Python 3.6 and Tesseract 4.x are recommended for best OCR results and best performance. - -OCRmyPDF 6.x adds a dependency on PyMuPDF ("fitz"). This library is not widely available in platform distributions, and it improves OCRmyPDF in certain conditions. Consider installing OCRmyPDF from the Python binary wheels, which include a precompiled version of this library. +The easiest way to install OCRmyPDF is to follow the steps for your operating system/platform. 
+If you want to use the latest version of OCRmyPDF, your best bet is to install the most recent version your platform provides, and then upgrade that version by installing the Python binary wheels. .. contents:: Platform-specific steps :depth: 1 @@ -345,11 +342,56 @@ where /c/Users/sampleuser is a Unix representation of the Windows path C:\\Users `Bash on Ubuntu on Windows `_ should also be a viable route for running the OCRmyPDF Docker container. + +Installing with Python pip +-------------------------- + +First, install `your platform's version `_ of ``ocrmypdf``, if available, as a way of ensuring that external dependencies are (mostly) satisfied, even though the platform version may be out of date. Use ``ocrmypdf --version`` to confirm what version was installed. + +Then you can install the latest OCRmyPDF from the Python wheels. First try: + +.. code-block:: bash + + pip3 install --user ocrmypdf + +You should then be able to run ``ocrmypdf --version`` and see that the latest version was located. + +Since ``pip3 install --user`` does not work correctly on some platforms, notably Ubuntu 16.04 and older, and the Homebrew version of Python, instead use this for a system-wide installation: + +.. code-block:: bash + + pip3 install ocrmypdf + +Requirements for pip and HEAD install +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +OCRmyPDF currently requires these external programs to be installed: + +- Python 3.5 or newer +- Tesseract 3.04 or newer +- Ghostscript 9.15 or newer +- qpdf 7.0.0 or newer + +The following dependencies are recommended: + +- Python 3.6 +- Tesseract 4.00 or newer +- Ghostscript 9.22 or newer +- qpdf 8.0.2 or newer +- unpaper 6.1 +- PyMuPDF 1.12.5 or newer + +These are in addition to the Python packaging dependencies, meaning that unfortunately, the ``pip install`` command cannot satisfy all of them. + +Python 3.6 and Tesseract 4.x are recommended for best OCR results and best performance. 
+ +The library PyMuPDF is not widely available in platform distributions, and it improves OCRmyPDF in certain conditions. Consider installing OCRmyPDF from the Python binary wheels, which include a precompiled version of this library. + + Installing HEAD revision from sources ------------------------------------- -If you have ``git`` and Python 3.5 or newer installed, you can install from source. When the ``pip`` installer runs, -it will alert you if dependencies are missing. +If you have ``git`` and Python 3.5 or newer installed, you can install from source. When the ``pip`` installer runs, it will alert you if dependencies are missing. To install the HEAD revision from sources in the current Python 3 environment: diff --git a/docs/introduction.rst b/docs/introduction.rst index a96ae7cd..f92bf935 100644 --- a/docs/introduction.rst +++ b/docs/introduction.rst @@ -79,7 +79,8 @@ OCRmyPDF is limited by the Tesseract OCR engine. As such it experiences these l * It is not always good at analyzing the natural reading order of documents. For example, it may fail to recognize that a document contains two columns and join text across the columns. * Poor quality scans may produce poor quality OCR. Garbage in, garbage out. * PDFs that use transparent layers are not currently checked in the test suite, so they may not work correctly. - +* It does not expose information about what font family text belongs to. + OCRmyPDF is also limited by the PDF specification: * PDF encodes the position of text glyphs but does not encode document structure. There is no markup that divides a document in sections, paragraphs, sentences, or even words (since blank spaces are not represented). As such all elements of document structure including the spaces between words must be derived heuristically. Some PDF viewers do a better job of this than others. 
diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 017fae17..f9ab4676 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -5,16 +5,43 @@ OCRmyPDF uses `semantic versioning `_ for its command line i The OCRmyPDF package itself does not contain a public API, although it is fairly stable and breaking changes are usually timed with a major release. A future release will clearly define the stable public API. +.. Issue regex + find: [^`]\#([0-9]{1,3})[^0-9] + replace: `#$1 `_ + +v6.1.4 +------ + +- Fix issue #248, ``--clean`` argument may remove OCR from left column of text on certain documents. We now set ``--layout none`` to suppress this. + +- The test cache was updated to reflect the change above. + +- Change test suite to accommodate Ghostscript 9.23's new ability to insert JPEGs into PDFs without transcoding. + +- XMP metadata in PDFs is now examined using ``defusedxml`` for safety. + +- If an external process exits with a signal when asked to report its version, we now print the system error message instead of suppressing it. This occurred when the required executable was found but was missing a shared library. + +- qpdf 7.0.0 or newer is now required as the test suite can no longer pass without it. + +Notes +~~~~~ + +- An apparent `regression in Ghostscript 9.23 `_ will cause some ocrmypdf output files to become invalid in rare cases; the workaround for the moment is to set ``--force-ocr``. + + v6.1.3 ------ -- Fix issue #247, ``/CreationDate`` metadata not copied from input to output. +- Fix issue `#247 `_, ``/CreationDate`` metadata not copied from input to output. + +- A warning is now issued when Python 3.5 is used on files with a large page count, as this case is known to regress to single core performance. The cause of this problem is unknown. v6.1.2 ------ -- Upgrade to PyMuPDF v1.12.5 which includes a more complete fix to #239. +- Upgrade to PyMuPDF v1.12.5 which includes a more complete fix to `#239 `_. 
- Add ``defusedxml`` dependency. @@ -30,11 +57,11 @@ v6.1.0 - PyMuPDF is now an optional but recommended dependency, to alleviate installation difficulties on platforms that have less access to PyMuPDF than the author anticipated. Install OCRmyPDF with ``pip install ocrmypdf[fitz]`` to use it to its full potential. -- Fix ``FileExistsError`` that could occur if OCR timed out while it was generating the output file. (#218) +- Fix ``FileExistsError`` that could occur if OCR timed out while it was generating the output file. (`#218 `_) - Fix table of contents/bookmarks all being redirected to page 1 when generating a PDF/A (with PyMuPDF). (Without PyMuPDF the table of contents is removed in PDF/A mode.) -- Fix "RuntimeError: invalid key in dict" when table of contents/bookmarks titles contained the character ``)``. (#239) +- Fix "RuntimeError: invalid key in dict" when table of contents/bookmarks titles contained the character ``)``. (`#239 `_) - Added a new argument ``--skip-repair`` to skip the initial PDF repair step if the PDF is already well-formed (because another program repaired it). @@ -58,21 +85,21 @@ v6.0.0 + The ``--pdf-renderer tess4`` alias for ``sandwich`` was removed. -- Fixed an issue where OCRmyPDF failed to detect existing text on pages, depending on how the text and fonts were encoded within the PDF. (#233, #232) +- Fixed an issue where OCRmyPDF failed to detect existing text on pages, depending on how the text and fonts were encoded within the PDF. (`#233 `_, `#232 `_) -- Fixed an issue that caused dramatic inflation of file sizes when ``--skip-text --output-type pdf`` was used. OCRmyPDF now removes duplicate resources such as fonts, images and other objects that it generates. (#237) +- Fixed an issue that caused dramatic inflation of file sizes when ``--skip-text --output-type pdf`` was used. OCRmyPDF now removes duplicate resources such as fonts, images and other objects that it generates. 
(`#237 `_) -- Improved performance of the initial page splitting step. Originally this step was not believed to be expensive and ran in a process. Large file testing revealed it to be a bottleneck, so it is now parallelized. On a 700 page file with quad core machine, this change saves about 2 minutes. (#234) +- Improved performance of the initial page splitting step. Originally this step was not believed to be expensive and ran in a process. Large file testing revealed it to be a bottleneck, so it is now parallelized. On a 700 page file with quad core machine, this change saves about 2 minutes. (`#234 `_) -- The test suite now includes a cache that can be used to speed up test runs across platforms. This also does not require computing checksums, so it's faster. (#217) +- The test suite now includes a cache that can be used to speed up test runs across platforms. This also does not require computing checksums, so it's faster. (`#217 `_) v5.7.0 ------ -- Fixed an issue that caused poor CPU utilization on machines more than 4 cores when running Tesseract 4. (Related to issue #217.) +- Fixed an issue that caused poor CPU utilization on machines more than 4 cores when running Tesseract 4. (Related to issue `#217 `_.) -- The 'hocr' renderer has been improved. The 'sandwich' and 'tesseract' renderers are still better for most use cases, but 'hocr' may be useful for people who work with the PDF.js renderer in English/ASCII languages. (#225) +- The 'hocr' renderer has been improved. The 'sandwich' and 'tesseract' renderers are still better for most use cases, but 'hocr' may be useful for people who work with the PDF.js renderer in English/ASCII languages. (`#225 `_) + It now formats text in a matter that is easier for certain PDF viewers to select and extract copy and paste text. This should help macOS Preview and PDF.js in particular. + The appearance of selected text and behavior of selecting text is improved. 
@@ -95,7 +122,7 @@ v5.6.2 v5.6.1 ------ -- Fix issue #219: change how the final output file is created to avoid triggering permission errors when the output is a special file such as ``/dev/null`` +- Fix issue `#219 `_: change how the final output file is created to avoid triggering permission errors when the output is a special file such as ``/dev/null`` - Fix test suite failures due to a qpdf 8.0.0 regression and Python 3.5's handling of symlink - The "encrypted PDF" error message was different depending on the type of PDF encryption. Now a single clear message appears for all types of PDF encryption. - ocrmypdf is now in Homebrew. Homebrew users are advised to the version of ocrmypdf in the official homebrew-core formulas rather than the private tap. @@ -105,7 +132,7 @@ v5.6.1 v5.6.0 ------ -- Fix issue #216: preserve "text as curves" PDFs without rasterizing file +- Fix issue `#216 `_: preserve "text as curves" PDFs without rasterizing file - Related to the above, messages about rasterizing are more consistent - For consistency versions minor releases will now get the trailing .0 they always should have had. @@ -122,11 +149,11 @@ v5.5 v5.4.4 ------ -- Fix issue #181: fix final merge failure for PDFs with more pages than the system file handle limit (``ulimit -n``) -- Fix issue #200: an uncommon syntax for formatting decimal numbers in a PDF would cause qpdf to issue a warning, which ocrmypdf treated as an error. Now this the warning is relayed. +- Fix issue `#181 `_: fix final merge failure for PDFs with more pages than the system file handle limit (``ulimit -n``) +- Fix issue `#200 `_: an uncommon syntax for formatting decimal numbers in a PDF would cause qpdf to issue a warning, which ocrmypdf treated as an error. Now the warning is relayed. - Fix an issue where intermediate PDFs would be created at version 1.3 instead of the version of the original file. It's possible but unlikely this had side effects. 
-- A warning is now issued when older versions of qpdf are used since issues like #200 cause qpdf to infinite-loop -- Address issue #140: if Tesseract outputs invalid UTF-8, escape it and print its message instead of aborting with a Unicode error +- A warning is now issued when older versions of qpdf are used since issues like `#200 `_ cause qpdf to infinite-loop +- Address issue `#140 `_: if Tesseract outputs invalid UTF-8, escape it and print its message instead of aborting with a Unicode error - Adding previously unlisted setup requirement, pytest-runner - Update documentation: fix an error in the example script for Synology with Docker images, improved security guidance, advised ``pip install --user`` @@ -184,9 +211,9 @@ v5.3.1 v5.3 ---- -- Added ``--user-words`` and ``--user-patterns`` arguments which are forwarded to Tesseract OCR as words and regular expressions respective to use to guide OCR. Supplying a list of subject-domain words should assist Tesseract with resolving words. (#165) -- Using a non Latin-1 language with the "hocr" renderer now warns about possible OCR quality and recommends workarounds (#176) -- Output file path added to error message when that location is not writable (#175) +- Added ``--user-words`` and ``--user-patterns`` arguments which are forwarded to Tesseract OCR as words and regular expressions respective to use to guide OCR. Supplying a list of subject-domain words should assist Tesseract with resolving words. 
(`#165 `_) +- Using a non Latin-1 language with the "hocr" renderer now warns about possible OCR quality and recommends workarounds (`#176 `_) +- Output file path added to error message when that location is not writable (`#175 `_) - Otherwise valid PDFs with leading whitespace at the beginning of the file are now accepted @@ -207,7 +234,7 @@ v5.1 v5.0.1 ------ -- Fixed issue #169, exception due to failure to create sidecar text files on some versions of Tesseract 3.04, including the jbarlow83/ocrmypdf Docker image +- Fixed issue `#169 `_, exception due to failure to create sidecar text files on some versions of Tesseract 3.04, including the jbarlow83/ocrmypdf Docker image v5.0 @@ -219,33 +246,33 @@ v5.0 + Support for Tesseract 3.02 and 3.03 dropped. Tesseract 3.04 or newer is required. Tesseract 4.00 (alpha) is supported. + The OCRmyPDF.sh script was removed. -- Add a new feature, ``--sidecar``, which allows creating "sidecar" text files which contain the OCR results in plain text. These OCR text is more reliable than extracting text from PDFs. Closes #126. -- New feature: ``--pdfa-image-compression``, which allows overriding Ghostscript's lossy-or-lossless image encoding heuristic and making all images JPEG encoded or lossless encoded as desired. Fixes #163. -- Fixed issue #143, added ``--quiet`` to suppress "INFO" messages -- Fixed issue #164, a typo +- Add a new feature, ``--sidecar``, which allows creating "sidecar" text files which contain the OCR results in plain text. This OCR text is more reliable than extracting text from PDFs. Closes `#126 `_. +- New feature: ``--pdfa-image-compression``, which allows overriding Ghostscript's lossy-or-lossless image encoding heuristic and making all images JPEG encoded or lossless encoded as desired. Fixes `#163 `_. 
+- Fixed issue `#143 `_, added ``--quiet`` to suppress "INFO" messages +- Fixed issue `#164 `_, a typo - Removed the command line parameters ``-n`` and ``--just-print`` since they have not worked for some time (reported as Ubuntu bug `#1687308 `_) v4.5.6 ------ -- Fixed issue #156, 'NoneType' object has no attribute 'getObject' on pages with no optional /Contents record. This should resolve all issues related to pages with no /Contents record. -- Fixed issue #158, ocrmypdf now stops and terminates if Ghostscript fails on an intermediate step, as it is not possible to proceed. -- Fixed issue #160, exception thrown on certain invalid arguments instead of error message +- Fixed issue `#156 `_, 'NoneType' object has no attribute 'getObject' on pages with no optional /Contents record. This should resolve all issues related to pages with no /Contents record. +- Fixed issue `#158 `_, ocrmypdf now stops and terminates if Ghostscript fails on an intermediate step, as it is not possible to proceed. +- Fixed issue `#160 `_, exception thrown on certain invalid arguments instead of error message v4.5.5 ------ - Automated update of macOS homebrew tap -- Fixed issue #154, KeyError '/Contents' when searching for text on blank pages that have no /Contents record. Note: incomplete fix for this issue. +- Fixed issue `#154 `_, KeyError '/Contents' when searching for text on blank pages that have no /Contents record. Note: incomplete fix for this issue. v4.5.4 ------ -- Fix ``--skip-big`` raising an exception if a page contains no images (#152) (thanks to @TomRaz) -- Fix an issue where pages with no images might trigger "cannot write mode P as JPEG" (#151) +- Fix ``--skip-big`` raising an exception if a page contains no images (`#152 `_) (thanks to @TomRaz) +- Fix an issue where pages with no images might trigger "cannot write mode P as JPEG" (`#151 `_) v4.5.3 @@ -260,7 +287,7 @@ v4.5.3 v4.5.2 ------ -- Fix issue #147. 
``--pdf-renderer tess4 --clean`` will produce an oversized page containing the original image in the bottom left corner, due to loss DPI information. +- Fix issue `#147 `_. ``--pdf-renderer tess4 --clean`` will produce an oversized page containing the original image in the bottom left corner, due to loss DPI information. - Make "using Tesseract 4.0" warning less ominous - Set up machinery for homebrew OCRmyPDF tap @@ -268,13 +295,13 @@ v4.5.2 v4.5.1 ------ -- Fix issue #137, proportions of images with a non-square pixel aspect ratio would be distorted in output for ``--force-ocr`` and some other combinations of flags +- Fix issue `#137 `_, proportions of images with a non-square pixel aspect ratio would be distorted in output for ``--force-ocr`` and some other combinations of flags v4.5 ---- -- Exotic PDFs containing "Form XObjects" are now supported (issue #134; PDF reference manual 8.10), and images they contain are taken into account when determining the resolution for rasterizing +- Exotic PDFs containing "Form XObjects" are now supported (issue `#134 `_; PDF reference manual 8.10), and images they contain are taken into account when determining the resolution for rasterizing - The Tesseract 4 Docker image no longer includes all languages, because it took so long to build something would tend to fail - OCRmyPDF now warns about using ``--pdf-renderer tesseract`` with Tesseract 3.04 or lower due to issues with Ghostscript corrupting the OCR text in these cases @@ -367,14 +394,14 @@ v4.3 v4.2.5 ------ -- Fixed an issue (#100) with PDFs that omit the optional /BitsPerComponent parameter on images +- Fixed an issue (`#100 `_) with PDFs that omit the optional /BitsPerComponent parameter on images - Removed non-free file milk.pdf v4.2.4 ------ -- Fixed an error (#90) caused by PDFs that use stencil masks properly +- Fixed an error (`#90 `_) caused by PDFs that use stencil masks properly - Fixed handling of PDFs that try to draw images or stencil masks without properly 
setting up the graphics state (such images are now ignored for the purposes of calculating DPI) v4.2.3 @@ -407,7 +434,7 @@ v4.2.1 v4.2 ---- -- ocrmypdf will now try to convert single image files to PDFs if they are provided as input (#15) +- ocrmypdf will now try to convert single image files to PDFs if they are provided as input (`#15 `_) + This is a basic convenience feature. It only supports a single image and always makes the image fill the whole page. + For better control over image to PDF conversion, use ``img2pdf`` (one of ocrmypdf's dependencies) @@ -423,9 +450,9 @@ v4.2 + supports users who want to use OCRmyPDF to reconstruct text information in PDFs with damaged Unicode maps (copy and paste text does not match displayed text) + supports reinterpreting PDFs where text was rendered as curves for printing, and text needs to be recovered - + fixes issue #82 + + fixes issue `#82 `_ -- Fixes an issue where, with certain settings, monochrome images in PDFs would be converted to 8-bit grayscale, increasing file size (#79) +- Fixes an issue where, with certain settings, monochrome images in PDFs would be converted to 8-bit grayscale, increasing file size (`#79 `_) - Support for Ubuntu 12.04 LTS "precise" has been dropped in favor of (roughly) Ubuntu 14.04 LTS "trusty" + Some Ubuntu "PPAs" (backports) are needed to make it work @@ -541,7 +568,7 @@ New features - Automatic page rotation (``-r``) is now available. It uses ignores any prior rotation information on PDFs and sets rotation based on the dominant orientation of detectable text. This feature is - fairly reliable but some false positives occur especially if there is not much text to work with. (#4) + fairly reliable but some false positives occur especially if there is not much text to work with. (`#4 `_) - Deskewing is now performed using Leptonica instead of unpaper. Leptonica is faster and more reliable at image deskewing than unpaper. 
@@ -552,14 +579,14 @@ Fixes - Fixed an issue where lossless reconstruction could cause some pages to be appear incorrectly if the page was rotated by the user in Acrobat after being scanned (specifically if it a /Rotate tag) - Fixed an issue where lossless reconstruction could misalign the graphics layer with respect to - text layer if the page had been cropped such that its origin is not (0, 0) (#49) + text layer if the page had been cropped such that its origin is not (0, 0) (`#49 `_) Changes ^^^^^^^ - Logging output is now much easier to read -- ``--deskew`` is now performed by Leptonica instead of unpaper (#25) +- ``--deskew`` is now performed by Leptonica instead of unpaper (`#25 `_) - libffi is now required - Some changes were made to the Docker and Travis build environments to support libffi - ``--pdf-renderer=tesseract`` now displays a warning if the Tesseract version is less than 3.04.01, @@ -574,7 +601,7 @@ v3.2.1 Changes ^^^^^^^ -- Fixed issue #47 "convert() got and unexpected keyword argument 'dpi'" by upgrading to img2pdf 0.2 +- Fixed issue `#47 `_ "convert() got and unexpected keyword argument 'dpi'" by upgrading to img2pdf 0.2 - Tweaked the Dockerfiles @@ -618,7 +645,7 @@ Changes - Python 3.5 and macOS El Capitan are now supported platforms - no changes were needed to implement support - Improved some error messages related to missing input files -- Fixed issue #20 - uppercase .PDF extension not accepted +- Fixed issue `#20 `_ - uppercase .PDF extension not accepted - Fixed an issue where OCRmyPDF failed to text that certain pages contained previously OCR'ed text, such as OCR text produced by Tesseract 3.04 - Inserts /Creator tag into PDFs so that errors can be traced back to this project @@ -695,14 +722,14 @@ Release candidates - rc9: - - fix issue #118: report error if ghostscript iccprofiles are missing - - fixed another issue related to #111: PDF rasterized to palette file + - fix issue `#118 `_: report error if ghostscript iccprofiles are 
missing + - fixed another issue related to `#111 `_: PDF rasterized to palette file - add support image files with a palette - don't try to validate PDF file after an exception occurs - rc8: - - fix issue #111: exception thrown if PDF is missing DocumentInfo dictionary + - fix issue `#111 `_: exception thrown if PDF is missing DocumentInfo dictionary - rc7: diff --git a/requirements.txt b/requirements.txt index c39fc2a3..d7dec8b0 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,9 +2,9 @@ # setup.py lists a separate set of requirements that are looser to simplify # installation ruffus == 2.6.3 -Pillow == 5.0.0 +Pillow == 5.1.0 reportlab == 3.4.0 PyPDF2 == 1.26.0 img2pdf == 0.2.4 cffi == 1.11.5 -PyMuPDF == 1.12.4 +PyMuPDF == 1.12.5 \ No newline at end of file diff --git a/setup.py b/setup.py index cdc1848e..5420aa6e 100644 --- a/setup.py +++ b/setup.py @@ -186,7 +186,7 @@ if not forced and command.startswith('install') or \ ) check_external_program( program='qpdf', - need_version='5.1.1', # limited by Travis CI / Ubuntu 14.04 backports + need_version='7.0.0', # test suite known to fail on 5.1.1 package='qpdf', version_check_args=['--version'] ) @@ -243,7 +243,9 @@ setup( 'cffi >= 1.9.1', # must be a setup and install requirement 'defusedxml >= 0.5.0', # pure Python, so track HEAD closely 'img2pdf >= 0.2.4', # pure Python, so track HEAD closely - 'Pillow >= 4.0.0', # Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3 + 'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"', + # Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3 + # block 5.1.0, broken wheels 'PyPDF2 >= 1.26', # pure Python, so track HEAD closely 'reportlab >= 3.3.0', # oldest released version with sane image handling 'ruffus == 2.6.3', # pinned - ocrmypdf implements a 2.6.3 workaround diff --git a/src/ocrmypdf/exec/__init__.py b/src/ocrmypdf/exec/__init__.py index 85e09ebe..f670c726 100644 --- a/src/ocrmypdf/exec/__init__.py +++ b/src/ocrmypdf/exec/__init__.py @@ -24,6 +24,7 @@ from subprocess 
import run, STDOUT, PIPE, CalledProcessError from ..exceptions import MissingDependencyError + def get_version(program, *, version_arg='--version', regex=r'(\d+(\.\d+)*)'): "Get the version of the specified program" @@ -37,6 +38,10 @@ def get_version(program, *, stdout=PIPE, stderr=STDOUT, check=True) output = proc.stdout except CalledProcessError as e: + if e.returncode < 0: + raise MissingDependencyError( + "Ran program '{}' but it exited with an error:\n{}".format( + program, e.output)) from e raise MissingDependencyError( "Could not find program '{}' on the PATH".format( program)) from e diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py index 783a8478..7292034f 100644 --- a/src/ocrmypdf/exec/ghostscript.py +++ b/src/ocrmypdf/exec/ghostscript.py @@ -127,6 +127,11 @@ def generate_pdfa(pdf_pages, output_file, compression, log, "-dAutoFilterGrayImages=true", ] + # Older versions of Ghostscript expect a leading slash in + # sColorConversionStrategy, newer ones should not have it. See Ghostscript + # git commit fe1c025d. 
+ strategy = 'RGB' if version() >= '9.19' else '/RGB' + with NamedTemporaryFile(delete=True) as gs_pdf: args_gs = [ "gs", @@ -137,7 +142,7 @@ def generate_pdfa(pdf_pages, output_file, compression, log, "-dNumRenderingThreads=" + str(threads), "-sDEVICE=pdfwrite", "-dAutoRotatePages=/None", - "-sColorConversionStrategy=/RGB", + "-sColorConversionStrategy=" + strategy, "-sProcessColorModel=DeviceRGB" ] + compression_args + [ "-dJPEGQ=95", diff --git a/src/ocrmypdf/exec/unpaper.py b/src/ocrmypdf/exec/unpaper.py index 2afea60b..e41ee038 100644 --- a/src/ocrmypdf/exec/unpaper.py +++ b/src/ocrmypdf/exec/unpaper.py @@ -57,15 +57,15 @@ def run(input_file, output_file, dpi, log, mode_args): else: im = im.convert(mode='RGB') except IOError as e: - log.error( - "Could not convert image with type " + im.mode) + log.error("Could not convert image with type " + im.mode) + im.close() raise MissingDependencyError() from e try: suffix = SUFFIXES[im.mode] except KeyError: - log.error( - "Failed to convert image to a supported format.") + log.error("Failed to convert image to a supported format.") + im.close() raise MissingDependencyError() from e with NamedTemporaryFile(suffix=suffix) as input_pnm, \ @@ -90,20 +90,9 @@ def run(input_file, output_file, dpi, log, mode_args): Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi)) -def deskew(input_file, output_file, dpi, log): - run(input_file, output_file, dpi, log, [ - '--mask-scan-size', '100', # don't blank out narrow columns - '--no-border-align', # don't align visible content to borders - '--no-mask-center', # don't center visible content within page - '--no-grayfilter', # don't remove light gray areas - '--no-blackfilter', # don't remove solid black areas - '--no-noisefilter', # don't remove salt and pepper noise - '--no-blurfilter' # don't remove blurry objects/debris - ]) - - def clean(input_file, output_file, dpi, log): run(input_file, output_file, dpi, log, [ + '--layout', 'none', '--mask-scan-size', '100', # don't 
blank out narrow columns '--no-border-align', # don't align visible content to borders '--no-mask-center', # don't center visible content within page diff --git a/src/ocrmypdf/pdfa.py b/src/ocrmypdf/pdfa.py index e4947bfa..ffd0ff77 100644 --- a/src/ocrmypdf/pdfa.py +++ b/src/ocrmypdf/pdfa.py @@ -23,6 +23,8 @@ from datetime import datetime from xml.parsers.expat import ExpatError import pkg_resources import PyPDF2 as pypdf +from defusedxml.minidom import parseString as defused_parseString +from unittest.mock import patch ICC_PROFILE_RELPATH = 'data/sRGB.icc' @@ -221,7 +223,9 @@ def file_claims_pdfa(filename): """ pdf = pypdf.PdfFileReader(filename) try: - xmp = pdf.getXmpMetadata() + # Monkeypatch PyPDF2 to use defusedxml as its XML parser, for safety + with patch('xml.dom.minidom.parseString', new=defused_parseString): + xmp = pdf.getXmpMetadata() except ExpatError: return {'pass': False, 'output': 'pdf', 'conformance': 'Invalid XML metadata'} diff --git a/tests/cache/2400dpi/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/2400dpi/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 2f445903..ba61422d 100644 Binary files a/tests/cache/2400dpi/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/2400dpi/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/aspect/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/aspect/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index dae550f5..0b7aa523 100644 Binary files a/tests/cache/aspect/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/aspect/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/aspect/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin 
b/tests/cache/aspect/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index 76ed3f5b..280c9b7e 100644 Binary files a/tests/cache/aspect/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/aspect/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin b/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin index c92a2c19..6f6cc21d 100644 Binary files a/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin and b/tests/cache/aspect/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin differ diff --git a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 04aa6280..12582148 100644 Binary files a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin index 04aa6280..12582148 100644 Binary files a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin and b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin index 04aa6280..12582148 100644 Binary files a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin and 
b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin index 04aa6280..12582148 100644 Binary files a/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin and b/tests/cache/cardinal/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/ccitt/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/ccitt/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 62112e95..e41b5d6d 100644 Binary files a/tests/cache/ccitt/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/ccitt/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/ccitt/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/ccitt/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index d83c0c08..e72bbaeb 100644 Binary files a/tests/cache/ccitt/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/ccitt/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/francais/__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/francais/__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index e3d225ae..6075e6e5 100644 Binary files a/tests/cache/francais/__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/francais/__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/graph_ocred/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin 
b/tests/cache/graph_ocred/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index beab72f8..6160b29f 100644 Binary files a/tests/cache/graph_ocred/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/graph_ocred/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/jbig2/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/jbig2/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 2720dd5c..cd20be4f 100644 Binary files a/tests/cache/jbig2/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/jbig2/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/jbig2/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/jbig2/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index f0958dd2..ae7015d4 100644 Binary files a/tests/cache/jbig2/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/jbig2/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/jbig2/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin b/tests/cache/jbig2/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin index cba01b22..9467e200 100644 Binary files a/tests/cache/jbig2/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin and b/tests/cache/jbig2/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin differ diff --git a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index e83723d3..73f70df1 100644 Binary files a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and 
b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/stderr.bin b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/stderr.bin index 68381697..829ca735 100644 Binary files a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/stderr.bin and b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/stderr.bin differ diff --git a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/txt.bin b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/txt.bin index 9816c8ce..5dc39a20 100644 Binary files a/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/txt.bin and b/tests/cache/lichtenstein/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/txt.bin differ diff --git a/tests/cache/lichtenstein/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/lichtenstein/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index 51e7453e..2da94f8b 100644 Binary files a/tests/cache/lichtenstein/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/lichtenstein/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/manifest.json b/tests/cache/manifest.json deleted file mode 100644 index 8e3393a9..00000000 --- a/tests/cache/manifest.json +++ /dev/null @@ -1,47 +0,0 @@ -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9t3fsoql/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9t3fsoql/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/palette.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.mg78jtto/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.mg78jtto/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/jbig2.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.z0iuk012/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.z0iuk012/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/ccitt.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.h2o3yo13/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.h2o3yo13/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/lichtenstein.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.tav12ppr/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.tav12ppr/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/ccitt.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.8w9ao33a/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.8w9ao33a/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/jbig2.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.e1af_65a/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.e1af_65a/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/palette.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ylml9ah3/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ylml9ah3/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/ccitt.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1o424xo0/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1o424xo0/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/lichtenstein.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "osd", "--psm", "0", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000001.ocr.preview.jpg", "stdout"], "argv_slug": 
"__-l__osd__--psm__0__000001.ocr.preview.jpg__stdout", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "osd", "--psm", "0", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000003.ocr.preview.jpg", "stdout"], "argv_slug": "__-l__osd__--psm__0__000003.ocr.preview.jpg__stdout", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "osd", "--psm", "0", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000004.ocr.preview.jpg", "stdout"], "argv_slug": "__-l__osd__--psm__0__000004.ocr.preview.jpg__stdout", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "osd", "--psm", "0", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000002.ocr.preview.jpg", "stdout"], "argv_slug": "__-l__osd__--psm__0__000002.ocr.preview.jpg__stdout", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": 
"3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.z552uoqg/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.z552uoqg/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/skew.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1yo3m6jc/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1yo3m6jc/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/skew.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9b6v5qah/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9b6v5qah/000001", "hocr", "txt"], "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/skew.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000004.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000004.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000003.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000003.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000002.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000002.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.udmc7x8w/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/cardinal.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.caos0t47/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.caos0t47/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/graph_ocred.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000005.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000005.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000004.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000004.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000003.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000003.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000006.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000006.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "--psm", "7", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.qdd9eyjq/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.qdd9eyjq/000001", "hocr", "txt"], "argv_slug": "__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/skew.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.1pk_2ejc/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.bp6c363s/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.bp6c363s/000001", "hocr", "txt"], "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/aspect.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.chi5902h/000001.ocr.png", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.chi5902h/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/aspect.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "--psm", "7", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.5nrsbqn2/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.5nrsbqn2/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/skew.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9ftp2zvl/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.9ftp2zvl/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/aspect.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "fra", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eqhklb0y/000001.ocr.png", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eqhklb0y/000001.text", "pdf", "txt"], "argv_slug": "__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/francais.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.832q8ia9/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.832q8ia9/000001", "hocr", "txt"], "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/jbig2.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000005.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000005", "hocr", "txt"], "argv_slug": "__-l__eng__000005.ocr.png__000005__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000004.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000004", "hocr", "txt"], "argv_slug": 
"__-l__eng__000004.ocr.png__000004__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000003.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000003", "hocr", "txt"], "argv_slug": "__-l__eng__000003.ocr.png__000003__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000006.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000006", "hocr", "txt"], "argv_slug": "__-l__eng__000006.ocr.png__000006__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ksa5wt_o/000001", "hocr", "txt"], "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 
leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.v21jys8r/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.v21jys8r/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/2400dpi.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000005.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000005.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000005.image__000005.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000004.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000004.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000004.image__000004.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 
Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000003.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000003.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000003.image__000003.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000006.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000006.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000006.image__000006.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000001.image", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.eersguc3/000001.rendered", "pdf", "txt"], "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", 
"/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.8js59f0_/000002.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.8js59f0_/000002.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/multipage.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "eng", "-c", "textonly_pdf=1", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.fbs5vlu1/000001.ocr.png", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.fbs5vlu1/000001.text", "pdf", "txt"], "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/poster.pdf"} -{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.4.0-x86_64-i386-64bit", "python": "3.6.4", "args": ["-l", "osd", "--psm", "0", "/var/folders/37/78_114p552q16vv6vmgm5kr00000gn/T/com.github.ocrmypdf.ab_bvzic/000001.ocr.preview.jpg", "stdout"], "argv_slug": "__-l__osd__--psm__0__000001.ocr.preview.jpg__stdout", "sourcefile": "/Users/jb/Documents/src/OCRmyPDF-dev/tests/resources/poster.pdf"} diff --git a/tests/cache/manifest.jsonl b/tests/cache/manifest.jsonl new file mode 100644 index 00000000..74db9071 --- /dev/null +++ b/tests/cache/manifest.jsonl @@ -0,0 +1,60 @@ +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", 
"argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": 
"resources/ccitt.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/jbig2.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/palette.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/ccitt.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/lichtenstein.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : 
libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/graph_ocred.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__osd__--psm__0__000001.ocr.preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001.ocr.preview.jpg", "stdout"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__osd__--psm__0__000003.ocr.preview.jpg__stdout", "sourcefile": 
"resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000003.ocr.preview.jpg", "stdout"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__osd__--psm__0__000004.ocr.preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000004.ocr.preview.jpg", "stdout"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__osd__--psm__0__000002.ocr.preview.jpg__stdout", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000002.ocr.preview.jpg", "stdout"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : 
libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002.ocr.png", "$TMPDIR/000002.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002.ocr.png", "$TMPDIR/000002.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", 
"python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": 
"__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002.ocr.png", "$TMPDIR/000002.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": 
"resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002.ocr.png", "$TMPDIR/000002.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "resources/cardinal.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000003.ocr.png__000003__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000004.ocr.png__000004__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 
libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000005.ocr.png__000005__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005.ocr.png", "$TMPDIR/000005", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000006.ocr.png__000006__hocr__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006.ocr.png", "$TMPDIR/000006", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", 
"--psm", "7", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/aspect.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/skew.pdf", "args": ["-l", "eng", "--psm", "7", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000004.image__000004.rendered__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000004.image", "$TMPDIR/000004.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : 
libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000003.image__000003.rendered__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000003.image", "$TMPDIR/000003.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.image__000001.rendered__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000001.image", "$TMPDIR/000001.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/2400dpi.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000005.image__000005.rendered__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000005.image", "$TMPDIR/000005.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000001.ocr.png__000001__hocr__txt", "sourcefile": "resources/jbig2.pdf", 
"args": ["-l", "eng", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001", "hocr", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__000006.image__000006.rendered__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "$TMPDIR/000006.image", "$TMPDIR/000006.rendered", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000003.ocr.png", "$TMPDIR/000003.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000004.ocr.png", "$TMPDIR/000004.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 
leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000005.ocr.png", "$TMPDIR/000005.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000006.ocr.png", "$TMPDIR/000006.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt", "sourcefile": "resources/multipage.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000002.ocr.png", "$TMPDIR/000002.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__fra__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/francais.pdf", "args": ["-l", "fra", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 
Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__osd__--psm__0__000001.ocr.preview.jpg__stdout", "sourcefile": "resources/poster.pdf", "args": ["-l", "osd", "--psm", "0", "$TMPDIR/000001.ocr.preview.jpg", "stdout"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} +{"tesseract_version": "tesseract 4.0.0-beta.1 leptonica-1.75.3 libjpeg 9c : libpng 1.6.34 : libtiff 4.0.9 : zlib 1.2.11 : libopenjp2 2.3.0 Found AVX2 Found AVX Found SSE ", "platform": "Darwin-17.5.0-x86_64-i386-64bit", "python": "3.6.5", "argv_slug": "__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt", "sourcefile": "resources/poster.pdf", "args": ["-l", "eng", "-c", "textonly_pdf=1", "$TMPDIR/000001.ocr.png", "$TMPDIR/000001.text", "pdf", "txt"]} diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index fb5aea3f..b000f494 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin index 77f376df..fd5f8d97 100644 Binary files 
a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000002.ocr.png__000002.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin index 0eea41e6..9257db49 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000003.ocr.png__000003.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin index 922532ec..04a10174 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000004.ocr.png__000004.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/pdf.bin index c441b8f2..c61ee9ca 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/txt.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/txt.bin index 8214d0ee..f4fd0829 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/txt.bin and 
b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000005.ocr.png__000005.text__pdf__txt/txt.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/pdf.bin index 80584cb6..8e143315 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/txt.bin b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/txt.bin index 8214d0ee..bad62d40 100644 Binary files a/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/txt.bin and b/tests/cache/multipage/__-l__eng__-c__textonly_pdf=1__000006.ocr.png__000006.text__pdf__txt/txt.bin differ diff --git a/tests/cache/multipage/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index 29a4f587..fe02a930 100644 Binary files a/tests/cache/multipage/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin index 2efdf93d..45de3bad 100644 Binary files a/tests/cache/multipage/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin and b/tests/cache/multipage/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000003.image__000003.rendered__pdf__txt/pdf.bin 
b/tests/cache/multipage/__-l__eng__000003.image__000003.rendered__pdf__txt/pdf.bin index 06bf409e..505c72ed 100644 Binary files a/tests/cache/multipage/__-l__eng__000003.image__000003.rendered__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__000003.image__000003.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__000003.ocr.png__000003__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000003.ocr.png__000003__hocr__txt/hocr.bin index 4a76ff96..38b67707 100644 Binary files a/tests/cache/multipage/__-l__eng__000003.ocr.png__000003__hocr__txt/hocr.bin and b/tests/cache/multipage/__-l__eng__000003.ocr.png__000003__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000004.image__000004.rendered__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__000004.image__000004.rendered__pdf__txt/pdf.bin index 84e0b575..51feea8c 100644 Binary files a/tests/cache/multipage/__-l__eng__000004.image__000004.rendered__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__000004.image__000004.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__000004.ocr.png__000004__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000004.ocr.png__000004__hocr__txt/hocr.bin index 03ae36d5..c4e74550 100644 Binary files a/tests/cache/multipage/__-l__eng__000004.ocr.png__000004__hocr__txt/hocr.bin and b/tests/cache/multipage/__-l__eng__000004.ocr.png__000004__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/pdf.bin index b9b64b6b..f38644dd 100644 Binary files a/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/txt.bin 
b/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/txt.bin index 8214d0ee..f4fd0829 100644 Binary files a/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/txt.bin and b/tests/cache/multipage/__-l__eng__000005.image__000005.rendered__pdf__txt/txt.bin differ diff --git a/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/hocr.bin b/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/hocr.bin index 77142d82..ed7c176e 100644 Binary files a/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/hocr.bin and b/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/txt.bin b/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/txt.bin index 8214d0ee..f4fd0829 100644 Binary files a/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/txt.bin and b/tests/cache/multipage/__-l__eng__000005.ocr.png__000005__hocr__txt/txt.bin differ diff --git a/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/pdf.bin b/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/pdf.bin index 93d04700..5d4bfd9d 100644 Binary files a/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/pdf.bin and b/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/txt.bin b/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/txt.bin index 8214d0ee..bad62d40 100644 Binary files a/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/txt.bin and b/tests/cache/multipage/__-l__eng__000006.image__000006.rendered__pdf__txt/txt.bin differ diff --git a/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/hocr.bin 
b/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/hocr.bin index 09e0017c..b8d3374c 100644 Binary files a/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/hocr.bin and b/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/hocr.bin differ diff --git a/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/txt.bin b/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/txt.bin index 8214d0ee..bad62d40 100644 Binary files a/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/txt.bin and b/tests/cache/multipage/__-l__eng__000006.ocr.png__000006__hocr__txt/txt.bin differ diff --git a/tests/cache/palette/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/palette/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index a462026b..ceb1c49e 100644 Binary files a/tests/cache/palette/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/palette/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/palette/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/palette/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index 165624b9..1fa10b57 100644 Binary files a/tests/cache/palette/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/palette/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/poster/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/poster/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 4575efbd..103d549f 100644 Binary files a/tests/cache/poster/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/poster/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git 
a/tests/cache/skew/__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/skew/__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt/pdf.bin index be7955a7..0b282672 100644 Binary files a/tests/cache/skew/__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/skew/__-l__eng__--psm__7__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/skew/__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt/hocr.bin b/tests/cache/skew/__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt/hocr.bin index b049f13e..e1ade02a 100644 Binary files a/tests/cache/skew/__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt/hocr.bin and b/tests/cache/skew/__-l__eng__--psm__7__000001.ocr.png__000001__hocr__txt/hocr.bin differ diff --git a/tests/cache/skew/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin b/tests/cache/skew/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin index 9f0188d0..1ba0b3b7 100644 Binary files a/tests/cache/skew/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin and b/tests/cache/skew/__-l__eng__-c__textonly_pdf=1__000001.ocr.png__000001.text__pdf__txt/pdf.bin differ diff --git a/tests/cache/skew/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin b/tests/cache/skew/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin index e93f3d90..a1f0ab40 100644 Binary files a/tests/cache/skew/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin and b/tests/cache/skew/__-l__eng__000001.image__000001.rendered__pdf__txt/pdf.bin differ diff --git a/tests/cache/skew/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin b/tests/cache/skew/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin index 3c5bef0d..e1e7ae6d 100644 Binary files a/tests/cache/skew/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin and b/tests/cache/skew/__-l__eng__000001.ocr.png__000001__hocr__txt/hocr.bin differ 
diff --git a/tests/spoof/tesseract_cache.py b/tests/spoof/tesseract_cache.py index 9aea655f..6e0110fa 100755 --- a/tests/spoof/tesseract_cache.py +++ b/tests/spoof/tesseract_cache.py @@ -29,6 +29,7 @@ import subprocess import argparse import json import platform +import re """Cache output of tesseract to speed up test suite @@ -85,7 +86,8 @@ parser.add_argument('-c', action='append') parser.add_argument('--psm', type=int) parser.add_argument('--oem', type=int) -CACHE_ROOT = Path(__file__).resolve().parent.parent / 'cache' +TESTS_ROOT = Path(__file__).resolve().parent.parent +CACHE_ROOT = TESTS_ROOT / 'cache' def real_tesseract(): tess_args = ['tesseract'] + sys.argv[1:] @@ -178,12 +180,17 @@ def main(): manifest['tesseract_version'] = __version__.replace('\n', ' ') manifest['platform'] = platform.platform() manifest['python'] = platform.python_version() - manifest['args'] = sys.argv[1:] manifest['argv_slug'] = argv_slug - manifest['sourcefile'] = source + manifest['sourcefile'] = str(Path(source).relative_to(TESTS_ROOT)) + def clean_sys_argv(): + for arg in sys.argv[1:]: + yield re.sub(r'.*/com.github.ocrmypdf[^/]+[/](.*)', + r'$TMPDIR/\1', arg) + manifest['args'] = list(clean_sys_argv()) with (Path(CACHE_ROOT) / 'manifest.jsonl').open('a') as f: json.dump(manifest, f) f.write('\n') + f.flush() if __name__ == '__main__': diff --git a/tests/test_main.py b/tests/test_main.py index 34151d20..3d8b03d6 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -607,7 +607,14 @@ def test_closed_streams(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf): def test_masks(spoof_tesseract_noop, resources, outpdf): - check_ocrmypdf(resources / 'masks.pdf', outpdf, env=spoof_tesseract_noop) + p, out, err = run_ocrmypdf( + resources / 'masks.pdf', outpdf, env=spoof_tesseract_noop) + + if ghostscript.version() == '9.23' and \ + p.returncode == ExitCode.invalid_output_pdf: + pytest.xfail('https://bugs.ghostscript.com/show_bug.cgi?id=699216') + + assert p.returncode == 
ExitCode.ok def test_linearized_pdf_and_indirect_object(spoof_tesseract_noop, @@ -898,8 +905,14 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec, if compression == "jpeg": assert pdfimage.enc == Encoding.jpeg - elif compression == 'lossless': - assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000) + else: + if ghostscript.version() >= '9.23': + # Ghostscript 9.23 adds JPEG passthrough, which allows a JPEG to be + # copied without transcoding - so report + if image.endswith('jpg'): + assert pdfimage.enc == Encoding.jpeg + else: + assert pdfimage.enc not in (Encoding.jpeg, Encoding.jpeg2000) if im.mode.startswith('RGB') or im.mode.startswith('BGR'): assert pdfimage.color == Colorspace.rgb, \