Merge pull request #1 from jbarlow83/master

update master
2026-06-11 15:36:11 -04:00 · 2019-06-01 11:09:07 +02:00
parent 79c84eefa3 7e388f59af
commit 40b2ebcb37
14 changed files with 262 additions and 168 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
    rev: stable
    hooks:
    - id: black
-      language_version: python3.6
+      language_version: python3.7
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -490,3 +490,17 @@ To install all of the development and test requirements:
    pip install -r requirements/dev.txt -r requirements/test.txt
 To add JBIG2 encoding, see :ref:`jbig2`.
 Shell completions
 -----------------
 Completions for ``bash`` and ``fish`` are available in the project's
 ``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
 compatible but this has not been confirmed. Package maintainers, please install
 these at the appropriate locations for your system.
 To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
 ``/etc/bash_completion.d/ocrmypdf`` (rename the file).
 To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
 ``~/.config/fish/completions/ocrmypdf.fish``.
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -13,6 +13,21 @@ Note that it is licensed under GPLv3, so scripts that ``import ocrmypdf`` and ar
   find:    [^`]\#([0-9]{1,3})[^0-9]
   replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
 v8.3.0
 ------
 -   Improved the strategy for updating pages when a new image of the page was produced. We now attempt to preserve more content from the original file, for annotations in particular.
 -   For PDFs with more than 100 pages and a sequence where one PDF page was replaced and one or more subsequent ones were skipped, an intermediate file would be corrupted while grafting OCR text, causing processing to fail.
 -   Previously, we resized the images produced by Ghostscript by a small number of pixels to ensure the output image size was exactly what we wanted. Having discovered a way to get Ghostscript to produce the exact image sizes we require, we eliminated the resizing step.
 -   Command line completions for ``bash`` are now available, in addition to ``fish``, both in ``misc/completion``. Package maintainers, please install these so users can take advantage.
 -   Updated requirements.
 -   pikepdf 1.3.0 is now required.
 v8.2.4
 ------
@@ -22,7 +37,7 @@ v8.2.4
 -   Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.
-   pikepdf 1.2.0 is now required
+-   pikepdf 1.2.0 is now required.
 v8.2.3
 ------
--- a/misc/completion/ocrmypdf.bash
+++ b/misc/completion/ocrmypdf.bash
@@ -0,0 +1,87 @@
 # ocrmypdf completion                                     -*- shell-script -*-
 _ocrmypdf()
 {
    # Programmable-completion function for the ``ocrmypdf`` command.
    # It calls the helpers _init_completion, _filedir and _ncpus, which are
    # not defined in this file and must already be available in the shell.
    local cur prev cword words split
    _init_completion -s || return

    # Step 1: complete the argument of the option just before the cursor.
    case $prev in
        --version|-h|--help)
            # Nothing useful can follow these options.
            return
            ;;
        --user-words|--user-patterns|--tesseract-config)
            # These options take a file name.
            _filedir
            return
            ;;
        --output-type)
            COMPREPLY=( $( compgen -W 'pdfa pdf pdfa-1 pdfa-2 pdfa-3' -- \
                "$cur" ) )
            return
            ;;
        --pdf-renderer)
            COMPREPLY=( $( compgen -W 'auto hocr sandwich' -- "$cur" ) )
            return
            ;;
        --pdfa-image-compression)
            COMPREPLY=( $( compgen -W 'auto jpeg lossless' -- "$cur" ) )
            return
            ;;
        -O|--optimize|--tesseract-oem)
            COMPREPLY=( $( compgen -W '{0..3}' -- "$cur" ) )
            return
            ;;
        --jpeg-quality|--png-quality)
            COMPREPLY=( $( compgen -W '{0..100}' -- "$cur" ) )
            return
            ;;
        -l|--language)
            # Ask tesseract for its languages; the ##*: expansion strips
            # everything up to and including the last colon of that output.
            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
            return
            ;;
        --image-dpi|--oversample|--skip-big|--max-image-mpixels|\
        --tesseract-timeout|--rotate-pages-threshold)
            # Free-form numbers: offer the current word extended by one digit.
            COMPREPLY=( $( compgen -P "$cur" -W '{0..9}' ) )
            return
            ;;
        -j|--jobs)
            # Offer 1 up to the value printed by _ncpus.
            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
            return
            ;;
        -v|--verbose)
            COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
            return
            ;;
        --tesseract-pagesegmode)
            COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
            return
            ;;
        --sidecar|--title|--author|--subject|--keywords|--unpaper-args)
            # argument required but no completions available
            return
            ;;
    esac

    # Stop here when $split (declared above, not assigned in this function)
    # is true; presumably _init_completion -s sets it for --opt=value words.
    $split && return

    # Step 2: complete either a long option name or a file name.
    if [[ $cur == -* ]]; then
        COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
            --sidecar --version --jobs --quiet --verbose --title --author
            --subject --keywords --rotate-pages --remove-background --deskew
            --clean --clean-final --unpaper-args --oversample --remove-vectors
            --mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
            --skip-big --jpeg-quality --png-quality --jbig2-lossy
            --max-image-mpixels --tesseract-config --tesseract-pagesegmode
            --help --tesseract-oem --pdf-renderer --tesseract-timeout
            --rotate-pages-threshold --pdfa-image-compression --user-words
            --user-patterns --keep-temporary-files --flowchart' \
            --  "$cur" ) )
        return
    else
        _filedir
        return
    fi
 } &&
 complete -F _ocrmypdf ocrmypdf
 # ex: filetype=sh
--- a/misc/completion/ocrmypdf.fish
+++ b/misc/completion/ocrmypdf.fish
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -5,7 +5,7 @@ chardet == 3.0.4
 cffi == 1.12.2
 img2pdf == 0.3.3
 pdfminer.six == 20181108
-pikepdf == 1.2.0
+pikepdf == 1.3.0
 Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
 pycparser == 2.19
 python-xmp-toolkit == 2.0.1
--- a/setup.py
+++ b/setup.py
@@ -99,7 +99,7 @@ setup(
        'cffi >= 1.9.1',  # must be a setup and install requirement
        'img2pdf >= 0.3.0, < 0.4',  # pure Python, so track HEAD closely
        'pdfminer.six == 20181108 ; sys_platform != "darwin"',
-        'pikepdf >= 1.2.0, < 2',
+        'pikepdf >= 1.3.0, < 2',
        'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
        # Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
        # block 5.1.0, broken wheels
--- a/src/ocrmypdf/_weave.py
+++ b/src/ocrmypdf/_weave.py
@@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 from contextlib import suppress
 from itertools import groupby
 from pathlib import Path
 import os
@@ -188,99 +189,6 @@ def _find_font(text, pdf_base):
        return None, None
 def _traverse_toc(pdf_base, visitor_fn, log):
    """Visit every reachable node of the document outline exactly once.

    Starting at ``pdf_base.root.Outlines``, follow the ``/Parent``,
    ``/First``, ``/Last``, ``/Prev`` and ``/Next`` links of each node.  Nodes
    are tracked by objgen rather than walked hierarchically, so a node that
    is reachable through several links is still processed only once.  When
    ``visitor_fn`` is truthy it is called as ``visitor_fn(pdf_base, node,
    log)`` for each node.

    Returns immediately when the document root has no ``/Outlines`` entry.
    """
    if '/Outlines' not in pdf_base.root:
        return

    neighbor_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')
    seen = set()
    pending = set()
    pending.add(pdf_base.root.Outlines.objgen)

    while pending:
        current = pending.pop()
        seen.add(current)
        node = pdf_base.get_object(current)
        log.debug('fix toc: exploring outline entries at %r', current)

        # Schedule the nodes linked from this one that are still unvisited.
        # Direct (non-indirect) link values are not allowed here; they are
        # ignored rather than removed or repaired.
        for key in neighbor_keys:
            if key in node:
                target = node[key]
                if target.is_indirect:
                    target_objgen = target.objgen
                    if target_objgen not in seen:
                        pending.add(target_objgen)

        if visitor_fn:
            visitor_fn(pdf_base, node, log)
 def _fix_toc(pdf_base, pageref_remap, log):
    """Repair table of contents entries that point at replaced pages.

    Whenever we replace a page wholesale, it gets assigned a new objgen number
    and other references to it within the PDF become invalid, most notably in
    the table of contents (/Outlines in PDF-speak).  ``pageref_remap`` maps an
    old page objgen to the objgen that replaced it.  (objgen is a tuple, and
    the gen is almost always zero.)

    Malformed entries are reported through ``log.warning`` instead of raising,
    so a damaged table of contents does not abort processing.

    :param pdf_base: the PDF whose outline is repaired in place
    :param pageref_remap: mapping of old objgen -> new objgen; when empty or
        None nothing is done
    :param log: logger used for warnings and debug output
    :return: None
    """
    if not pageref_remap:
        return

    invalid_toc_message = "This file may contain invalid table of contents entries"

    def remap_dest(dest_node):
        """
        Inner helper function: change the objgen for any page from the old we
        invalidated to its new one.
        """
        try:
            pageref = dest_node[0]
            if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
                new_objgen = pageref_remap[pageref.objgen]
                dest_node[0] = pdf_base.get_object(new_objgen)
        except (IndexError, KeyError, TypeError) as e:
            # KeyError covers a destination whose first element is a
            # dictionary lacking /Type; treat it like any other bad entry.
            log.warning(invalid_toc_message)
            log.debug(e)

    def visit_remap_dest(pdf_base, node, log):
        """
        Visitor function to fix ToC entries

        Test for the two types of references to pages that can occur in ToCs.
        Both types have the same final format (an indirect reference to the
        target page).
        """
        if '/Dest' in node:
            # /Dest reference to another page (old method)
            remap_dest(node['/Dest'])
        elif '/A' in node:
            # /A (action) command set to "GoTo" (newer method)
            action = node['/A']
            if '/S' in action and action['/S'] == '/GoTo':
                if '/D' in action:
                    remap_dest(action['/D'])
                else:
                    # A GoTo action without a destination cannot be remapped;
                    # report it rather than failing on the missing key.
                    log.warning(invalid_toc_message)

    _traverse_toc(pdf_base, visit_remap_dest, log)
 def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to baseline file
@@ -323,13 +231,13 @@ def weave_layers(infiles, output_file, log, context):
    pdf_base = pikepdf.open(path_base)
    font, font_key, procset = None, None, None
    pdfinfo = context.get_pdfinfo()
    pagerefs = {}
    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )
-    replacements = 0
+    emplacements = 1
    interim_count = 0
    # Iterate rest
    for page_num, layers in groups:
@@ -343,30 +251,25 @@ def weave_layers(infiles, output_file, log, context):
        if text and not font:
            font, font_key = _find_font(text, pdf_base)
-        replacing = False
+        emplaced_page = False
        content_rotation = pdfinfo[page_num - 1].rotation
        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
-            # We are replacing the old page with a rasterized PDF of the new
+            # We are updating the old page with a rasterized PDF of the new
-            # page
+            # page (without changing objgen, to preserve references)
-            log.debug("Replace")
+            log.debug("Emplacement update")
            old_objgen = pdf_base.pages[page_num - 1].objgen
            with pikepdf.open(image) as pdf_image:
-                replacements += 1
+                emplacements += 1
-                image_page = pdf_image.pages[0]
+                foreign_image_page = pdf_image.pages[0]
-                pdf_base.pages[page_num - 1] = image_page
+                pdf_base.pages.append(foreign_image_page)
-
+                local_image_page = pdf_base.pages[-1]
-            # We're adding a new page, which will get a new objgen number pair,
+                pdf_base.pages[page_num - 1].emplace(local_image_page)
-            # so we need to update any references to it.  qpdf did not like
+                del pdf_base.pages[-1]
-            # my attempt to update the old object in place, but that is an
+            emplaced_page = True
            # option to consider
            pagerefs[old_objgen] = pdf_base.pages[page_num - 1].objgen
            replacing = True
        autorotate_correction = context.get_rotation(page_num - 1)
-        if replacing:
+        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
@@ -395,7 +298,7 @@ def weave_layers(infiles, output_file, log, context):
            content_rotation - autorotate_correction
        ) % 360
-        if replacements % MAX_REPLACE_PAGES == 0:
+        if emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
@@ -405,14 +308,25 @@ def weave_layers(infiles, output_file, log, context):
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )
-            interim = output_file + f'_working{page_num}.pdf'
+
-            pdf_base.save(interim)
+            # We cannot read and write the same file, that will corrupt it
            # but we don't want to keep more copies than we need to. Delete intermediates.
            # {interim_count} is the opened file we were updating
            # {interim_count - 1} can be deleted
            # {interim_count + 1} is the new file we will produce and open
            old_file = output_file + f'_working{interim_count - 1}.pdf'
            if not context.get_options().keep_temporary_files:
                with suppress(FileNotFoundError):
                    os.unlink(old_file)
            next_file = output_file + f'_working{interim_count + 1}.pdf'
            pdf_base.save(next_file)
            pdf_base.close()
-            pdf_base = pikepdf.open(interim)
+            pdf_base = pikepdf.open(next_file)
            procset = pdf_base.pages[0].Resources.ProcSet
-            font, font_key = None, None  # Reacquire this information
+            font, font_key = None, None  # Ensure we reacquire this information
            interim_count += 1
    _fix_toc(pdf_base, pagerefs, log)
    pdf_base.save(output_file)
    pdf_base.close()
--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -129,8 +129,7 @@ def rasterize_pdf(
    :param filter_vector: if True, remove vector graphics objects
    :return:
    """
-    res = xres, yres
+    res = round(xres, 6), round(yres, 6)
    int_res = round(xres), round(yres)
    if not page_dpi:
        page_dpi = res
@@ -145,7 +144,7 @@ def rasterize_pdf(
                f'-sDEVICE={raster_device}',
                f'-dFirstPage={pageno}',
                f'-dLastPage={pageno}',
-                f'-r{str(int_res[0])}x{str(int_res[1])}',
+                f'-r{res[0]:f}x{res[1]:f}',
            ]
            + (['-dFILTERVECTOR'] if filter_vector else [])
            + [
@@ -168,23 +167,8 @@ def rasterize_pdf(
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()
        # Ghostscript only accepts integers for output resolution
        # if the resolution happens to be fractional, then the discrepancy
        # would change the size of the output page, especially if the DPI
        # is quite low. Resize the image to the expected size
        tmp.seek(0)
        with Image.open(tmp) as im:
            expected_size = (
                round(im.size[0] / int_res[0] * res[0]),
                round(im.size[1] / int_res[1] * res[1]),
            )
            if expected_size != im.size or page_dpi != (xres, yres):
                log.debug(
                    f"Ghostscript: resize output image {im.size} -> {expected_size}"
                )
                im = im.resize(expected_size)
            if rotation is not None:
                log.debug("Rotating output by %i", rotation)
                # rotation is a clockwise angle and Image.ROTATE_* is
@@ -269,7 +253,6 @@ def generate_pdfa(
                "-dBATCH",
                "-dNOPAUSE",
                "-dCompatibilityLevel=" + str(pdf_version),
                "-dNumRenderingThreads=" + str(threads),
                "-sDEVICE=pdfwrite",
                "-dAutoRotatePages=/None",
                "-sColorConversionStrategy=" + strategy,
--- a/tests/resources/link.pdf
+++ b/tests/resources/link.pdf
--- a/tests/test_ghostscript.py
+++ b/tests/test_ghostscript.py
@@ -0,0 +1,81 @@
 # © 2019 James R. Barlow: github.com/jbarlow83
 #
 # This file is part of OCRmyPDF.
 #
 # OCRmyPDF is free software: you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
 #
 # OCRmyPDF is distributed in the hope that it will be useful,
 # but WITHOUT ANY WARRANTY; without even the implied warranty of
 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 # GNU General Public License for more details.
 #
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 import logging
 from decimal import Decimal
 import pikepdf
 import pytest
 from PIL import Image
 from ocrmypdf.exec.ghostscript import rasterize_pdf
@pytest.fixture
 def linn(resources):
    path = resources / 'linn.pdf'
    return path, pikepdf.open(path)
 def test_rasterize_size(linn, outdir, caplog):
    """Rasterize at fractional resolutions and check the image size and DPI.

    The resolutions passed to ``rasterize_pdf`` are derived from the page's
    MediaBox so that the output should be exactly 200 x 150 pixels, while the
    DPI recorded in the image should be the separately requested ``page_dpi``.
    """
    pdf_path, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    # The size computation below assumes the MediaBox origin is (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    # MediaBox units are points; 72 points per inch.
    points_per_inch = Decimal(72)
    width_inches = mediabox[2] / points_per_inch
    height_inches = mediabox[3] / points_per_inch

    target_width, target_height = Decimal('200.0'), Decimal('150.0')
    target_dpi = 42.0, 4242.0
    output_png = outdir / 'out.png'

    rasterize_pdf(
        pdf_path,
        output_png,
        target_width / width_inches,
        target_height / height_inches,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=target_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == (target_width, target_height)
        assert im.info['dpi'] == target_dpi
 def test_rasterize_rotated(linn, outdir, caplog):
    """Rasterize with ``rotation=90`` and check size and DPI are transposed.

    Uses the same resolutions as the unrotated size test; the expected image
    size and DPI are the requested values with width and height swapped.
    """
    pdf_path, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    # The size computation below assumes the MediaBox origin is (0, 0).
    assert mediabox[0] == mediabox[1] == 0

    # MediaBox units are points; 72 points per inch.
    points_per_inch = Decimal(72)
    width_inches = mediabox[2] / points_per_inch
    height_inches = mediabox[3] / points_per_inch

    target_width, target_height = Decimal('200.0'), Decimal('150.0')
    dpi_x, dpi_y = 42.0, 4242.0
    output_png = outdir / 'out.png'

    log = logging.getLogger()
    caplog.set_level(logging.DEBUG)

    rasterize_pdf(
        pdf_path,
        output_png,
        target_width / width_inches,
        target_height / height_inches,
        raster_device='pngmono',
        log=log,
        page_dpi=(dpi_x, dpi_y),
        rotation=90,
    )

    with Image.open(output_png) as im:
        assert im.size == (target_height, target_width)
        assert im.info['dpi'] == (dpi_y, dpi_x)
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -285,7 +285,7 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
        assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
-def test_metadata_fixup_warning(resources, outdir):
+def test_metadata_fixup_warning(resources, outdir, caplog):
    from ocrmypdf._pipeline import metadata_fixup
    input_files = [
@@ -296,7 +296,7 @@ def test_metadata_fixup_warning(resources, outdir):
    for f in input_files:
        copyfile(resources / 'graph.pdf', f)
-    log = MagicMock()
+    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
@@ -304,7 +304,8 @@ def test_metadata_fixup_warning(resources, outdir):
        log=log,
        context=context,
    )
-    log.warning.assert_not_called()
+    for record in caplog.records:
        assert record.levelname != 'WARNING'
    # Now add some metadata that will not be copyable
    graph = pikepdf.open(outdir / 'graph.repaired.pdf')
@@ -312,7 +313,7 @@ def test_metadata_fixup_warning(resources, outdir):
        meta['prism2:publicationName'] = 'OCRmyPDF Test'
    graph.save(outdir / 'graph.repaired.pdf')
-    log = MagicMock()
+    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
@@ -320,7 +321,7 @@ def test_metadata_fixup_warning(resources, outdir):
        log=log,
        context=context,
    )
-    log.warning.assert_called_once()
+    assert any(record.levelname == 'WARNING' for record in caplog.records)
 def test_prevent_gs_invalid_xml(resources, outdir):
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -16,9 +16,10 @@
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 import argparse
 import logging
 from os import fspath
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch
 import pytest
@@ -57,7 +58,7 @@ def test_no_unpaper(resources, no_outpdf):
    with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
        mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
        with pytest.raises(SystemExit):
-            main.check_options(options, log=MagicMock())
+            main.check_options(options, log=logging.getLogger())
 def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
--- a/tests/test_weave.py
+++ b/tests/test_weave.py
@@ -15,35 +15,15 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
 from unittest.mock import MagicMock
 import logging
 import os
 import pytest
 import pikepdf
 from ocrmypdf._weave import _fix_toc, _update_page_resources
 check_ocrmypdf = pytest.helpers.check_ocrmypdf
 def test_invalid_toc(resources, outdir, caplog):
    """A corrupted outline entry makes ``_fix_toc`` warn instead of raising."""
    damaged_path = outdir / 'test.pdf'

    # Corrupt a TOC entry: the destination's page reference becomes None
    source_pdf = pikepdf.open(resources / 'toc.pdf')
    source_pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
    source_pdf.save(damaged_path)

    pdf = pikepdf.open(damaged_path)
    first_page_objgen = pdf.pages[0].objgen
    # Dummy remap (page 0 onto itself) so that _fix_toc does not return early
    remap = {first_page_objgen: pdf.pages[0].objgen}

    # Confirm we complain about the TOC and don't throw an exception
    _fix_toc(pdf, remap, logging.getLogger())
    assert 'invalid table of contents entries' in caplog.text
 def test_no_glyphless_weave(resources, outdir):
    pdf = pikepdf.open(resources / 'francais.pdf')
    pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
@@ -62,3 +42,21 @@ def test_no_glyphless_weave(resources, outdir):
        '0',
        env=env,
    )
@pytest.helpers.needs_pdfminer
 def test_links(resources, outpdf):
    check_ocrmypdf(
        resources / 'link.pdf',
        outpdf,
        '--redo-ocr',
        '--oversample',
        '200',
        '--output-type',
        'pdf',
    )
    pdf = pikepdf.open(outpdf)
    p1 = pdf.pages[0]
    p2 = pdf.pages[1]
    assert p1.Annots[0].A.D[0].objgen == p2.objgen
    assert p2.Annots[0].A.D[0].objgen == p1.objgen