Merge pull request #1 from jbarlow83/master

update master
2026-02-08 13:22:34 -05:00 · 2019-06-01 11:09:07 +02:00
parent 79c84eefa3 7e388f59af
commit 40b2ebcb37
14 changed files with 262 additions and 168 deletions
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,4 +3,4 @@ repos:
    rev: stable
    hooks:
    - id: black
-      language_version: python3.6
+      language_version: python3.7
--- a/docs/installation.rst
+++ b/docs/installation.rst
@@ -490,3 +490,17 @@ To install all of the development and test requirements:
    pip install -r requirements/dev.txt -r requirements/test.txt

 To add JBIG2 encoding, see :ref:`jbig2`.
+
+Shell completions
+-----------------
+
+Completions for ``bash`` and ``fish`` are available in the project's
+``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
+compatible but this has not been confirmed. Package maintainers, please install
+these at the appropriate locations for your system.
+
+To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
+``/etc/bash_completion.d/ocrmypdf`` (rename the file).
+
+To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
+``~/.config/fish/completions/ocrmypdf.fish``.
--- a/docs/release_notes.rst
+++ b/docs/release_notes.rst
@@ -13,6 +13,21 @@ Note that it is licensed under GPLv3, so scripts that ``import ocrmypdf`` and ar
   find:    [^`]\#([0-9]{1,3})[^0-9]
   replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_

+v8.3.0
+------
+
+-   Improved the strategy for updating pages when a new image of the page was produced. We now attempt to preserve more content from the original file, for annotations in particular.
+
+-   Fixed an issue where, for PDFs with more than 100 pages and a sequence where one PDF page was replaced and one or more subsequent ones were skipped, an intermediate file would be corrupted while grafting OCR text, causing processing to fail.
+
+-   Previously, we resized the images produced by Ghostscript by a small number of pixels to ensure the output image size was exactly what we wanted. Having discovered a way to get Ghostscript to produce the exact image sizes we require, we eliminated the resizing step.
+
+-   Command line completions for ``bash`` are now available, in addition to ``fish``, both in ``misc/completion``. Package maintainers, please install these so users can take advantage.
+
+-   Updated requirements.
+
+-   pikepdf 1.3.0 is now required.
+
 v8.2.4
 ------

@@ -22,7 +37,7 @@ v8.2.4

 -   Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.

-   pikepdf 1.2.0 is now required
+-   pikepdf 1.2.0 is now required.

 v8.2.3
 ------
--- a/misc/completion/ocrmypdf.bash
+++ b/misc/completion/ocrmypdf.bash
@@ -0,0 +1,87 @@
+# ocrmypdf completion                                     -*- shell-script -*-
+
+_ocrmypdf()
+{
+    local cur prev cword words split
+    _init_completion -s || return
+
+    case $prev in
+        --version|-h|--help)
+            return
+            ;;
+        --user-words|--user-patterns|--tesseract-config)
+            _filedir
+            return
+            ;;
+        --output-type)
+            COMPREPLY=( $( compgen -W 'pdfa pdf pdfa-1 pdfa-2 pdfa-3' -- \
+                "$cur" ) )
+            return
+            ;;
+        --pdf-renderer)
+            COMPREPLY=( $( compgen -W 'auto hocr sandwich' -- "$cur" ) )
+            return
+            ;;
+        --pdfa-image-compression)
+            COMPREPLY=( $( compgen -W 'auto jpeg lossless' -- "$cur" ) )
+            return
+            ;;
+        -O|--optimize|--tesseract-oem)
+            COMPREPLY=( $( compgen -W '{0..3}' -- "$cur" ) )
+            return
+            ;;
+        --jpeg-quality|--png-quality)
+            COMPREPLY=( $( compgen -W '{0..100}' -- "$cur" ) )
+            return
+            ;;
+        -l|--language)
+            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
+            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
+            return
+            ;;
+        --image-dpi|--oversample|--skip-big|--max-image-mpixels|\
+        --tesseract-timeout|--rotate-pages-threshold)
+            COMPREPLY=( $( compgen -P "$cur" -W '{0..9}' ) )
+            return
+            ;;
+        -j|--jobs)
+            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
+            return
+            ;;
+        -v|--verbose)
+            COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
+            return
+            ;;
+        --tesseract-pagesegmode)
+            COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
+            return
+            ;;
+        --sidecar|--title|--author|--subject|--keywords|--unpaper-args)
+            # argument required but no completions available
+            return
+            ;;
+    esac
+
+    $split && return
+
+    if [[ $cur == -* ]]; then
+        COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
+            --sidecar --version --jobs --quiet --verbose --title --author
+            --subject --keywords --rotate-pages --remove-background --deskew
+            --clean --clean-final --unpaper-args --oversample --remove-vectors
+            --mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
+            --skip-big --jpeg-quality --png-quality --jbig2-lossy
+            --max-image-mpixels --tesseract-config --tesseract-pagesegmode
+            --help --tesseract-oem --pdf-renderer --tesseract-timeout
+            --rotate-pages-threshold --pdfa-image-compression --user-words
+            --user-patterns --keep-temporary-files --flowchart --output-type' \
+            --  "$cur" ) )
+        return
+    else
+        _filedir
+        return
+    fi
+} &&
+complete -F _ocrmypdf ocrmypdf
+
+# ex: filetype=sh
--- a/misc/completion/ocrmypdf.fish
+++ b/misc/completion/ocrmypdf.fish
--- a/requirements/main.txt
+++ b/requirements/main.txt
@@ -5,7 +5,7 @@ chardet == 3.0.4
 cffi == 1.12.2
 img2pdf == 0.3.3
 pdfminer.six == 20181108
-pikepdf == 1.2.0
+pikepdf == 1.3.0
 Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
 pycparser == 2.19
 python-xmp-toolkit == 2.0.1
--- a/setup.py
+++ b/setup.py
@@ -99,7 +99,7 @@ setup(
        'cffi >= 1.9.1',  # must be a setup and install requirement
        'img2pdf >= 0.3.0, < 0.4',  # pure Python, so track HEAD closely
        'pdfminer.six == 20181108 ; sys_platform != "darwin"',
-        'pikepdf >= 1.2.0, < 2',
+        'pikepdf >= 1.3.0, < 2',
        'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
        # Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
        # block 5.1.0, broken wheels
--- a/src/ocrmypdf/_weave.py
+++ b/src/ocrmypdf/_weave.py
@@ -15,6 +15,7 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

+from contextlib import suppress
 from itertools import groupby
 from pathlib import Path
 import os
@@ -188,99 +189,6 @@ def _find_font(text, pdf_base):
        return None, None


-def _traverse_toc(pdf_base, visitor_fn, log):
-    """
-    Walk the table of contents, calling visitor_fn() at each node
-
-    The /Outlines data structure is a messy data structure, but rather than
-    navigating hierarchically we just track unique nodes.  Enqueue nodes when
-    we find them, and never visit them again.  set() is awesome.  We look for
-    the two types of object in the table of contents that can be page bookmarks
-    and update the page entry.
-
-    """
-
-    visited = set()
-    queue = set()
-    link_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')
-
-    if not '/Outlines' in pdf_base.root:
-        return
-
-    queue.add(pdf_base.root.Outlines.objgen)
-    while queue:
-        objgen = queue.pop()
-        visited.add(objgen)
-        node = pdf_base.get_object(objgen)
-        log.debug('fix toc: exploring outline entries at %r', objgen)
-
-        # Enumerate other nodes we could visit from here
-        for key in link_keys:
-            if key not in node:
-                continue
-            item = node[key]
-            if not item.is_indirect:
-                # Direct references are not allowed here, but it's not clear
-                # what we should do if we find any. Removing them is an option:
-                # node[key] = pdf_base.make_indirect(None)
-                continue
-            objgen = item.objgen
-            if objgen not in visited:
-                queue.add(objgen)
-
-        if visitor_fn:
-            visitor_fn(pdf_base, node, log)
-
-
-def _fix_toc(pdf_base, pageref_remap, log):
-    """Repair the table of contents
-
-    Whenever we replace a page wholesale, it gets assigned a new objgen number
-    and other references to it within the PDF become invalid, most notably in
-    the table of contents (/Outlines in PDF-speak).  In weave_layers we collect
-    pageref_remap, a mapping that describes the new objgen number given an old
-    one.  (objgen is a tuple, and the gen is almost always zero.)
-
-    It may ultimately be better to find a way to rebuild a page in place.
-
-    """
-
-    if not pageref_remap:
-        return
-
-    def remap_dest(dest_node):
-        """
-        Inner helper function: change the objgen for any page from the old we
-        invalidated to its new one.
-        """
-        try:
-            pageref = dest_node[0]
-            if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
-                new_objgen = pageref_remap[pageref.objgen]
-                dest_node[0] = pdf_base.get_object(new_objgen)
-        except (IndexError, TypeError) as e:
-            log.warning("This file may contain invalid table of contents entries")
-            log.debug(e)
-
-    def visit_remap_dest(pdf_base, node, log):
-        """
-        Visitor function to fix ToC entries
-
-        Test for the two types of references to pages that can occur in ToCs.
-        Both types have the same final format (an indirect reference to the
-        target page).
-        """
-        if '/Dest' in node:
-            # /Dest reference to another page (old method)
-            remap_dest(node['/Dest'])
-        elif '/A' in node:
-            # /A (action) command set to "GoTo" (newer method)
-            if '/S' in node['/A'] and node['/A']['/S'] == '/GoTo':
-                remap_dest(node['/A']['/D'])
-
-    _traverse_toc(pdf_base, visit_remap_dest, log)
-
-
 def weave_layers(infiles, output_file, log, context):
    """Apply text layer and/or image layer changes to baseline file

@@ -323,13 +231,13 @@ def weave_layers(infiles, output_file, log, context):
    pdf_base = pikepdf.open(path_base)
    font, font_key, procset = None, None, None
    pdfinfo = context.get_pdfinfo()
-    pagerefs = {}

    procset = pdf_base.make_indirect(
        pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
    )

-    replacements = 0
+    emplacements = 1
+    interim_count = 0

    # Iterate rest
    for page_num, layers in groups:
@@ -343,30 +251,25 @@ def weave_layers(infiles, output_file, log, context):
        if text and not font:
            font, font_key = _find_font(text, pdf_base)

-        replacing = False
+        emplaced_page = False
        content_rotation = pdfinfo[page_num - 1].rotation

        path_image = Path(image).resolve() if image else None
        if path_image is not None and path_image != path_base:
-            # We are replacing the old page with a rasterized PDF of the new
-            # page
-            log.debug("Replace")
-            old_objgen = pdf_base.pages[page_num - 1].objgen
-
+            # We are updating the old page with a rasterized PDF of the new
+            # page (without changing objgen, to preserve references)
+            log.debug("Emplacement update")
            with pikepdf.open(image) as pdf_image:
-                replacements += 1
-                image_page = pdf_image.pages[0]
-                pdf_base.pages[page_num - 1] = image_page
-
-            # We're adding a new page, which will get a new objgen number pair,
-            # so we need to update any references to it.  qpdf did not like
-            # my attempt to update the old object in place, but that is an
-            # option to consider
-            pagerefs[old_objgen] = pdf_base.pages[page_num - 1].objgen
-            replacing = True
+                emplacements += 1
+                foreign_image_page = pdf_image.pages[0]
+                pdf_base.pages.append(foreign_image_page)
+                local_image_page = pdf_base.pages[-1]
+                pdf_base.pages[page_num - 1].emplace(local_image_page)
+                del pdf_base.pages[-1]
+            emplaced_page = True

        autorotate_correction = context.get_rotation(page_num - 1)
-        if replacing:
+        if emplaced_page:
            content_rotation = autorotate_correction
        text_rotation = autorotate_correction
        text_misaligned = (text_rotation - content_rotation) % 360
@@ -395,7 +298,7 @@ def weave_layers(infiles, output_file, log, context):
            content_rotation - autorotate_correction
        ) % 360

-        if replacements % MAX_REPLACE_PAGES == 0:
+        if emplacements % MAX_REPLACE_PAGES == 0:
            # Periodically save and reload the Pdf object. This will keep a
            # lid on our memory usage for very large files. Attach the font to
            # page 1 even if page 1 doesn't use it, so we have a way to get it
@@ -405,14 +308,25 @@ def weave_layers(infiles, output_file, log, context):
            _update_page_resources(
                page=page0, font=font, font_key=font_key, procset=procset
            )
-            interim = output_file + f'_working{page_num}.pdf'
-            pdf_base.save(interim)
+
+            # We cannot read and write the same file, that will corrupt it
+            # but we don't want to keep more copies than we need to. Delete intermediates.
+            # {interim_count} is the opened file we were updating
+            # {interim_count - 1} can be deleted
+            # {interim_count + 1} is the new file we will produce and open
+            old_file = output_file + f'_working{interim_count - 1}.pdf'
+            if not context.get_options().keep_temporary_files:
+                with suppress(FileNotFoundError):
+                    os.unlink(old_file)
+
+            next_file = output_file + f'_working{interim_count + 1}.pdf'
+            pdf_base.save(next_file)
            pdf_base.close()

-            pdf_base = pikepdf.open(interim)
+            pdf_base = pikepdf.open(next_file)
            procset = pdf_base.pages[0].Resources.ProcSet
-            font, font_key = None, None  # Reacquire this information
+            font, font_key = None, None  # Ensure we reacquire this information
+            interim_count += 1

-    _fix_toc(pdf_base, pagerefs, log)
    pdf_base.save(output_file)
    pdf_base.close()
--- a/src/ocrmypdf/exec/ghostscript.py
+++ b/src/ocrmypdf/exec/ghostscript.py
@@ -129,8 +129,7 @@ def rasterize_pdf(
    :param filter_vector: if True, remove vector graphics objects
    :return:
    """
-    res = xres, yres
-    int_res = round(xres), round(yres)
+    res = round(xres, 6), round(yres, 6)
    if not page_dpi:
        page_dpi = res

@@ -145,7 +144,7 @@ def rasterize_pdf(
                f'-sDEVICE={raster_device}',
                f'-dFirstPage={pageno}',
                f'-dLastPage={pageno}',
-                f'-r{str(int_res[0])}x{str(int_res[1])}',
+                f'-r{res[0]:f}x{res[1]:f}',
            ]
            + (['-dFILTERVECTOR'] if filter_vector else [])
            + [
@@ -168,23 +167,8 @@ def rasterize_pdf(
            log.error('Ghostscript rasterizing failed')
            raise SubprocessOutputError()

-        # Ghostscript only accepts integers for output resolution
-        # if the resolution happens to be fractional, then the discrepancy
-        # would change the size of the output page, especially if the DPI
-        # is quite low. Resize the image to the expected size
-
        tmp.seek(0)
        with Image.open(tmp) as im:
-            expected_size = (
-                round(im.size[0] / int_res[0] * res[0]),
-                round(im.size[1] / int_res[1] * res[1]),
-            )
-            if expected_size != im.size or page_dpi != (xres, yres):
-                log.debug(
-                    f"Ghostscript: resize output image {im.size} -> {expected_size}"
-                )
-                im = im.resize(expected_size)
-
            if rotation is not None:
                log.debug("Rotating output by %i", rotation)
                # rotation is a clockwise angle and Image.ROTATE_* is
@@ -269,7 +253,6 @@ def generate_pdfa(
                "-dBATCH",
                "-dNOPAUSE",
                "-dCompatibilityLevel=" + str(pdf_version),
-                "-dNumRenderingThreads=" + str(threads),
                "-sDEVICE=pdfwrite",
                "-dAutoRotatePages=/None",
                "-sColorConversionStrategy=" + strategy,
--- a/tests/resources/link.pdf
+++ b/tests/resources/link.pdf
--- a/tests/test_ghostscript.py
+++ b/tests/test_ghostscript.py
@@ -0,0 +1,81 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+from decimal import Decimal
+
+import pikepdf
+import pytest
+from PIL import Image
+
+from ocrmypdf.exec.ghostscript import rasterize_pdf
+
+
+@pytest.fixture
+def linn(resources):
+    path = resources / 'linn.pdf'
+    return path, pikepdf.open(path)
+
+
+def test_rasterize_size(linn, outdir, caplog):
+    path, pdf = linn
+    page_size_pts = (pdf.pages[0].MediaBox[2], pdf.pages[0].MediaBox[3])
+    assert pdf.pages[0].MediaBox[0] == pdf.pages[0].MediaBox[1] == 0
+    page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))
+    target_size = Decimal('200.0'), Decimal('150.0')
+    target_dpi = 42.0, 4242.0
+
+    log = logging.getLogger()
+    rasterize_pdf(
+        path,
+        outdir / 'out.png',
+        target_size[0] / page_size[0],
+        target_size[1] / page_size[1],
+        raster_device='pngmono',
+        log=log,
+        page_dpi=target_dpi,
+    )
+
+    with Image.open(outdir / 'out.png') as im:
+        assert im.size == target_size
+        assert im.info['dpi'] == target_dpi
+
+
+def test_rasterize_rotated(linn, outdir, caplog):
+    path, pdf = linn
+    page_size_pts = (pdf.pages[0].MediaBox[2], pdf.pages[0].MediaBox[3])
+    assert pdf.pages[0].MediaBox[0] == pdf.pages[0].MediaBox[1] == 0
+    page_size = (page_size_pts[0] / Decimal(72), page_size_pts[1] / Decimal(72))
+    target_size = Decimal('200.0'), Decimal('150.0')
+    target_dpi = 42.0, 4242.0
+
+    log = logging.getLogger()
+    caplog.set_level(logging.DEBUG)
+    rasterize_pdf(
+        path,
+        outdir / 'out.png',
+        target_size[0] / page_size[0],
+        target_size[1] / page_size[1],
+        raster_device='pngmono',
+        log=log,
+        page_dpi=target_dpi,
+        rotation=90,
+    )
+
+    with Image.open(outdir / 'out.png') as im:
+        assert im.size == (target_size[1], target_size[0])
+        assert im.info['dpi'] == (target_dpi[1], target_dpi[0])
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -285,7 +285,7 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
        assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)


-def test_metadata_fixup_warning(resources, outdir):
+def test_metadata_fixup_warning(resources, outdir, caplog):
    from ocrmypdf._pipeline import metadata_fixup

    input_files = [
@@ -296,7 +296,7 @@ def test_metadata_fixup_warning(resources, outdir):
    for f in input_files:
        copyfile(resources / 'graph.pdf', f)

-    log = MagicMock()
+    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
@@ -304,7 +304,8 @@ def test_metadata_fixup_warning(resources, outdir):
        log=log,
        context=context,
    )
-    log.warning.assert_not_called()
+    for record in caplog.records:
+        assert record.levelname != 'WARNING'

    # Now add some metadata that will not be copyable
    graph = pikepdf.open(outdir / 'graph.repaired.pdf')
@@ -312,7 +313,7 @@ def test_metadata_fixup_warning(resources, outdir):
        meta['prism2:publicationName'] = 'OCRmyPDF Test'
    graph.save(outdir / 'graph.repaired.pdf')

-    log = MagicMock()
+    log = logging.getLogger()
    context = MagicMock()
    metadata_fixup(
        input_files_groups=input_files,
@@ -320,7 +321,7 @@ def test_metadata_fixup_warning(resources, outdir):
        log=log,
        context=context,
    )
-    log.warning.assert_called_once()
+    assert any(record.levelname == 'WARNING' for record in caplog.records)


 def test_prevent_gs_invalid_xml(resources, outdir):
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -16,9 +16,10 @@
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

 import argparse
+import logging
 from os import fspath
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import patch

 import pytest

@@ -57,7 +58,7 @@ def test_no_unpaper(resources, no_outpdf):
    with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
        mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
        with pytest.raises(SystemExit):
-            main.check_options(options, log=MagicMock())
+            main.check_options(options, log=logging.getLogger())


 def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
--- a/tests/test_weave.py
+++ b/tests/test_weave.py
@@ -15,35 +15,15 @@
 # You should have received a copy of the GNU General Public License
 # along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.

-from unittest.mock import MagicMock
-import logging
 import os

 import pytest

 import pikepdf
-from ocrmypdf._weave import _fix_toc, _update_page_resources

 check_ocrmypdf = pytest.helpers.check_ocrmypdf


-def test_invalid_toc(resources, outdir, caplog):
-    pdf = pikepdf.open(resources / 'toc.pdf')
-
-    # Corrupt a TOC entry
-    pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
-    pdf.save(outdir / 'test.pdf')
-
-    pdf = pikepdf.open(outdir / 'test.pdf')
-    remap = {}
-    remap[pdf.pages[0].objgen] = pdf.pages[0].objgen  # Dummy remap
-
-    # Confirm we complain about the TOC and don't throw an exception
-    log = logging.getLogger()
-    _fix_toc(pdf, remap, log)
-    assert 'invalid table of contents entries' in caplog.text
-
-
 def test_no_glyphless_weave(resources, outdir):
    pdf = pikepdf.open(resources / 'francais.pdf')
    pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
@@ -62,3 +42,21 @@ def test_no_glyphless_weave(resources, outdir):
        '0',
        env=env,
    )
+
+
+@pytest.helpers.needs_pdfminer
+def test_links(resources, outpdf):
+    check_ocrmypdf(
+        resources / 'link.pdf',
+        outpdf,
+        '--redo-ocr',
+        '--oversample',
+        '200',
+        '--output-type',
+        'pdf',
+    )
+    pdf = pikepdf.open(outpdf)
+    p1 = pdf.pages[0]
+    p2 = pdf.pages[1]
+    assert p1.Annots[0].A.D[0].objgen == p2.objgen
+    assert p2.Annots[0].A.D[0].objgen == p1.objgen