Merge pull request #1 from jbarlow83/master

update master
This commit is contained in:
Frank
2019-06-01 11:09:07 +02:00
committed by GitHub
14 changed files with 262 additions and 168 deletions

View File

@@ -3,4 +3,4 @@ repos:
rev: stable
hooks:
- id: black
language_version: python3.6
language_version: python3.7

View File

@@ -490,3 +490,17 @@ To install all of the development and test requirements:
pip install -r requirements/dev.txt -r requirements/test.txt
To add JBIG2 encoding, see :ref:`jbig2`.
Shell completions
-----------------
Completions for ``bash`` and ``fish`` are available in the project's
``misc/completion`` folder. The ``bash`` completions are likely ``zsh``
compatible but this has not been confirmed. Package maintainers, please install
these at the appropriate locations for your system.
To manually install the ``bash`` completion, copy ``misc/completion/ocrmypdf.bash`` to
``/etc/bash_completion.d/ocrmypdf`` (rename the file).
To manually install the ``fish`` completion, copy ``misc/completion/ocrmypdf.fish`` to
``~/.config/fish/completions/ocrmypdf.fish``.

View File

@@ -13,6 +13,21 @@ Note that it is licensed under GPLv3, so scripts that ``import ocrmypdf`` and ar
find: [^`]\#([0-9]{1,3})[^0-9]
replace: `#$1 <https://github.com/jbarlow83/OCRmyPDF/issues/$1>`_
v8.3.0
------
- Improved the strategy for updating pages when a new image of the page was produced. We now attempt to preserve more content from the original file, for annotations in particular.
- Fixed an issue where, for PDFs with more than 100 pages and a sequence where one PDF page was replaced and one or more subsequent ones were skipped, an intermediate file would be corrupted while grafting OCR text, causing processing to fail.
- Previously, we resized the images produced by Ghostscript by a small number of pixels to ensure the output image size was exactly what we wanted. Having discovered a way to get Ghostscript to produce the exact image sizes we require, we eliminated the resizing step.
- Command line completions for ``bash`` are now available, in addition to ``fish``, both in ``misc/completion``. Package maintainers, please install these so users can take advantage.
- Updated requirements.
- pikepdf 1.3.0 is now required.
v8.2.4
------
@@ -22,7 +37,7 @@ v8.2.4
- Minor optimization: we no longer traverse the table of contents to ensure all references in it are resolved, as changes to libqpdf have made this unnecessary.
- pikepdf 1.2.0 is now required
- pikepdf 1.2.0 is now required.
v8.2.3
------

View File

@@ -0,0 +1,87 @@
# ocrmypdf completion -*- shell-script -*-
# Programmable completion function for the ocrmypdf command line.
#
# Uses the helpers _init_completion, _filedir and _ncpus, which are not
# defined in this file (presumably provided by the bash-completion package).
_ocrmypdf()
{
    local cur prev cword words split
    _init_completion -s || return

    # First, complete the argument of the option that precedes the cursor.
    case $prev in
        --version|-h|--help)
            # Nothing useful can follow these options
            return
            ;;
        --user-words|--user-patterns|--tesseract-config)
            # These options take a file name
            _filedir
            return
            ;;
        --output-type)
            COMPREPLY=( $( compgen -W 'pdfa pdf pdfa-1 pdfa-2 pdfa-3' -- \
                "$cur" ) )
            return
            ;;
        --pdf-renderer)
            COMPREPLY=( $( compgen -W 'auto hocr sandwich' -- "$cur" ) )
            return
            ;;
        --pdfa-image-compression)
            COMPREPLY=( $( compgen -W 'auto jpeg lossless' -- "$cur" ) )
            return
            ;;
        -O|--optimize|--tesseract-oem)
            COMPREPLY=( $( compgen -W '{0..3}' -- "$cur" ) )
            return
            ;;
        --jpeg-quality|--png-quality)
            COMPREPLY=( $( compgen -W '{0..100}' -- "$cur" ) )
            return
            ;;
        -l|--language)
            # Ask tesseract for its languages; the ##*: expansion drops
            # everything up to the last colon in each word of its output.
            COMPREPLY=$( command tesseract --list-langs 2>/dev/null )
            COMPREPLY=( $( compgen -W '${COMPREPLY[@]##*:}' -- "$cur" ) )
            return
            ;;
        --image-dpi|--oversample|--skip-big|--max-image-mpixels|\
        --tesseract-timeout|--rotate-pages-threshold)
            # Free-form numbers: offer the current word extended by one digit
            COMPREPLY=( $( compgen -P "$cur" -W '{0..9}' ) )
            return
            ;;
        -j|--jobs)
            COMPREPLY=( $( compgen -W '{1..'$( _ncpus )'}' -- "$cur" ) )
            return
            ;;
        -v|--verbose)
            COMPREPLY=( $( compgen -W '{1..9}' -- "$cur" ) ) # max level ?
            return
            ;;
        --tesseract-pagesegmode)
            COMPREPLY=( $( compgen -W '{1..13}' -- "$cur" ) )
            return
            ;;
        --sidecar|--title|--author|--subject|--keywords|--unpaper-args)
            # argument required but no completions available
            return
            ;;
    esac

    # $split is expected to be set by _init_completion -s; when true, stop
    # here rather than offering option names or files.
    $split && return

    if [[ $cur == -* ]]; then
        # Complete long option names. Each option is listed once, and
        # --optimize is offered because the case statement above already
        # completes its argument.
        COMPREPLY=( $( compgen -W '--language --image-dpi --output-type
            --sidecar --version --jobs --quiet --verbose --title --author
            --subject --keywords --rotate-pages --remove-background --deskew
            --clean --clean-final --unpaper-args --oversample --remove-vectors
            --mask-barcodes --threshold --force-ocr --skip-text --redo-ocr
            --skip-big --optimize --jpeg-quality --png-quality --jbig2-lossy
            --max-image-mpixels --tesseract-config --tesseract-pagesegmode
            --help --tesseract-oem --pdf-renderer --tesseract-timeout
            --rotate-pages-threshold --pdfa-image-compression --user-words
            --user-patterns --keep-temporary-files --flowchart' \
            -- "$cur" ) )
        return
    else
        # Anything else is an input or output file name
        _filedir
        return
    fi
} &&
complete -F _ocrmypdf ocrmypdf
# ex: filetype=sh

View File

@@ -5,7 +5,7 @@ chardet == 3.0.4
cffi == 1.12.2
img2pdf == 0.3.3
pdfminer.six == 20181108
pikepdf == 1.2.0
pikepdf == 1.3.0
Pillow >= 5.0.0, != 5.1.0 ; sys_platform == "darwin"
pycparser == 2.19
python-xmp-toolkit == 2.0.1

View File

@@ -99,7 +99,7 @@ setup(
'cffi >= 1.9.1', # must be a setup and install requirement
'img2pdf >= 0.3.0, < 0.4', # pure Python, so track HEAD closely
'pdfminer.six == 20181108 ; sys_platform != "darwin"',
'pikepdf >= 1.2.0, < 2',
'pikepdf >= 1.3.0, < 2',
'Pillow >= 4.0.0, != 5.1.0 ; sys_platform == "darwin"',
# Pillow < 4 has BytesIO/TIFF bug w/img2pdf 0.2.3
# block 5.1.0, broken wheels

View File

@@ -15,6 +15,7 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from contextlib import suppress
from itertools import groupby
from pathlib import Path
import os
@@ -188,99 +189,6 @@ def _find_font(text, pdf_base):
return None, None
def _traverse_toc(pdf_base, visitor_fn, log):
"""
Walk the table of contents, calling visitor_fn() at each node
The /Outlines data structure is a messy data structure, but rather than
navigating hierarchically we just track unique nodes. Enqueue nodes when
we find them, and never visit them again. set() is awesome. We look for
the two types of object in the table of contents that can be page bookmarks
and update the page entry.
"""
visited = set()
queue = set()
link_keys = ('/Parent', '/First', '/Last', '/Prev', '/Next')
if not '/Outlines' in pdf_base.root:
return
queue.add(pdf_base.root.Outlines.objgen)
while queue:
objgen = queue.pop()
visited.add(objgen)
node = pdf_base.get_object(objgen)
log.debug('fix toc: exploring outline entries at %r', objgen)
# Enumerate other nodes we could visit from here
for key in link_keys:
if key not in node:
continue
item = node[key]
if not item.is_indirect:
# Direct references are not allowed here, but it's not clear
# what we should do if we find any. Removing them is an option:
# node[key] = pdf_base.make_indirect(None)
continue
objgen = item.objgen
if objgen not in visited:
queue.add(objgen)
if visitor_fn:
visitor_fn(pdf_base, node, log)
def _fix_toc(pdf_base, pageref_remap, log):
    """Repair the table of contents.

    Whenever we replace a page wholesale, it gets assigned a new objgen number
    and other references to it within the PDF become invalid, most notably in
    the table of contents (/Outlines in PDF-speak). In weave_layers we collect
    pageref_remap, a mapping that describes the new objgen number given an old
    one. (objgen is a tuple, and the gen is almost always zero.)

    It may ultimately be better to find a way to rebuild a page in place.

    :param pdf_base: the PDF whose outline entries are updated in place
    :param pageref_remap: mapping of old page objgen -> new page objgen;
        if empty, nothing is done
    :param log: logger; malformed entries produce a warning instead of an
        exception
    :return: None
    """
    if not pageref_remap:
        return

    invalid_toc_msg = "This file may contain invalid table of contents entries"

    def remap_dest(dest_node):
        """
        Inner helper function: change the objgen for any page from the old we
        invalidated to its new one.
        """
        try:
            pageref = dest_node[0]
            if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
                new_objgen = pageref_remap[pageref.objgen]
                dest_node[0] = pdf_base.get_object(new_objgen)
        except (IndexError, KeyError, TypeError) as e:
            # KeyError covers a destination object that has no /Type entry;
            # treat it like any other malformed entry rather than crashing.
            log.warning(invalid_toc_msg)
            log.debug(e)

    def visit_remap_dest(pdf_base, node, log):
        """
        Visitor function to fix ToC entries

        Test for the two types of references to pages that can occur in ToCs.
        Both types have the same final format (an indirect reference to the
        target page).
        """
        if '/Dest' in node:
            # /Dest reference to another page (old method)
            remap_dest(node['/Dest'])
        elif '/A' in node:
            # /A (action) command set to "GoTo" (newer method)
            action = node['/A']
            if '/S' in action and action['/S'] == '/GoTo':
                if '/D' in action:
                    remap_dest(action['/D'])
                else:
                    # A GoTo action without a destination is malformed
                    log.warning(invalid_toc_msg)

    _traverse_toc(pdf_base, visit_remap_dest, log)
def weave_layers(infiles, output_file, log, context):
"""Apply text layer and/or image layer changes to baseline file
@@ -323,13 +231,13 @@ def weave_layers(infiles, output_file, log, context):
pdf_base = pikepdf.open(path_base)
font, font_key, procset = None, None, None
pdfinfo = context.get_pdfinfo()
pagerefs = {}
procset = pdf_base.make_indirect(
pikepdf.Object.parse(b'[ /PDF /Text /ImageB /ImageC /ImageI ]')
)
replacements = 0
emplacements = 1
interim_count = 0
# Iterate rest
for page_num, layers in groups:
@@ -343,30 +251,25 @@ def weave_layers(infiles, output_file, log, context):
if text and not font:
font, font_key = _find_font(text, pdf_base)
replacing = False
emplaced_page = False
content_rotation = pdfinfo[page_num - 1].rotation
path_image = Path(image).resolve() if image else None
if path_image is not None and path_image != path_base:
# We are replacing the old page with a rasterized PDF of the new
# page
log.debug("Replace")
old_objgen = pdf_base.pages[page_num - 1].objgen
# We are updating the old page with a rasterized PDF of the new
# page (without changing objgen, to preserve references)
log.debug("Emplacement update")
with pikepdf.open(image) as pdf_image:
replacements += 1
image_page = pdf_image.pages[0]
pdf_base.pages[page_num - 1] = image_page
# We're adding a new page, which will get a new objgen number pair,
# so we need to update any references to it. qpdf did not like
# my attempt to update the old object in place, but that is an
# option to consider
pagerefs[old_objgen] = pdf_base.pages[page_num - 1].objgen
replacing = True
emplacements += 1
foreign_image_page = pdf_image.pages[0]
pdf_base.pages.append(foreign_image_page)
local_image_page = pdf_base.pages[-1]
pdf_base.pages[page_num - 1].emplace(local_image_page)
del pdf_base.pages[-1]
emplaced_page = True
autorotate_correction = context.get_rotation(page_num - 1)
if replacing:
if emplaced_page:
content_rotation = autorotate_correction
text_rotation = autorotate_correction
text_misaligned = (text_rotation - content_rotation) % 360
@@ -395,7 +298,7 @@ def weave_layers(infiles, output_file, log, context):
content_rotation - autorotate_correction
) % 360
if replacements % MAX_REPLACE_PAGES == 0:
if emplacements % MAX_REPLACE_PAGES == 0:
# Periodically save and reload the Pdf object. This will keep a
# lid on our memory usage for very large files. Attach the font to
# page 1 even if page 1 doesn't use it, so we have a way to get it
@@ -405,14 +308,25 @@ def weave_layers(infiles, output_file, log, context):
_update_page_resources(
page=page0, font=font, font_key=font_key, procset=procset
)
interim = output_file + f'_working{page_num}.pdf'
pdf_base.save(interim)
# We cannot read and write the same file; that will corrupt it,
# but we don't want to keep more copies than we need to. Delete intermediates.
# {interim_count} is the opened file we were updating
# {interim_count - 1} can be deleted
# {interim_count + 1} is the new file we will produce and open
old_file = output_file + f'_working{interim_count - 1}.pdf'
if not context.get_options().keep_temporary_files:
with suppress(FileNotFoundError):
os.unlink(old_file)
next_file = output_file + f'_working{interim_count + 1}.pdf'
pdf_base.save(next_file)
pdf_base.close()
pdf_base = pikepdf.open(interim)
pdf_base = pikepdf.open(next_file)
procset = pdf_base.pages[0].Resources.ProcSet
font, font_key = None, None # Reacquire this information
font, font_key = None, None # Ensure we reacquire this information
interim_count += 1
_fix_toc(pdf_base, pagerefs, log)
pdf_base.save(output_file)
pdf_base.close()

View File

@@ -129,8 +129,7 @@ def rasterize_pdf(
:param filter_vector: if True, remove vector graphics objects
:return:
"""
res = xres, yres
int_res = round(xres), round(yres)
res = round(xres, 6), round(yres, 6)
if not page_dpi:
page_dpi = res
@@ -145,7 +144,7 @@ def rasterize_pdf(
f'-sDEVICE={raster_device}',
f'-dFirstPage={pageno}',
f'-dLastPage={pageno}',
f'-r{str(int_res[0])}x{str(int_res[1])}',
f'-r{res[0]:f}x{res[1]:f}',
]
+ (['-dFILTERVECTOR'] if filter_vector else [])
+ [
@@ -168,23 +167,8 @@ def rasterize_pdf(
log.error('Ghostscript rasterizing failed')
raise SubprocessOutputError()
# Ghostscript only accepts integers for output resolution
# if the resolution happens to be fractional, then the discrepancy
# would change the size of the output page, especially if the DPI
# is quite low. Resize the image to the expected size
tmp.seek(0)
with Image.open(tmp) as im:
expected_size = (
round(im.size[0] / int_res[0] * res[0]),
round(im.size[1] / int_res[1] * res[1]),
)
if expected_size != im.size or page_dpi != (xres, yres):
log.debug(
f"Ghostscript: resize output image {im.size} -> {expected_size}"
)
im = im.resize(expected_size)
if rotation is not None:
log.debug("Rotating output by %i", rotation)
# rotation is a clockwise angle and Image.ROTATE_* is
@@ -269,7 +253,6 @@ def generate_pdfa(
"-dBATCH",
"-dNOPAUSE",
"-dCompatibilityLevel=" + str(pdf_version),
"-dNumRenderingThreads=" + str(threads),
"-sDEVICE=pdfwrite",
"-dAutoRotatePages=/None",
"-sColorConversionStrategy=" + strategy,

BIN
tests/resources/link.pdf Normal file
View File

Binary file not shown.

81
tests/test_ghostscript.py Normal file
View File

@@ -0,0 +1,81 @@
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
from decimal import Decimal
import pikepdf
import pytest
from PIL import Image
from ocrmypdf.exec.ghostscript import rasterize_pdf
@pytest.fixture
def linn(resources):
    """Provide ``(path, pdf)`` for the ``linn.pdf`` test resource.

    The PDF is opened for the duration of the requesting test and closed
    afterwards, so the file handle is not leaked.
    """
    path = resources / 'linn.pdf'
    with pikepdf.open(path) as pdf:
        yield path, pdf
def test_rasterize_size(linn, outdir, caplog):
    """Rasterize at a fractional resolution and check the exact output size.

    The resolution passed to rasterize_pdf is derived from the page's
    MediaBox so that the PNG should come out at exactly the wanted size, with
    the requested DPI recorded in the image.
    """
    input_pdf, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    # The size computation below relies on the page origin being (0, 0)
    assert mediabox[0] == mediabox[1] == 0

    # MediaBox extents divided by 72 (assumed: points -> inches)
    divisor = Decimal(72)
    page_width = mediabox[2] / divisor
    page_height = mediabox[3] / divisor

    want_width, want_height = Decimal('200.0'), Decimal('150.0')
    want_dpi = (42.0, 4242.0)
    output_png = outdir / 'out.png'

    rasterize_pdf(
        input_pdf,
        output_png,
        want_width / page_width,
        want_height / page_height,
        raster_device='pngmono',
        log=logging.getLogger(),
        page_dpi=want_dpi,
    )

    with Image.open(output_png) as im:
        assert im.size == (want_width, want_height)
        assert im.info['dpi'] == want_dpi
def test_rasterize_rotated(linn, outdir, caplog):
    """Rasterize with ``rotation=90`` and check size and DPI are swapped.

    Same setup as the unrotated size test, but the output image is expected
    to have its width/height and its x/y DPI exchanged.
    """
    input_pdf, pdf = linn
    mediabox = pdf.pages[0].MediaBox
    # The size computation below relies on the page origin being (0, 0)
    assert mediabox[0] == mediabox[1] == 0

    # MediaBox extents divided by 72 (assumed: points -> inches)
    divisor = Decimal(72)
    page_width = mediabox[2] / divisor
    page_height = mediabox[3] / divisor

    want_width, want_height = Decimal('200.0'), Decimal('150.0')
    want_dpi = (42.0, 4242.0)
    output_png = outdir / 'out.png'

    root_logger = logging.getLogger()
    caplog.set_level(logging.DEBUG)

    rasterize_pdf(
        input_pdf,
        output_png,
        want_width / page_width,
        want_height / page_height,
        raster_device='pngmono',
        log=root_logger,
        page_dpi=want_dpi,
        rotation=90,
    )

    with Image.open(output_png) as im:
        assert im.size == (want_height, want_width)
        assert im.info['dpi'] == (want_dpi[1], want_dpi[0])

View File

@@ -285,7 +285,7 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
assert isinstance(p.root.Outlines.First, pikepdf.Dictionary)
def test_metadata_fixup_warning(resources, outdir):
def test_metadata_fixup_warning(resources, outdir, caplog):
from ocrmypdf._pipeline import metadata_fixup
input_files = [
@@ -296,7 +296,7 @@ def test_metadata_fixup_warning(resources, outdir):
for f in input_files:
copyfile(resources / 'graph.pdf', f)
log = MagicMock()
log = logging.getLogger()
context = MagicMock()
metadata_fixup(
input_files_groups=input_files,
@@ -304,7 +304,8 @@ def test_metadata_fixup_warning(resources, outdir):
log=log,
context=context,
)
log.warning.assert_not_called()
for record in caplog.records:
assert record.levelname != 'WARNING'
# Now add some metadata that will not be copyable
graph = pikepdf.open(outdir / 'graph.repaired.pdf')
@@ -312,7 +313,7 @@ def test_metadata_fixup_warning(resources, outdir):
meta['prism2:publicationName'] = 'OCRmyPDF Test'
graph.save(outdir / 'graph.repaired.pdf')
log = MagicMock()
log = logging.getLogger()
context = MagicMock()
metadata_fixup(
input_files_groups=input_files,
@@ -320,7 +321,7 @@ def test_metadata_fixup_warning(resources, outdir):
log=log,
context=context,
)
log.warning.assert_called_once()
assert any(record.levelname == 'WARNING' for record in caplog.records)
def test_prevent_gs_invalid_xml(resources, outdir):

View File

@@ -16,9 +16,10 @@
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import argparse
import logging
from os import fspath
from pathlib import Path
from unittest.mock import MagicMock, patch
from unittest.mock import patch
import pytest
@@ -57,7 +58,7 @@ def test_no_unpaper(resources, no_outpdf):
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
with pytest.raises(SystemExit):
main.check_options(options, log=MagicMock())
main.check_options(options, log=logging.getLogger())
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):

View File

@@ -15,35 +15,15 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from unittest.mock import MagicMock
import logging
import os
import pytest
import pikepdf
from ocrmypdf._weave import _fix_toc, _update_page_resources
check_ocrmypdf = pytest.helpers.check_ocrmypdf
def test_invalid_toc(resources, outdir, caplog):
    """Confirm a corrupt TOC entry is reported in the log, not raised."""
    corrupted = outdir / 'test.pdf'

    # Context managers ensure both PDF handles are closed again.
    with pikepdf.open(resources / 'toc.pdf') as pdf:
        # Corrupt a TOC entry
        pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
        pdf.save(corrupted)

    with pikepdf.open(corrupted) as pdf:
        remap = {}
        remap[pdf.pages[0].objgen] = pdf.pages[0].objgen  # Dummy remap

        # Confirm we complain about the TOC and don't throw an exception
        log = logging.getLogger()
        _fix_toc(pdf, remap, log)

    assert 'invalid table of contents entries' in caplog.text
def test_no_glyphless_weave(resources, outdir):
pdf = pikepdf.open(resources / 'francais.pdf')
pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
@@ -62,3 +42,21 @@ def test_no_glyphless_weave(resources, outdir):
'0',
env=env,
)
@pytest.helpers.needs_pdfminer
def test_links(resources, outpdf):
    """Check that link annotations still target the right pages after OCR.

    In the output, the first annotation on page 1 must point at page 2 and
    the first annotation on page 2 must point at page 1.
    """
    check_ocrmypdf(
        resources / 'link.pdf',
        outpdf,
        '--redo-ocr',
        '--oversample',
        '200',
        '--output-type',
        'pdf',
    )
    # Open via a context manager so the output file is closed again.
    with pikepdf.open(outpdf) as pdf:
        p1 = pdf.pages[0]
        p2 = pdf.pages[1]
        assert p1.Annots[0].A.D[0].objgen == p2.objgen
        assert p2.Annots[0].A.D[0].objgen == p1.objgen