Fix issue when weave handoff occurs with no OCR font present

When --tesseract-timeout 0 is combined with any image processing on a
file with more than 100 pages, the weave handoff occurs. Ensure the
handoff works correctly even if no Glyphless font is present.

Closes #347
This commit is contained in:
James R. Barlow
2019-02-10 01:52:31 -08:00
parent df688742d5
commit 19e35db2b7
3 changed files with 42 additions and 12 deletions

View File

@@ -17,6 +17,7 @@
from itertools import groupby
from pathlib import Path
import os
import pikepdf
@@ -24,6 +25,9 @@ from .exec import tesseract
from .helpers import flatten_groups, page_number
MAX_OPEN_PAGE_PDFS = int(os.environ.get('_OCRMYPDF_MAX_OPEN_PAGE_PDFS', 100))
def _update_page_resources(*, page, font, font_key, procset):
"""Update this page's fonts with a reference to the Glyphless font"""
@@ -34,7 +38,7 @@ def _update_page_resources(*, page, font, font_key, procset):
fonts = resources['/Font']
except KeyError:
fonts = pikepdf.Dictionary({})
if font_key not in fonts:
if font_key is not None and font_key not in fonts:
fonts[font_key] = font
resources['/Font'] = fonts
@@ -177,6 +181,8 @@ def _find_font(text, pdf_base):
break
if pdf_text_font:
font = pdf_base.copy_foreign(pdf_text_font)
if font_key is None:
print('font_key is None')
return font, font_key
@@ -246,10 +252,10 @@ def _fix_toc(pdf_base, pageref_remap, log):
invalidated to its new one.
"""
try:
pageref = dest_node[0]
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
new_objgen = pageref_remap[pageref.objgen]
dest_node[0] = pdf_base.get_object(new_objgen)
pageref = dest_node[0]
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
new_objgen = pageref_remap[pageref.objgen]
dest_node[0] = pdf_base.get_object(new_objgen)
except (IndexError, TypeError) as e:
log.warning("This file may contain invalid table of contents entries")
log.debug(e)
@@ -392,7 +398,7 @@ def weave_layers(infiles, output_file, log, context):
content_rotation - autorotate_correction
) % 360
if len(keep_open) > 100:
if len(keep_open) > MAX_OPEN_PAGE_PDFS:
# qpdf limitations require us to keep files open when we intend
# to copy content from them before saving. However, we want to keep
# a lid on file handles and memory usage, so for big files we're
@@ -409,7 +415,7 @@ def weave_layers(infiles, output_file, log, context):
pdf_base = pikepdf.open(interim)
procset = pdf_base.pages[0].Resources.ProcSet
font = pdf_base.pages[0].Resources.Font.get(font_key)
font, font_key = None, None # Reacquire this information
_fix_toc(pdf_base, pagerefs, log)
pdf_base.save(output_file)

View File

@@ -349,7 +349,7 @@ def generate_pdf(
if os.path.exists(prefix + '.txt'):
shutil.move(prefix + '.txt', output_text)
except TimeoutExpired:
page_timedout(log, input_image)
page_timedout(log, input_image, timeout)
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
except CalledProcessError as e:
tesseract_log_output(log, e.output, input_image)

View File

@@ -17,20 +17,24 @@
from unittest.mock import MagicMock
import logging
import os
import pytest
import pikepdf
from ocrmypdf._weave import _fix_toc
from ocrmypdf._weave import _fix_toc, _update_page_resources
def test_invalid_toc(resources, tmpdir, caplog):
check_ocrmypdf = pytest.helpers.check_ocrmypdf
def test_invalid_toc(resources, outdir, caplog):
pdf = pikepdf.open(resources / 'toc.pdf')
# Corrupt a TOC entry
pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
pdf.save(tmpdir / 'test.pdf')
pdf.save(outdir / 'test.pdf')
pdf = pikepdf.open(tmpdir / 'test.pdf')
pdf = pikepdf.open(outdir / 'test.pdf')
remap = {}
remap[pdf.pages[0].objgen] = pdf.pages[0].objgen # Dummy remap
@@ -38,3 +42,23 @@ def test_invalid_toc(resources, tmpdir, caplog):
log = logging.getLogger()
_fix_toc(pdf, remap, log)
assert 'invalid table of contents entries' in caplog.text
def test_no_glyphless_weave(resources, outdir):
    """Run ocrmypdf on a PDF stitched together from three resource files.

    The input is built by appending the pages of ``aspect.pdf`` and
    ``cmyk.pdf`` to ``francais.pdf``. The run uses ``--deskew`` and
    ``--tesseract-timeout 0`` with ``_OCRMYPDF_MAX_OPEN_PAGE_PDFS`` set to
    ``'2'`` in the child environment.
    """
    input_path = outdir / 'test.pdf'
    output_path = outdir / 'out.pdf'

    # Keep every source Pdf referenced until the combined file is saved.
    # NOTE(review): presumably required so pages can be copied from them.
    source_names = ('francais.pdf', 'aspect.pdf', 'cmyk.pdf')
    base, *extras = [pikepdf.open(resources / name) for name in source_names]
    for extra in extras:
        base.pages.extend(extra.pages)
    base.save(input_path)

    # NOTE(review): the lowered limit is presumably meant to trigger the
    # weave handoff on this small input -- confirm against weave_layers.
    child_env = dict(os.environ, _OCRMYPDF_MAX_OPEN_PAGE_PDFS='2')
    check_ocrmypdf(
        input_path,
        output_path,
        '--deskew',
        '--tesseract-timeout',
        '0',
        env=child_env,
    )