mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 13:16:55 -04:00
Fix issue when weave handoff occurs with no OCR font present
When --tesseract-timeout 0 is combined with any image processing option on a file with more than 100 pages, the weave handoff occurs. Ensure the handoff works correctly even if no Glyphless font is present. Closes #347
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
|
||||
from itertools import groupby
|
||||
from pathlib import Path
|
||||
import os
|
||||
|
||||
import pikepdf
|
||||
|
||||
@@ -24,6 +25,9 @@ from .exec import tesseract
|
||||
from .helpers import flatten_groups, page_number
|
||||
|
||||
|
||||
MAX_OPEN_PAGE_PDFS = int(os.environ.get('_OCRMYPDF_MAX_OPEN_PAGE_PDFS', 100))
|
||||
|
||||
|
||||
def _update_page_resources(*, page, font, font_key, procset):
|
||||
"""Update this page's fonts with a reference to the Glyphless font"""
|
||||
|
||||
@@ -34,7 +38,7 @@ def _update_page_resources(*, page, font, font_key, procset):
|
||||
fonts = resources['/Font']
|
||||
except KeyError:
|
||||
fonts = pikepdf.Dictionary({})
|
||||
if font_key not in fonts:
|
||||
if font_key is not None and font_key not in fonts:
|
||||
fonts[font_key] = font
|
||||
resources['/Font'] = fonts
|
||||
|
||||
@@ -177,6 +181,8 @@ def _find_font(text, pdf_base):
|
||||
break
|
||||
if pdf_text_font:
|
||||
font = pdf_base.copy_foreign(pdf_text_font)
|
||||
if font_key is None:
|
||||
print('font_key is None')
|
||||
return font, font_key
|
||||
|
||||
|
||||
@@ -246,10 +252,10 @@ def _fix_toc(pdf_base, pageref_remap, log):
|
||||
invalidated to its new one.
|
||||
"""
|
||||
try:
|
||||
pageref = dest_node[0]
|
||||
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
|
||||
new_objgen = pageref_remap[pageref.objgen]
|
||||
dest_node[0] = pdf_base.get_object(new_objgen)
|
||||
pageref = dest_node[0]
|
||||
if pageref['/Type'] == '/Page' and pageref.objgen in pageref_remap:
|
||||
new_objgen = pageref_remap[pageref.objgen]
|
||||
dest_node[0] = pdf_base.get_object(new_objgen)
|
||||
except (IndexError, TypeError) as e:
|
||||
log.warning("This file may contain invalid table of contents entries")
|
||||
log.debug(e)
|
||||
@@ -392,7 +398,7 @@ def weave_layers(infiles, output_file, log, context):
|
||||
content_rotation - autorotate_correction
|
||||
) % 360
|
||||
|
||||
if len(keep_open) > 100:
|
||||
if len(keep_open) > MAX_OPEN_PAGE_PDFS:
|
||||
# qpdf limitations require us to keep files open when we intend
|
||||
# to copy content from them before saving. However, we want to keep
|
||||
# a lid on file handles and memory usage, so for big files we're
|
||||
@@ -409,7 +415,7 @@ def weave_layers(infiles, output_file, log, context):
|
||||
|
||||
pdf_base = pikepdf.open(interim)
|
||||
procset = pdf_base.pages[0].Resources.ProcSet
|
||||
font = pdf_base.pages[0].Resources.Font.get(font_key)
|
||||
font, font_key = None, None # Reacquire this information
|
||||
|
||||
_fix_toc(pdf_base, pagerefs, log)
|
||||
pdf_base.save(output_file)
|
||||
|
||||
@@ -349,7 +349,7 @@ def generate_pdf(
|
||||
if os.path.exists(prefix + '.txt'):
|
||||
shutil.move(prefix + '.txt', output_text)
|
||||
except TimeoutExpired:
|
||||
page_timedout(log, input_image)
|
||||
page_timedout(log, input_image, timeout)
|
||||
use_skip_page(text_only, skip_pdf, output_pdf, output_text)
|
||||
except CalledProcessError as e:
|
||||
tesseract_log_output(log, e.output, input_image)
|
||||
|
||||
@@ -17,20 +17,24 @@
|
||||
|
||||
from unittest.mock import MagicMock
|
||||
import logging
|
||||
import os
|
||||
|
||||
import pytest
|
||||
|
||||
import pikepdf
|
||||
from ocrmypdf._weave import _fix_toc
|
||||
from ocrmypdf._weave import _fix_toc, _update_page_resources
|
||||
|
||||
def test_invalid_toc(resources, tmpdir, caplog):
|
||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||
|
||||
|
||||
def test_invalid_toc(resources, outdir, caplog):
|
||||
pdf = pikepdf.open(resources / 'toc.pdf')
|
||||
|
||||
# Corrupt a TOC entry
|
||||
pdf.Root.Outlines.Last.Dest = pikepdf.Array([None, 0.0, 0.1, 0.2])
|
||||
pdf.save(tmpdir / 'test.pdf')
|
||||
pdf.save(outdir / 'test.pdf')
|
||||
|
||||
pdf = pikepdf.open(tmpdir / 'test.pdf')
|
||||
pdf = pikepdf.open(outdir / 'test.pdf')
|
||||
remap = {}
|
||||
remap[pdf.pages[0].objgen] = pdf.pages[0].objgen # Dummy remap
|
||||
|
||||
@@ -38,3 +42,23 @@ def test_invalid_toc(resources, tmpdir, caplog):
|
||||
log = logging.getLogger()
|
||||
_fix_toc(pdf, remap, log)
|
||||
assert 'invalid table of contents entries' in caplog.text
|
||||
|
||||
|
||||
def test_no_glyphless_weave(resources, outdir):
|
||||
pdf = pikepdf.open(resources / 'francais.pdf')
|
||||
pdf_aspect = pikepdf.open(resources / 'aspect.pdf')
|
||||
pdf_cmyk = pikepdf.open(resources / 'cmyk.pdf')
|
||||
pdf.pages.extend(pdf_aspect.pages)
|
||||
pdf.pages.extend(pdf_cmyk.pages)
|
||||
pdf.save(outdir / 'test.pdf')
|
||||
|
||||
env = os.environ.copy()
|
||||
env['_OCRMYPDF_MAX_OPEN_PAGE_PDFS'] = '2'
|
||||
check_ocrmypdf(
|
||||
outdir / 'test.pdf',
|
||||
outdir / 'out.pdf',
|
||||
'--deskew',
|
||||
'--tesseract-timeout',
|
||||
'0',
|
||||
env=env,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user