Bug fix: --force-ocr should still run on pages with no images

Useful for people who want to re-OCR pages that already contain text, e.g. a PDF whose text objects have a broken toUnicode mapping.

For now this also requires --oversample, because the DPI is undefined in
this case. That will be fixed in the next commit.
This commit is contained in:
James R. Barlow
2016-07-27 15:06:49 -07:00
parent 8458a51860
commit 16e4d342d2
2 changed files with 32 additions and 15 deletions

View File

@@ -413,27 +413,34 @@ def is_ocr_required(pageinfo, log):
page = pageinfo['pageno'] + 1
ocr_required = True
if not pageinfo['images']:
# If the page has no images, then it contains vector content or text
# or both. It seems quite unlikely that one would find meaningful text
# from rasterizing vector content. So skip the page.
log.info(
"{0:4d}: page has no images - skipping OCR".format(page)
)
ocr_required = False
msg = "{0:4d}: page has no images - {1}"
if options.force_ocr:
# Someone wanted to do this to fix a PDF with text objects but a
# broken toUnicode mapping
log.warning(
msg.format(
page,
"rasterizing anyway because --force-ocr was specified"))
else:
#
log.info(msg.format(page,
"skipping all processing on this page"))
ocr_required = False
elif pageinfo['has_text']:
s = "{0:4d}: page already has text! {1}"
msg = "{0:4d}: page already has text! {1}"
if not options.force_ocr and not options.skip_text:
log.error(s.format(page,
"aborting (use --force-ocr to force OCR)"))
log.error(msg.format(page,
"aborting (use --force-ocr to force OCR)"))
sys.exit(ExitCode.already_done_ocr)
elif options.force_ocr:
log.info(s.format(page,
"rasterizing text and running OCR anyway"))
log.info(msg.format(page,
"rasterizing text and running OCR anyway"))
ocr_required = True
elif options.skip_text:
log.info(s.format(page,
"skipping all processing on this page"))
log.info(msg.format(page,
"skipping all processing on this page"))
ocr_required = False
if ocr_required and options.skip_big:
@@ -866,7 +873,6 @@ def add_text_layer(
if 'writeToStream' in str(e) or 'invalid literal' in str(e):
raise PdfMergeFailedError() from e
pdf_output = pypdf.PdfFileWriter()
pdf_output.addPage(page_text)

View File

@@ -420,6 +420,17 @@ def test_blank_input_pdf():
assert p.returncode == ExitCode.ok
def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
    """Check that --force-ocr still invokes tesseract on a page with no images.

    The run is given the ``spoof_tesseract_crash`` environment, so a
    child-process-error exit code is the evidence that tesseract was
    actually called. The test then checks that no output file was left
    behind by the failed run.
    """
    # As a correctness test, make sure that --force-ocr on a PDF with no
    # content still triggers tesseract. If tesseract crashes, then it was
    # called.
    # Use a single name for the output so the existence check below looks at
    # the same file the run was asked to create (previously it checked a
    # different, never-requested filename and so could never fail).
    output_name = 'wont_be_created.pdf'
    p, _, err = run_ocrmypdf_env(
        'blank.pdf', output_name, '--force-ocr',
        env=spoof_tesseract_crash)
    assert p.returncode == ExitCode.child_process_error, err
    # NOTE(review): presumably _outfile() maps an output name to the path
    # run_ocrmypdf_env writes to -- confirm against the helper's definition.
    assert not os.path.exists(_outfile(output_name))
def test_french(spoof_tesseract_cache):
p, out, err = run_ocrmypdf_env(
'francais.pdf', 'francais.pdf', '-l', 'fra', env=spoof_tesseract_cache)