mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Bug fix: --force-ocr should still run on pages with no images
Useful for people who want to reprocess text. This also requires --oversample because DPI is undefined. To be fixed in next commit.
This commit is contained in:
@@ -413,27 +413,34 @@ def is_ocr_required(pageinfo, log):
|
||||
page = pageinfo['pageno'] + 1
|
||||
ocr_required = True
|
||||
if not pageinfo['images']:
|
||||
# If the page has no images, then it contains vector content or text
|
||||
# or both. It seems quite unlikely that one would find meaningful text
|
||||
# from rasterizing vector content. So skip the page.
|
||||
log.info(
|
||||
"{0:4d}: page has no images - skipping OCR".format(page)
|
||||
)
|
||||
ocr_required = False
|
||||
msg = "{0:4d}: page has no images - {1}"
|
||||
|
||||
if options.force_ocr:
|
||||
# Someone wanted to do this to fix a PDF with text objects but a
|
||||
# broken toUnicode mapping
|
||||
log.warning(
|
||||
msg.format(
|
||||
page,
|
||||
"rasterizing anyway because --force-ocr was specified"))
|
||||
else:
|
||||
#
|
||||
log.info(msg.format(page,
|
||||
"skipping all processing on this page"))
|
||||
ocr_required = False
|
||||
elif pageinfo['has_text']:
|
||||
s = "{0:4d}: page already has text! – {1}"
|
||||
msg = "{0:4d}: page already has text! – {1}"
|
||||
|
||||
if not options.force_ocr and not options.skip_text:
|
||||
log.error(s.format(page,
|
||||
"aborting (use --force-ocr to force OCR)"))
|
||||
log.error(msg.format(page,
|
||||
"aborting (use --force-ocr to force OCR)"))
|
||||
sys.exit(ExitCode.already_done_ocr)
|
||||
elif options.force_ocr:
|
||||
log.info(s.format(page,
|
||||
"rasterizing text and running OCR anyway"))
|
||||
log.info(msg.format(page,
|
||||
"rasterizing text and running OCR anyway"))
|
||||
ocr_required = True
|
||||
elif options.skip_text:
|
||||
log.info(s.format(page,
|
||||
"skipping all processing on this page"))
|
||||
log.info(msg.format(page,
|
||||
"skipping all processing on this page"))
|
||||
ocr_required = False
|
||||
|
||||
if ocr_required and options.skip_big:
|
||||
@@ -866,7 +873,6 @@ def add_text_layer(
|
||||
if 'writeToStream' in str(e) or 'invalid literal' in str(e):
|
||||
raise PdfMergeFailedError() from e
|
||||
|
||||
|
||||
pdf_output = pypdf.PdfFileWriter()
|
||||
pdf_output.addPage(page_text)
|
||||
|
||||
|
||||
@@ -420,6 +420,17 @@ def test_blank_input_pdf():
|
||||
assert p.returncode == ExitCode.ok
|
||||
|
||||
|
||||
def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
    """Check that --force-ocr on a PDF with no images still runs tesseract.

    As a correctness test, make sure that --force-ocr on a PDF with no
    content still triggers tesseract. If tesseract crashes, then it was
    called, which shows up as a ``child_process_error`` exit code.
    """
    # Keep the output name in one place so the existence check below looks
    # at the same file that was requested as output.
    output_name = 'wont_be_created.pdf'

    p, _, err = run_ocrmypdf_env(
        'blank.pdf', output_name, '--force-ocr',
        env=spoof_tesseract_crash)

    assert p.returncode == ExitCode.child_process_error, err
    # The run is expected to fail, so no output file should exist.
    assert not os.path.exists(_outfile(output_name))
|
||||
|
||||
|
||||
def test_french(spoof_tesseract_cache):
|
||||
p, out, err = run_ocrmypdf_env(
|
||||
'francais.pdf', 'francais.pdf', '-l', 'fra', env=spoof_tesseract_cache)
|
||||
|
||||
Reference in New Issue
Block a user