From 16e4d342d2d18e88f36a1a12e679380655cdc577 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Wed, 27 Jul 2016 15:06:49 -0700
Subject: [PATCH] Bug fix: --force-ocr should still run on pages with no images

Useful for people who want to reprocess text. This also requires
--oversample because DPI is undefined. To be fixed in next commit.
---
 ocrmypdf/main.py   | 36 +++++++++++++++++++++---------------
 tests/test_main.py | 11 +++++++++++
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py
index 6351ebda..c4098c68 100755
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -413,27 +413,34 @@ def is_ocr_required(pageinfo, log):
     page = pageinfo['pageno'] + 1
     ocr_required = True
     if not pageinfo['images']:
-        # If the page has no images, then it contains vector content or text
-        # or both. It seems quite unlikely that one would find meaningful text
-        # from rasterizing vector content. So skip the page.
-        log.info(
-            "{0:4d}: page has no images - skipping OCR".format(page)
-        )
-        ocr_required = False
+        msg = "{0:4d}: page has no images - {1}"
+
+        if options.force_ocr:
+            # Someone wanted to do this to fix a PDF with text objects but a
+            # broken toUnicode mapping
+            log.warning(
+                msg.format(
+                    page,
+                    "rasterizing anyway because --force-ocr was specified"))
+        else:
+            # Only vector content and/or text here; OCR is unlikely to help
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
+            ocr_required = False
     elif pageinfo['has_text']:
-        s = "{0:4d}: page already has text! – {1}"
+        msg = "{0:4d}: page already has text! – {1}"
 
         if not options.force_ocr and not options.skip_text:
-            log.error(s.format(page,
-                               "aborting (use --force-ocr to force OCR)"))
+            log.error(msg.format(page,
+                                 "aborting (use --force-ocr to force OCR)"))
             sys.exit(ExitCode.already_done_ocr)
         elif options.force_ocr:
-            log.info(s.format(page,
-                              "rasterizing text and running OCR anyway"))
+            log.info(msg.format(page,
+                                "rasterizing text and running OCR anyway"))
             ocr_required = True
         elif options.skip_text:
-            log.info(s.format(page,
-                              "skipping all processing on this page"))
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
             ocr_required = False
 
     if ocr_required and options.skip_big:
@@ -866,7 +873,6 @@ def add_text_layer(
         if 'writeToStream' in str(e) or 'invalid literal' in str(e):
             raise PdfMergeFailedError() from e
 
-
     pdf_output = pypdf.PdfFileWriter()
     pdf_output.addPage(page_text)
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 266a1a36..f36eb3ab 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -420,6 +420,17 @@ def test_blank_input_pdf():
     assert p.returncode == ExitCode.ok
 
 
+def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
+    # As a correctness test, make sure that --force-ocr on a PDF with no
+    # content still triggers tesseract. If tesseract crashes, then it was
+    # called.
+    p, _, err = run_ocrmypdf_env(
+        'blank.pdf', 'wont_be_created.pdf', '--force-ocr',
+        env=spoof_tesseract_crash)
+    assert p.returncode == ExitCode.child_process_error, err
+    assert not os.path.exists(_outfile('wont_be_created.pdf'))
+
+
 def test_french(spoof_tesseract_cache):
     p, out, err = run_ocrmypdf_env(
         'francais.pdf', 'francais.pdf', '-l', 'fra', env=spoof_tesseract_cache)