Bug fix: --force-ocr should still run on pages with no images

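This is useful for people who want to reprocess pages that already contain text. For now, --force-ocr on such pages also requires --oversample, because the DPI of a page with no images is undefined; this will be fixed in the next commit.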
2026-05-18 19:47:48 -04:00 · 2016-07-27 15:06:49 -07:00
parent 8458a51860
commit 16e4d342d2
2 changed files with 32 additions and 15 deletions
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -413,27 +413,34 @@ def is_ocr_required(pageinfo, log):
    page = pageinfo['pageno'] + 1
    ocr_required = True
    if not pageinfo['images']:
-        # If the page has no images, then it contains vector content or text
-        # or both. It seems quite unlikely that one would find meaningful text
-        # from rasterizing vector content. So skip the page.
-        log.info(
-            "{0:4d}: page has no images - skipping OCR".format(page)
-        )
-        ocr_required = False
+        msg = "{0:4d}: page has no images - {1}"
+
+        if options.force_ocr:
+            # A user needed this to repair a PDF whose text objects have a
+            # broken ToUnicode mapping
+            log.warning(
+                msg.format(
+                    page,
+                    "rasterizing anyway because --force-ocr was specified"))
+        else:
+            # Vector content and/or text only; OCR is unlikely to find more
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
+            ocr_required = False
    elif pageinfo['has_text']:
-        s = "{0:4d}: page already has text! – {1}"
+        msg = "{0:4d}: page already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
-            log.error(s.format(page,
-                               "aborting (use --force-ocr to force OCR)"))
+            log.error(msg.format(page,
+                                 "aborting (use --force-ocr to force OCR)"))
            sys.exit(ExitCode.already_done_ocr)
        elif options.force_ocr:
-            log.info(s.format(page,
-                              "rasterizing text and running OCR anyway"))
+            log.info(msg.format(page,
+                                "rasterizing text and running OCR anyway"))
            ocr_required = True
        elif options.skip_text:
-            log.info(s.format(page,
-                              "skipping all processing on this page"))
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
            ocr_required = False

    if ocr_required and options.skip_big:
@@ -866,7 +873,6 @@ def add_text_layer(
        if 'writeToStream' in str(e) or 'invalid literal' in str(e):
            raise PdfMergeFailedError() from e

-
    pdf_output = pypdf.PdfFileWriter()
    pdf_output.addPage(page_text)

--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -420,6 +420,17 @@ def test_blank_input_pdf():
    assert p.returncode == ExitCode.ok


+def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
+    # Correctness check: --force-ocr on a PDF whose page has no images must
+    # still invoke tesseract. If the spoofed tesseract crashes, we know it
+    # was called. The output checked below must be the one requested here.
+    p, _, err = run_ocrmypdf_env(
+        'blank.pdf', 'wont_be_created.pdf', '--force-ocr',
+        env=spoof_tesseract_crash)
+    assert p.returncode == ExitCode.child_process_error, err
+    assert not os.path.exists(_outfile('wont_be_created.pdf'))
+
+
 def test_french(spoof_tesseract_cache):
    p, out, err = run_ocrmypdf_env(
        'francais.pdf', 'francais.pdf', '-l', 'fra', env=spoof_tesseract_cache)