From 16e4d342d2d18e88f36a1a12e679380655cdc577 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Wed, 27 Jul 2016 15:06:49 -0700
Subject: [PATCH] Bug fix: --force-ocr should still run on pages with no images

Useful for people who want to reprocess text.

For now this also requires --oversample, because the DPI of a page with no
images is undefined. To be fixed in the next commit.
---
 ocrmypdf/main.py   | 36 +++++++++++++++++++++---------------
 tests/test_main.py | 11 +++++++++++
 2 files changed, 32 insertions(+), 15 deletions(-)

diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py
index 6351ebda..c4098c68 100755
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -413,27 +413,34 @@ def is_ocr_required(pageinfo, log):
     page = pageinfo['pageno'] + 1
     ocr_required = True
     if not pageinfo['images']:
-        # If the page has no images, then it contains vector content or text
-        # or both. It seems quite unlikely that one would find meaningful text
-        # from rasterizing vector content. So skip the page.
-        log.info(
-            "{0:4d}: page has no images - skipping OCR".format(page)
-        )
-        ocr_required = False
+        msg = "{0:4d}: page has no images - {1}"
+
+        if options.force_ocr:
+            # Someone wanted to do this to fix a PDF with text objects but a
+            # broken toUnicode mapping
+            log.warning(
+                msg.format(
+                    page,
+                    "rasterizing anyway because --force-ocr was specified"))
+        else:
+            # No images and no --force-ocr: nothing to rasterize, skip page
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
+            ocr_required = False
     elif pageinfo['has_text']:
-        s = "{0:4d}: page already has text! – {1}"
+        msg = "{0:4d}: page already has text! – {1}"
 
         if not options.force_ocr and not options.skip_text:
-            log.error(s.format(page,
-                               "aborting (use --force-ocr to force OCR)"))
+            log.error(msg.format(page,
+                                 "aborting (use --force-ocr to force OCR)"))
             sys.exit(ExitCode.already_done_ocr)
         elif options.force_ocr:
-            log.info(s.format(page,
-                              "rasterizing text and running OCR anyway"))
+            log.info(msg.format(page,
+                                "rasterizing text and running OCR anyway"))
             ocr_required = True
         elif options.skip_text:
-            log.info(s.format(page,
-                              "skipping all processing on this page"))
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
             ocr_required = False
 
     if ocr_required and options.skip_big:
@@ -866,7 +873,6 @@ def add_text_layer(
         if 'writeToStream' in str(e) or 'invalid literal' in str(e):
             raise PdfMergeFailedError() from e
 
-
     pdf_output = pypdf.PdfFileWriter()
     pdf_output.addPage(page_text)
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 266a1a36..f36eb3ab 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -420,6 +420,17 @@ def test_blank_input_pdf():
     assert p.returncode == ExitCode.ok
 
 
+def test_force_ocr_on_pdf_with_no_images(spoof_tesseract_crash):
+    # Correctness check: --force-ocr on a PDF with no images must still
+    # trigger tesseract. With the crashing tesseract spoof, a
+    # child_process_error exit proves it was called; no output is written.
+    p, _, err = run_ocrmypdf_env(
+        'blank.pdf', 'wont_be_created.pdf', '--force-ocr',
+        env=spoof_tesseract_crash)
+    assert p.returncode == ExitCode.child_process_error, err
+    assert not os.path.exists(_outfile('wont_be_created.pdf'))
+
+
 def test_french(spoof_tesseract_cache):
     p, out, err = run_ocrmypdf_env(
         'francais.pdf', 'francais.pdf', '-l', 'fra', env=spoof_tesseract_cache)