Preserve "text as curves" vector content

The checking logic was never updated to handle a pure vector file with no text that needs an OCR layer. This is doable, so allow it.
2026-05-04 04:35:49 -04:00 · 2018-02-07 16:05:48 -08:00
parent 4a61beae41
commit 1dfc32d7e6
2 changed files with 28 additions and 19 deletions
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -244,7 +244,30 @@ def get_canvas_square_dpi(pageinfo, options):
 def is_ocr_required(pageinfo, log, options):
    page = pageinfo.pageno + 1
    ocr_required = True
-    if not pageinfo.images:
+
+    if pageinfo.has_text:
+        msg = "{0:4d}: page already has text! – {1}"
+
+        if not options.force_ocr and not options.skip_text:
+            log.error(msg.format(page,
+                                 "aborting (use --force-ocr to force OCR)"))
+            raise PriorOcrFoundError()
+        elif options.force_ocr:
+            log.info(msg.format(page,
+                                "rasterizing text and running OCR anyway"))
+            ocr_required = True
+        elif options.skip_text:
+            log.info(msg.format(page,
+                                "skipping all processing on this page"))
+            ocr_required = False
+    elif not pageinfo.images and not options.lossless_reconstruction:
+        # We found a page with no images and no text. That means it may
+        # have vector art that the user wants to OCR. If we determined
+        # lossless reconstruction is not possible then we have to rasterize
+        # the image. So if OCR is being forced, take that to mean YES, go 
+        # ahead and rasterize. If not forced, then pretend there's no text
+        # on the page at all so we don't lose anything.
+        # This could be made smarter by explicitly searching for vector art.
        if options.force_ocr and options.oversample:
            # The user really wants to reprocess this file
            log.info(
@@ -263,23 +286,9 @@ def is_ocr_required(pageinfo, log, options):
        else:
            log.info(
                "{0:4d}: page has no images - "
-                "skipping all processing on this page".format(page))
-            ocr_required = False
-
-    elif pageinfo.has_text:
-        msg = "{0:4d}: page already has text! – {1}"
-
-        if not options.force_ocr and not options.skip_text:
-            log.error(msg.format(page,
-                                 "aborting (use --force-ocr to force OCR)"))
-            raise PriorOcrFoundError()
-        elif options.force_ocr:
-            log.info(msg.format(page,
-                                "rasterizing text and running OCR anyway"))
-            ocr_required = True
-        elif options.skip_text:
-            log.info(msg.format(page,
-                                "skipping all processing on this page"))
+                "skipping all processing on this page to avoid losing detail. "
+                "Use --force-ocr if you wish to perform OCR on pages that "
+                "have vector content.".format(page))
            ocr_required = False

    if ocr_required and options.skip_big and pageinfo.images:
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -433,7 +433,7 @@ def test_klingon(resources, outpdf):

 def test_missing_docinfo(spoof_tesseract_noop, resources, outpdf):
    p, out, err = run_ocrmypdf(
-        resources / 'missing_docinfo.pdf', outpdf, '-l', 'eng', '-c',
+        resources / 'missing_docinfo.pdf', outpdf, '-l', 'eng', '--skip-text',
        env=spoof_tesseract_noop)
    assert p.returncode == ExitCode.ok, err