--redo-ocr now works in the presence of printable text

2026-05-19 12:04:44 -04:00 · 2018-10-25 16:53:47 -07:00
parent 7ba0ff5c36
commit 339afb02aa
3 changed files with 37 additions and 10 deletions
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -576,7 +576,7 @@ def select_ocr_image(
    options = context.get_options()
    pageinfo = get_pageinfo(image, context)

-    if options.force_ocr or (options.redo_ocr and pageinfo.only_ocr_text):
+    if options.force_ocr:
        re_symlink(image, output_file, log)
        return

@@ -590,7 +590,7 @@ def select_ocr_image(

        xres, yres = im.info['dpi']
        log.debug('resolution %r %r', xres, yres)
-        for textarea in pageinfo.get_textareas():
+        for textarea in pageinfo.get_textareas(visible=True):
            # Calculate resolution based on the image size and page dimensions
            # without regard whatever resolution is in pageinfo (may differ or
            # be None)
--- a/src/ocrmypdf/_weave.py
+++ b/src/ocrmypdf/_weave.py
@@ -380,9 +380,7 @@ def weave_layers(

        if text and font:
            # Graft the text layer onto this page, whether new or old
-#            strip_old = context.get_options().redo_ocr
-            strip_old = (context.get_options().redo_ocr
-                         and pdfinfo[page_num - 1].only_ocr_text)
+            strip_old = context.get_options().redo_ocr
            _weave_layers_graft(
                pdf_base=pdf_base, page_num=page_num, text=text, font=font,
                font_key=font_key, rotation=text_misaligned, procset=procset,
--- a/src/ocrmypdf/pdfinfo/__init__.py
+++ b/src/ocrmypdf/pdfinfo/__init__.py
@@ -594,15 +594,30 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):

    page = pdf.pages[pageno]

-    pageinfo['textinfo'] = _page_get_textblocks(
-        fspath(infile), pageno, xmltext=xmltext)
+    # pageinfo['textinfo'] = _page_get_textblocks(
+    #     fspath(infile), pageno, xmltext=xmltext)
+
+    from .layout import get_textblocks
+    with Path(infile).open('rb') as f:
+        pageinfo['objects'] = get_textblocks(f, pageno)

    mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
    width_pt = mediabox[2] - mediabox[0]
    height_pt = mediabox[3] - mediabox[1]

+    def bboxes(hierarchical_textinfo):  # recursively yield .bbox of each text line/box
+        from pdfminer.layout import LTTextLine, LTTextBox
+        for obj in hierarchical_textinfo:
+            if isinstance(obj, (LTTextLine, LTTextBox)):  # test the child, not its container
+                yield obj.bbox
+            else:
+                try:
+                    yield from bboxes(obj)  # descend into nested containers
+                except TypeError:  # raised when obj cannot be iterated; skip it
+                    continue
+
    pageinfo['has_text'] = _page_has_text(
-        pageinfo['textinfo'], width_pt, height_pt)
+        bboxes(pageinfo['objects']), width_pt, height_pt)

    userunit = page.get('/UserUnit', Decimal(1.0))
    if not isinstance(userunit, Decimal):
@@ -732,8 +747,22 @@ class PageInfo:
    def images(self):
        return self._pageinfo['images']

-    def get_textareas(self):
-        yield from self._pageinfo['textinfo']
+    def get_textareas(self, visible=True, invisible=True):  # generator of text-box bboxes
+        def bboxes(objs):  # depth-first walk yielding .bbox of every LTTextBox found
+            from pdfminer.layout import LTTextBox
+            for obj in objs:
+                if isinstance(obj, LTTextBox):
+                    yield obj.bbox
+                else:
+                    try:
+                        yield from bboxes(obj)  # descend into nested containers
+                    except TypeError:  # raised when obj cannot be iterated; skip it
+                        continue
+
+        if visible:  # element 0 of 'objects' is walked when visible is truthy
+            yield from bboxes(self._pageinfo['objects'][0])
+        if invisible:  # element 1 of 'objects' is walked when invisible is truthy
+            yield from bboxes(self._pageinfo['objects'][1])

    @property
    def xres(self):