From 339afb02aa17ba244deeca1fd18cbe73f5e906f8 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Thu, 25 Oct 2018 16:53:47 -0700
Subject: [PATCH] --redo-ocr now works in the presence of printable text

---
 src/ocrmypdf/_pipeline.py        |  4 ++--
 src/ocrmypdf/_weave.py           |  4 +---
 src/ocrmypdf/pdfinfo/__init__.py | 39 ++++++++++++++++++++++++++++----
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 3eb20d28..519e6ace 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -576,7 +576,7 @@ def select_ocr_image(
     options = context.get_options()
     pageinfo = get_pageinfo(image, context)
 
-    if options.force_ocr or (options.redo_ocr and pageinfo.only_ocr_text):
+    if options.force_ocr:
         re_symlink(image, output_file, log)
         return
 
@@ -590,7 +590,7 @@ def select_ocr_image(
 
         xres, yres = im.info['dpi']
         log.debug('resolution %r %r', xres, yres)
-        for textarea in pageinfo.get_textareas():
+        for textarea in pageinfo.get_textareas(visible=True):
             # Calculate resolution based on the image size and page dimensions
             # without regard whatever resolution is in pageinfo (may differ or
             # be None)
diff --git a/src/ocrmypdf/_weave.py b/src/ocrmypdf/_weave.py
index 1574314d..c719796e 100644
--- a/src/ocrmypdf/_weave.py
+++ b/src/ocrmypdf/_weave.py
@@ -380,9 +380,7 @@ def weave_layers(
 
         if text and font:
             # Graft the text layer onto this page, whether new or old
-#            strip_old = context.get_options().redo_ocr
-            strip_old = (context.get_options().redo_ocr
-                         and pdfinfo[page_num - 1].only_ocr_text)
+            strip_old = context.get_options().redo_ocr
             _weave_layers_graft(
                 pdf_base=pdf_base, page_num=page_num, text=text, font=font,
                 font_key=font_key, rotation=text_misaligned, procset=procset,
diff --git a/src/ocrmypdf/pdfinfo/__init__.py b/src/ocrmypdf/pdfinfo/__init__.py
index bf89465d..0ea265f7 100644
--- a/src/ocrmypdf/pdfinfo/__init__.py
+++ b/src/ocrmypdf/pdfinfo/__init__.py
@@ -594,15 +594,30 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
 
     page = pdf.pages[pageno]
 
-    pageinfo['textinfo'] = _page_get_textblocks(
-        fspath(infile), pageno, xmltext=xmltext)
+    # pageinfo['textinfo'] = _page_get_textblocks(
+    #     fspath(infile), pageno, xmltext=xmltext)
+
+    from .layout import get_textblocks
+    with Path(infile).open('rb') as f:
+        pageinfo['objects'] = get_textblocks(f, pageno)
 
     mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
     width_pt = mediabox[2] - mediabox[0]
     height_pt = mediabox[3] - mediabox[1]
 
+    def bboxes(hierarchical_textinfo):  # yield the bbox of each text line/box in the tree
+        from pdfminer.layout import LTTextLine, LTTextBox
+        for obj in hierarchical_textinfo:
+            if isinstance(obj, (LTTextLine, LTTextBox)):  # test the child, not its container
+                yield obj.bbox
+            else:
+                try:
+                    yield from bboxes(obj)  # descend into nested containers
+                except TypeError:  # raised e.g. when obj is not iterable; skip it
+                    continue
+
     pageinfo['has_text'] = _page_has_text(
-        pageinfo['textinfo'], width_pt, height_pt)
+        bboxes(pageinfo['objects']), width_pt, height_pt)
 
     userunit = page.get('/UserUnit', Decimal(1.0))
     if not isinstance(userunit, Decimal):
@@ -732,8 +747,22 @@ class PageInfo:
     def images(self):
         return self._pageinfo['images']
 
-    def get_textareas(self):
-        yield from self._pageinfo['textinfo']
+    def get_textareas(self, visible=True, invisible=True):  # yield text-box bboxes
+        def bboxes(objs):  # recursively walk objs, yielding the bbox of each LTTextBox
+            from pdfminer.layout import LTTextBox
+            for obj in objs:
+                if isinstance(obj, LTTextBox):
+                    yield obj.bbox
+                else:
+                    try:
+                        yield from bboxes(obj)
+                    except TypeError:  # raised e.g. when obj is not iterable; skip it
+                        continue
+
+        if visible:  # element 0 of 'objects' is treated as the visible text
+            yield from bboxes(self._pageinfo['objects'][0])
+        if invisible:  # element 1 of 'objects' is treated as the invisible text
+            yield from bboxes(self._pageinfo['objects'][1])
 
     @property
     def xres(self):