From df1fda74388edd2125852db7cf2dde9840c3e7e7 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Wed, 2 Dec 2015 23:16:36 -0800
Subject: [PATCH] pageinfo: workaround PyPDF extractText limitations on hidden
 text

It appears that extractText() does not find all text. At a glance it
may be that Tesseract's PDF renderer generates a font and uses glyphs
that map to different Unicode code points than PyPDF expects, so PyPDF
discards the content and finds nothing. As a proxy in lieu of better
PDF parsing, assume that a "GlyphLessFont" means there is text there.

I had previously found that it does not work to check for the presence
of a font on the page. Some PDF generators create a font resource entry
even if the font is never called for.
---
 ocrmypdf/pageinfo.py | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py
index 7923ccc6..d1d76929 100644
--- a/ocrmypdf/pageinfo.py
+++ b/ocrmypdf/pageinfo.py
@@ -112,12 +112,16 @@ def _page_has_text(pdf, page):
 
     # More nuanced test to deal with quirks of Tesseract PDF generation
     # Check if there's a Glyphless font
-    font = page['/Resources']['/Font']
-    font_objects = list(font.keys())
-    for font_object in font_objects:
-        basefont = font[font_object]['/BaseFont']
-        if basefont.endswith('GlyphLessFont'):
-            return True
+    try:
+        font = page['/Resources']['/Font']
+    except KeyError:
+        pass
+    else:
+        font_objects = list(font.keys())
+        for font_object in font_objects:
+            basefont = font[font_object]['/BaseFont']
+            if basefont.endswith('GlyphLessFont'):
+                return True
 
     return False