From df1fda74388edd2125852db7cf2dde9840c3e7e7 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 23:16:36 -0800 Subject: [PATCH] pageinfo: workaround PyPDF extractText limitations on hidden text It appears that extractText() does not find all text. At a glance, it may be that Tesseract's PDF renderer generates a font and uses glyphs that map to different Unicode code points than PyPDF expects, so it discards the content and finds nothing. As a proxy in lieu of better PDF parsing, assume that a "GlyphLessFont" means there is text there. I had previously found it does not work to check for the presence of a font on the page. Some PDF generators create a font resource entry even if the font is never called for. --- ocrmypdf/pageinfo.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index 7923ccc6..d1d76929 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -112,12 +112,16 @@ def _page_has_text(pdf, page): # More nuanced test to deal with quirks of Tesseract PDF generation # Check if there's a Glyphless font - font = page['/Resources']['/Font'] - font_objects = list(font.keys()) - for font_object in font_objects: - basefont = font[font_object]['/BaseFont'] - if basefont.endswith('GlyphLessFont'): - return True + try: + font = page['/Resources']['/Font'] + except KeyError: + pass + else: + font_objects = list(font.keys()) + for font_object in font_objects: + basefont = font[font_object]['/BaseFont'] + if basefont.endswith('GlyphLessFont'): + return True return False