From 339afb02aa17ba244deeca1fd18cbe73f5e906f8 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Thu, 25 Oct 2018 16:53:47 -0700
Subject: [PATCH] --redo-ocr now works in the presence of printable text

---
Review notes (text in this section is not part of the commit message):

 * select_ocr_image: the masking loop now passes
   invisible=not options.redo_ocr. Both keywords of get_textareas() default
   to True, so the earlier get_textareas(visible=True) call also masked the
   old invisible OCR layer; with strip_old now unconditional under
   --redo-ocr, that text would be removed without being re-recognised.
   Behaviour in other modes is unchanged (both kinds are still masked).
 * _pdf_get_pageinfo.bboxes: test and yield each child `obj`, not the
   container being iterated. The earlier form yielded a text box's bbox once
   per child line and never for a childless box.
 * Leading whitespace and blank context lines were reconstructed from the
   hunk line counts; the index hashes are those of the original commit.

 src/ocrmypdf/_pipeline.py        |  4 ++--
 src/ocrmypdf/_weave.py           |  4 +---
 src/ocrmypdf/pdfinfo/__init__.py | 39 ++++++++++++++++++++++++++++----
 3 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 3eb20d28..519e6ace 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -576,7 +576,7 @@ def select_ocr_image(
     options = context.get_options()
     pageinfo = get_pageinfo(image, context)
 
-    if options.force_ocr or (options.redo_ocr and pageinfo.only_ocr_text):
+    if options.force_ocr:
         re_symlink(image, output_file, log)
         return
 
@@ -590,7 +590,7 @@ def select_ocr_image(
         xres, yres = im.info['dpi']
         log.debug('resolution %r %r', xres, yres)
 
-        for textarea in pageinfo.get_textareas():
+        for textarea in pageinfo.get_textareas(invisible=not options.redo_ocr):
             # Calculate resolution based on the image size and page dimensions
             # without regard whatever resolution is in pageinfo (may differ or
             # be None)
diff --git a/src/ocrmypdf/_weave.py b/src/ocrmypdf/_weave.py
index 1574314d..c719796e 100644
--- a/src/ocrmypdf/_weave.py
+++ b/src/ocrmypdf/_weave.py
@@ -380,9 +380,7 @@ def weave_layers(
 
         if text and font:
             # Graft the text layer onto this page, whether new or old
-#            strip_old = context.get_options().redo_ocr
-            strip_old = (context.get_options().redo_ocr
-                         and pdfinfo[page_num - 1].only_ocr_text)
+            strip_old = context.get_options().redo_ocr
             _weave_layers_graft(
                 pdf_base=pdf_base, page_num=page_num, text=text, font=font,
                 font_key=font_key, rotation=text_misaligned, procset=procset,
diff --git a/src/ocrmypdf/pdfinfo/__init__.py b/src/ocrmypdf/pdfinfo/__init__.py
index bf89465d..0ea265f7 100644
--- a/src/ocrmypdf/pdfinfo/__init__.py
+++ b/src/ocrmypdf/pdfinfo/__init__.py
@@ -594,15 +594,30 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
 
     page = pdf.pages[pageno]
 
-    pageinfo['textinfo'] = _page_get_textblocks(
-        fspath(infile), pageno, xmltext=xmltext)
+    # pageinfo['textinfo'] = _page_get_textblocks(
+    #     fspath(infile), pageno, xmltext=xmltext)
+
+    from .layout import get_textblocks
+    with Path(infile).open('rb') as f:
+        pageinfo['objects'] = get_textblocks(f, pageno)
 
     mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
     width_pt = mediabox[2] - mediabox[0]
     height_pt = mediabox[3] - mediabox[1]
 
+    def bboxes(hierarchical_textinfo):
+        from pdfminer.layout import LTTextLine, LTTextBox
+        for obj in hierarchical_textinfo:
+            if isinstance(obj, (LTTextLine, LTTextBox)):
+                yield obj.bbox
+            else:
+                try:
+                    yield from bboxes(obj)
+                except TypeError:
+                    continue
+
     pageinfo['has_text'] = _page_has_text(
-        pageinfo['textinfo'], width_pt, height_pt)
+        bboxes(pageinfo['objects']), width_pt, height_pt)
 
     userunit = page.get('/UserUnit', Decimal(1.0))
     if not isinstance(userunit, Decimal):
@@ -732,8 +747,22 @@ class PageInfo:
     def images(self):
         return self._pageinfo['images']
 
-    def get_textareas(self):
-        yield from self._pageinfo['textinfo']
+    def get_textareas(self, visible=True, invisible=True):
+        def bboxes(objs):
+            from pdfminer.layout import LTTextBox
+            for obj in objs:
+                if isinstance(obj, LTTextBox):
+                    yield obj.bbox
+                else:
+                    try:
+                        yield from bboxes(obj)
+                    except TypeError:
+                        continue
+
+        if visible:
+            yield from bboxes(self._pageinfo['objects'][0])
+        if invisible:
+            yield from bboxes(self._pageinfo['objects'][1])
 
     @property
     def xres(self):