--redo-ocr now works in the presence of printable text

This commit is contained in:
James R. Barlow
2018-10-25 16:53:47 -07:00
parent 7ba0ff5c36
commit 339afb02aa
3 changed files with 37 additions and 10 deletions

View File

@@ -576,7 +576,7 @@ def select_ocr_image(
options = context.get_options()
pageinfo = get_pageinfo(image, context)
if options.force_ocr or (options.redo_ocr and pageinfo.only_ocr_text):
if options.force_ocr:
re_symlink(image, output_file, log)
return
@@ -590,7 +590,7 @@ def select_ocr_image(
xres, yres = im.info['dpi']
log.debug('resolution %r %r', xres, yres)
for textarea in pageinfo.get_textareas():
for textarea in pageinfo.get_textareas(visible=True):
# Calculate resolution based on the image size and page dimensions,
# without regard to whatever resolution is in pageinfo (it may differ
# or be None)

View File

@@ -380,9 +380,7 @@ def weave_layers(
if text and font:
# Graft the text layer onto this page, whether new or old
# strip_old = context.get_options().redo_ocr
strip_old = (context.get_options().redo_ocr
and pdfinfo[page_num - 1].only_ocr_text)
strip_old = context.get_options().redo_ocr
_weave_layers_graft(
pdf_base=pdf_base, page_num=page_num, text=text, font=font,
font_key=font_key, rotation=text_misaligned, procset=procset,

View File

@@ -594,15 +594,30 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
page = pdf.pages[pageno]
pageinfo['textinfo'] = _page_get_textblocks(
fspath(infile), pageno, xmltext=xmltext)
# pageinfo['textinfo'] = _page_get_textblocks(
# fspath(infile), pageno, xmltext=xmltext)
from .layout import get_textblocks
with Path(infile).open('rb') as f:
pageinfo['objects'] = get_textblocks(f, pageno)
mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
width_pt = mediabox[2] - mediabox[0]
height_pt = mediabox[3] - mediabox[1]
def bboxes(hierarchical_textinfo):
    """Yield the ``bbox`` of each text box or text line in a nested layout.

    ``hierarchical_textinfo`` is an iterable of layout objects that may
    themselves be iterable. An element that is an ``LTTextLine`` or
    ``LTTextBox`` is reported once and is not descended into; any other
    element is searched recursively.
    """
    from pdfminer.layout import LTTextLine, LTTextBox
    for obj in hierarchical_textinfo:
        # Inspect the element, not the container being iterated. Testing
        # the container never matched at the level it was passed in, and
        # one level down it re-emitted the parent's bbox once per child.
        if isinstance(obj, (LTTextLine, LTTextBox)):
            yield obj.bbox
        else:
            try:
                yield from bboxes(obj)
            except TypeError:
                # Raised when obj cannot be iterated, i.e. there is
                # nothing below it to search.
                continue
pageinfo['has_text'] = _page_has_text(
pageinfo['textinfo'], width_pt, height_pt)
bboxes(pageinfo['objects']), width_pt, height_pt)
userunit = page.get('/UserUnit', Decimal(1.0))
if not isinstance(userunit, Decimal):
@@ -732,8 +747,22 @@ class PageInfo:
# Accessor: hands back the 'images' entry stored in this object's
# _pageinfo mapping, unchanged.
def images(self):
return self._pageinfo['images']
# Generator: yields, one at a time, every item of the 'textinfo' entry
# stored in this object's _pageinfo mapping.
def get_textareas(self):
yield from self._pageinfo['textinfo']
def get_textareas(self, visible=True, invisible=True):
    """Yield the ``bbox`` of every ``LTTextBox`` recorded for this page.

    ``self._pageinfo['objects']`` is indexed at 0 when *visible* is true
    and at 1 when *invisible* is true, in that order (presumably the
    visible and invisible text layers -- confirm against the producer of
    ``'objects'``). With both flags false nothing is yielded and the
    page info is never read.
    """

    def _textbox_bboxes(container):
        # Walk a nested iterable, reporting text boxes and descending
        # into everything else.
        from pdfminer.layout import LTTextBox

        for item in container:
            if not isinstance(item, LTTextBox):
                try:
                    yield from _textbox_bboxes(item)
                except TypeError:
                    # item could not be iterated; skip it
                    pass
                continue
            yield item.bbox

    for wanted, index in ((visible, 0), (invisible, 1)):
        if wanted:
            yield from _textbox_bboxes(self._pageinfo['objects'][index])
@property
def xres(self):