mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 12:04:44 -04:00
--redo-ocr now works in the presence of printable text
This commit is contained in:
@@ -576,7 +576,7 @@ def select_ocr_image(
|
||||
options = context.get_options()
|
||||
pageinfo = get_pageinfo(image, context)
|
||||
|
||||
if options.force_ocr or (options.redo_ocr and pageinfo.only_ocr_text):
|
||||
if options.force_ocr:
|
||||
re_symlink(image, output_file, log)
|
||||
return
|
||||
|
||||
@@ -590,7 +590,7 @@ def select_ocr_image(
|
||||
|
||||
xres, yres = im.info['dpi']
|
||||
log.debug('resolution %r %r', xres, yres)
|
||||
for textarea in pageinfo.get_textareas():
|
||||
for textarea in pageinfo.get_textareas(visible=True):
|
||||
# Calculate resolution based on the image size and page dimensions
|
||||
# without regard whatever resolution is in pageinfo (may differ or
|
||||
# be None)
|
||||
|
||||
@@ -380,9 +380,7 @@ def weave_layers(
|
||||
|
||||
if text and font:
|
||||
# Graft the text layer onto this page, whether new or old
|
||||
# strip_old = context.get_options().redo_ocr
|
||||
strip_old = (context.get_options().redo_ocr
|
||||
and pdfinfo[page_num - 1].only_ocr_text)
|
||||
strip_old = context.get_options().redo_ocr
|
||||
_weave_layers_graft(
|
||||
pdf_base=pdf_base, page_num=page_num, text=text, font=font,
|
||||
font_key=font_key, rotation=text_misaligned, procset=procset,
|
||||
|
||||
@@ -594,15 +594,30 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
|
||||
|
||||
page = pdf.pages[pageno]
|
||||
|
||||
pageinfo['textinfo'] = _page_get_textblocks(
|
||||
fspath(infile), pageno, xmltext=xmltext)
|
||||
# pageinfo['textinfo'] = _page_get_textblocks(
|
||||
# fspath(infile), pageno, xmltext=xmltext)
|
||||
|
||||
from .layout import get_textblocks
|
||||
with Path(infile).open('rb') as f:
|
||||
pageinfo['objects'] = get_textblocks(f, pageno)
|
||||
|
||||
mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
|
||||
width_pt = mediabox[2] - mediabox[0]
|
||||
height_pt = mediabox[3] - mediabox[1]
|
||||
|
||||
def bboxes(hierarchical_textinfo):
    """Yield the bounding box of each text line/box in a nested layout.

    Walks ``hierarchical_textinfo`` (any iterable, possibly nested) and
    yields ``obj.bbox`` for every element that is a pdfminer ``LTTextLine``
    or ``LTTextBox``. Other elements are descended into recursively.

    The type test is applied to each *element* rather than to the container
    being iterated. Testing the container yielded the same bbox once per
    child and skipped a text box with no children entirely.
    """
    from pdfminer.layout import LTTextLine, LTTextBox

    for obj in hierarchical_textinfo:
        if isinstance(obj, (LTTextLine, LTTextBox)):
            # A text line/box is reported as a whole; do not descend into it.
            yield obj.bbox
        else:
            try:
                yield from bboxes(obj)
            except TypeError:
                # Presumably obj is a non-iterable leaf, so there is nothing
                # to descend into -- skip it.
                continue
|
||||
|
||||
pageinfo['has_text'] = _page_has_text(
|
||||
pageinfo['textinfo'], width_pt, height_pt)
|
||||
bboxes(pageinfo['objects']), width_pt, height_pt)
|
||||
|
||||
userunit = page.get('/UserUnit', Decimal(1.0))
|
||||
if not isinstance(userunit, Decimal):
|
||||
@@ -732,8 +747,22 @@ class PageInfo:
|
||||
def images(self):
    """Return the entry stored under ``'images'`` in this page's info."""
    page_images = self._pageinfo['images']
    return page_images
|
||||
|
||||
def get_textareas(self):
    """Yield each item stored under ``'textinfo'`` in this page's info."""
    for area in self._pageinfo['textinfo']:
        yield area
|
||||
def get_textareas(self, visible=True, invisible=True):
    """Yield bounding boxes of text boxes found in this page's objects.

    ``self._pageinfo['objects']`` is indexed as a pair: element 0 is walked
    when ``visible`` is true, then element 1 when ``invisible`` is true.
    With both flags false nothing is yielded and the page info is not read.
    """

    def bboxes(objs):
        # Recursively walk objs, yielding the bbox of every LTTextBox.
        from pdfminer.layout import LTTextBox

        for item in objs:
            if not isinstance(item, LTTextBox):
                try:
                    yield from bboxes(item)
                except TypeError:
                    # Presumably item is not iterable; skip it.
                    pass
                continue
            yield item.bbox

    for wanted, index in ((visible, 0), (invisible, 1)):
        if wanted:
            yield from bboxes(self._pageinfo['objects'][index])
|
||||
|
||||
@property
|
||||
def xres(self):
|
||||
|
||||
Reference in New Issue
Block a user