Fix some failing tests after --redo-ocr changes

This commit is contained in:
James R. Barlow
2018-10-29 11:49:38 -07:00
parent 8e396f4be2
commit de80fb6bc8
3 changed files with 35 additions and 10 deletions

View File

@@ -589,6 +589,18 @@ def _page_has_text(text_blocks, page_width, page_height):
return has_text
def simplify_textboxes(miner):
    """Yield a small plain-dict summary for every text box found in *miner*.

    Text boxes are enumerated with ``filter_textboxes`` using an accept-all
    predicate.  Only the first character of the first line of each box is
    inspected, and it is taken as representative of the whole box.

    Each yielded dict has the keys:

    * ``'is_visible'`` -- False when that character's render mode is 3
      (the PDF "invisible text" rendering mode), True otherwise.
    * ``'is_corrupt'`` -- True when that character's text is U+FFFD, the
      Unicode replacement character.
    * ``'bbox'`` -- the ``bbox`` attribute of the text box, passed through
      unchanged.
    """
    for textbox in filter_textboxes(miner, lambda x: True):
        # Sample the leading glyph: first child of the first child of the box.
        lead_char = textbox._objs[0]._objs[0]
        yield {
            'is_visible': lead_char.rendermode != 3,
            'is_corrupt': lead_char.get_text() == '\ufffd',
            'bbox': textbox.bbox,
        }
def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
pageinfo = {}
pageinfo['pageno'] = pageno
@@ -600,15 +612,14 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
# fspath(infile), pageno, xmltext=xmltext)
with Path(infile).open('rb') as f:
pageinfo['objects'] = get_textblocks(f, pageno)
miner = get_textblocks(f, pageno)
pageinfo['textobjs'] = list(simplify_textboxes(miner))
mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
width_pt = mediabox[2] - mediabox[0]
height_pt = mediabox[3] - mediabox[1]
bboxes = (textbox.bbox for textbox in filter_textboxes(
pageinfo['objects'], lambda obj: True)
)
bboxes = (obj['bbox'] for obj in pageinfo['textobjs'])
pageinfo['has_text'] = _page_has_text(
bboxes, width_pt, height_pt
)
@@ -734,10 +745,18 @@ class PageInfo:
return self._pageinfo['images']
def get_textareas(self, visible=None, corrupt=None):
    """Return the bounding boxes of this page's text objects.

    The boxes come from the simplified ``'textobjs'`` records stored in
    ``self._pageinfo``; each record is a dict with ``'bbox'``,
    ``'is_visible'`` and ``'is_corrupt'`` entries.

    Args:
        visible: ``None`` to ignore visibility, otherwise only records
            whose ``'is_visible'`` equals this value are kept.
        corrupt: ``None`` to ignore corruption, otherwise only records
            whose ``'is_corrupt'`` equals this value are kept.

    Returns:
        A generator over the ``'bbox'`` value of every matching record, in
        stored order.
    """
    def matches(obj):
        # A filter set to None imposes no constraint; otherwise the
        # record's flag must equal the requested value.
        if visible is not None and obj['is_visible'] != visible:
            return False
        if corrupt is not None and obj['is_corrupt'] != corrupt:
            return False
        return True

    return (obj['bbox'] for obj in self._pageinfo['textobjs']
            if matches(obj))
@property
def xres(self):

View File

@@ -22,6 +22,7 @@ import pdfminer.pdfinterp
import pdfminer.pdfdevice
from pdfminer.converter import PDFLayoutAnalyzer
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
from pdfminer.glyphlist import glyphname2unicode
from pdfminer.layout import (
LTChar, LTContainer, LTLayoutContainer, LTPage, LTTextLine, LAParams,
@@ -31,6 +32,8 @@ from pdfminer.pdffont import PDFUnicodeNotDefined, PDFType3Font
from pdfminer.pdfpage import PDFPage
from pdfminer.utils import matrix2str, bbox2str, fsplit
from ..exceptions import EncryptedPdfError
# Fix pdfminer's regex in name2unicode function
# Font cids that are mapped to names of the form /g123 seem to be, by convention
@@ -137,7 +140,10 @@ def get_textblocks(infile, pageno):
page = PDFPage.get_pages(infile, pagenos=[pageno], maxpages=0)
interp.process_page(next(page))
try:
interp.process_page(next(page))
except PDFTextExtractionNotAllowed as e:
raise EncryptedPdfError()
return dev.get_result()

View File

@@ -377,7 +377,7 @@ def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error,
def test_algo4(resources, spoof_tesseract_noop, outpdf):
    """Run ocrmypdf on ``encrypted_algo4.pdf`` and check its exit code.

    The run is expected to finish with ``ExitCode.encrypted_pdf`` rather
    than ``ExitCode.ok``.
    """
    p, _, _ = run_ocrmypdf(resources / 'encrypted_algo4.pdf', outpdf,
                           env=spoof_tesseract_noop)
    # A single expectation only: asserting ExitCode.ok as well would
    # contradict this check and make the test impossible to pass.
    assert p.returncode == ExitCode.encrypted_pdf
@pytest.mark.parametrize('renderer', RENDERERS)