mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 05:05:44 -04:00
Fix some failing tests after --redo-ocr changes
This commit is contained in:
@@ -589,6 +589,18 @@ def _page_has_text(text_blocks, page_width, page_height):
|
||||
return has_text
|
||||
|
||||
|
||||
def simplify_textboxes(miner):
    """Yield a plain-dict summary for every text box found in *miner*.

    Each text box returned by ``filter_textboxes`` (with an accept-everything
    predicate) is reduced to a dict with three keys:

    * ``'is_visible'`` -- False when the box's first character has
      ``rendermode`` 3 (the PDF "neither fill nor stroke" text mode).
    * ``'is_corrupt'`` -- True when the first character's text is U+FFFD,
      the Unicode replacement character.
    * ``'bbox'`` -- the text box's ``bbox`` attribute, passed through as is.

    Only the first character of the first line of a box is inspected; it is
    treated as representative of the whole box.
    """
    def accept_all(_textbox):
        return True

    for textbox in filter_textboxes(miner, accept_all):
        # First child of the box is its first line; first child of that
        # line is the character used as the sample for the whole box.
        lead_char = textbox._objs[0]._objs[0]
        yield {
            'is_visible': lead_char.rendermode != 3,
            'is_corrupt': lead_char.get_text() == '\ufffd',
            'bbox': textbox.bbox,
        }
|
||||
|
||||
|
||||
def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
|
||||
pageinfo = {}
|
||||
pageinfo['pageno'] = pageno
|
||||
@@ -600,15 +612,14 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
|
||||
# fspath(infile), pageno, xmltext=xmltext)
|
||||
|
||||
with Path(infile).open('rb') as f:
|
||||
pageinfo['objects'] = get_textblocks(f, pageno)
|
||||
miner = get_textblocks(f, pageno)
|
||||
pageinfo['textobjs'] = list(simplify_textboxes(miner))
|
||||
|
||||
mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
|
||||
width_pt = mediabox[2] - mediabox[0]
|
||||
height_pt = mediabox[3] - mediabox[1]
|
||||
|
||||
bboxes = (textbox.bbox for textbox in filter_textboxes(
|
||||
pageinfo['objects'], lambda obj: True)
|
||||
)
|
||||
bboxes = (obj['bbox'] for obj in pageinfo['textobjs'])
|
||||
pageinfo['has_text'] = _page_has_text(
|
||||
bboxes, width_pt, height_pt
|
||||
)
|
||||
@@ -734,10 +745,18 @@ class PageInfo:
|
||||
return self._pageinfo['images']
|
||||
|
||||
def get_textareas(self, visible=None, corrupt=None):
    """Return the bounding boxes of this page's text objects, filtered.

    The page's text objects are read from ``self._pageinfo['textobjs']``;
    each is a dict carrying ``'is_visible'``, ``'is_corrupt'`` and
    ``'bbox'`` entries.

    Args:
        visible: If not None, keep only objects whose ``'is_visible'``
            value equals this. None disables the visibility filter.
        corrupt: If not None, keep only objects whose ``'is_corrupt'``
            value equals this. None disables the corruption filter.

    Returns:
        A generator yielding the ``'bbox'`` value of every matching object,
        in the order they are stored.
    """
    def predicate(obj, want_visible, want_corrupt):
        # A filter set to None means "don't care" and never rejects.
        if want_visible is not None and obj['is_visible'] != want_visible:
            return False
        if want_corrupt is not None and obj['is_corrupt'] != want_corrupt:
            return False
        return True

    return (obj['bbox'] for obj in self._pageinfo['textobjs']
            if predicate(obj, visible, corrupt))
|
||||
|
||||
@property
|
||||
def xres(self):
|
||||
|
||||
@@ -22,6 +22,7 @@ import pdfminer.pdfinterp
|
||||
import pdfminer.pdfdevice
|
||||
|
||||
from pdfminer.converter import PDFLayoutAnalyzer
|
||||
from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
|
||||
from pdfminer.glyphlist import glyphname2unicode
|
||||
from pdfminer.layout import (
|
||||
LTChar, LTContainer, LTLayoutContainer, LTPage, LTTextLine, LAParams,
|
||||
@@ -31,6 +32,8 @@ from pdfminer.pdffont import PDFUnicodeNotDefined, PDFType3Font
|
||||
from pdfminer.pdfpage import PDFPage
|
||||
from pdfminer.utils import matrix2str, bbox2str, fsplit
|
||||
|
||||
from ..exceptions import EncryptedPdfError
|
||||
|
||||
|
||||
# Fix pdfminer's regex in name2unicode function
|
||||
# Font cids that are mapped to names of the form /g123 seem to be, by convention
|
||||
@@ -137,7 +140,10 @@ def get_textblocks(infile, pageno):
|
||||
|
||||
page = PDFPage.get_pages(infile, pagenos=[pageno], maxpages=0)
|
||||
|
||||
interp.process_page(next(page))
|
||||
try:
|
||||
interp.process_page(next(page))
|
||||
except PDFTextExtractionNotAllowed as e:
|
||||
raise EncryptedPdfError()
|
||||
|
||||
return dev.get_result()
|
||||
|
||||
|
||||
@@ -377,7 +377,7 @@ def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error,
|
||||
def test_algo4(resources, spoof_tesseract_noop, outpdf):
    """Run ocrmypdf on ``encrypted_algo4.pdf`` and expect the encrypted-PDF exit code.

    Only one expectation about the return code can hold, so the test asserts
    ``ExitCode.encrypted_pdf`` alone rather than also asserting
    ``ExitCode.ok`` on the same value.
    """
    p, _, _ = run_ocrmypdf(resources / 'encrypted_algo4.pdf', outpdf,
                           env=spoof_tesseract_noop)
    assert p.returncode == ExitCode.encrypted_pdf
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', RENDERERS)
|
||||
|
||||
Reference in New Issue
Block a user