diff --git a/src/ocrmypdf/pdfinfo/__init__.py b/src/ocrmypdf/pdfinfo/__init__.py
index 5d836f06..2f866fa3 100644
--- a/src/ocrmypdf/pdfinfo/__init__.py
+++ b/src/ocrmypdf/pdfinfo/__init__.py
@@ -589,6 +589,18 @@ def _page_has_text(text_blocks, page_width, page_height):
     return has_text
 
 
+def simplify_textboxes(miner):
+    for box in filter_textboxes(miner, lambda x: True):
+        result = {}
+        first_line = box._objs[0]
+        first_char = first_line._objs[0]
+
+        result['is_visible'] = (first_char.rendermode != 3)
+        result['is_corrupt'] = (first_char.get_text() == '\ufffd')
+        result['bbox'] = box.bbox
+        yield result
+
+
 def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
     pageinfo = {}
     pageinfo['pageno'] = pageno
@@ -600,15 +612,14 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile, xmltext):
     # fspath(infile), pageno, xmltext=xmltext)
 
     with Path(infile).open('rb') as f:
-        pageinfo['objects'] = get_textblocks(f, pageno)
+        miner = get_textblocks(f, pageno)
+        pageinfo['textobjs'] = list(simplify_textboxes(miner))
 
     mediabox = [Decimal(d) for d in page.MediaBox.as_list()]
     width_pt = mediabox[2] - mediabox[0]
     height_pt = mediabox[3] - mediabox[1]
 
-    bboxes = (textbox.bbox for textbox in filter_textboxes(
-        pageinfo['objects'], lambda obj: True)
-    )
+    bboxes = (obj['bbox'] for obj in pageinfo['textobjs'])
     pageinfo['has_text'] = _page_has_text(
         bboxes, width_pt, height_pt
     )
@@ -734,10 +745,18 @@ class PageInfo:
         return self._pageinfo['images']
 
     def get_textareas(self, visible=None, corrupt=None):
-        return (obj.bbox for obj in filter_textboxes(
-            self._pageinfo['objects'],
-            textbox_predicate(visible=visible, corrupt=corrupt)
-        ))
+        def predicate(obj, want_visible, want_corrupt):
+            result = True
+            if want_visible is not None:
+                if obj['is_visible'] != want_visible:
+                    result = False
+            if want_corrupt is not None:
+                if obj['is_corrupt'] != want_corrupt:
+                    result = False
+            return result
+
+        return (obj['bbox'] for obj in self._pageinfo['textobjs']
+                if predicate(obj, visible, corrupt))
 
     @property
     def xres(self):
diff --git a/src/ocrmypdf/pdfinfo/layout.py b/src/ocrmypdf/pdfinfo/layout.py
index 90c8c627..ffef6870 100644
--- a/src/ocrmypdf/pdfinfo/layout.py
+++ b/src/ocrmypdf/pdfinfo/layout.py
@@ -22,6 +22,7 @@ import pdfminer.pdfinterp
 import pdfminer.pdfdevice
 
 from pdfminer.converter import PDFLayoutAnalyzer
+from pdfminer.pdfdocument import PDFTextExtractionNotAllowed
 from pdfminer.glyphlist import glyphname2unicode
 from pdfminer.layout import (
     LTChar, LTContainer, LTLayoutContainer, LTPage, LTTextLine, LAParams,
@@ -31,6 +32,8 @@ from pdfminer.pdffont import PDFUnicodeNotDefined, PDFType3Font
 from pdfminer.pdfpage import PDFPage
 from pdfminer.utils import matrix2str, bbox2str, fsplit
 
+from ..exceptions import EncryptedPdfError
+
 
 # Fix pdfminer's regex in name2unicode function
 # Font cids that are mapped to names of the form /g123 seem to be, by convention
@@ -137,7 +140,10 @@ def get_textblocks(infile, pageno):
 
     page = PDFPage.get_pages(infile, pagenos=[pageno], maxpages=0)
 
-    interp.process_page(next(page))
+    try:
+        interp.process_page(next(page))
+    except PDFTextExtractionNotAllowed as e:
+        raise EncryptedPdfError()
 
     return dev.get_result()
 
diff --git a/tests/test_main.py b/tests/test_main.py
index 8f18a8b2..785f6d85 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -377,7 +377,7 @@ def test_tesseract_image_too_big(renderer, spoof_tesseract_big_image_error,
 def test_algo4(resources, spoof_tesseract_noop, outpdf):
     p, _, _ = run_ocrmypdf(resources / 'encrypted_algo4.pdf', outpdf,
                            env=spoof_tesseract_noop)
-    assert p.returncode == ExitCode.ok
+    assert p.returncode == ExitCode.encrypted_pdf
 
 
 @pytest.mark.parametrize('renderer', RENDERERS)