From 8599400445d160c97bdb00ee1ae7d69ff4d0f5a2 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 10 Jun 2020 11:33:27 -0700 Subject: [PATCH] Only do page analysis on pages we will do OCR on --- src/ocrmypdf/_pipeline.py | 9 +++-- src/ocrmypdf/_sync.py | 1 + src/ocrmypdf/pdfinfo/info.py | 69 +++++++++++++++++++++--------------- 3 files changed, 49 insertions(+), 30 deletions(-) diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index 689d2542..3789fdca 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -149,9 +149,14 @@ def triage(original_filename, input_file, output_file, options): return output_file -def get_pdfinfo(input_file, progbar=False, max_workers=None): +def get_pdfinfo(input_file, progbar=False, max_workers=None, check_pages=None): try: - return PdfInfo(input_file, progbar=progbar, max_workers=max_workers) + return PdfInfo( + input_file, + progbar=progbar, + max_workers=max_workers, + check_pages=check_pages, + ) except pikepdf.PasswordError: raise EncryptedPdfError() except pikepdf.PdfError: diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 60eed5de..931cd320 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -309,6 +309,7 @@ def run_pipeline(options, *, plugin_manager, api=False): origin_pdf, progbar=options.progress_bar, max_workers=options.jobs if not options.use_threads else 1, # To help debug + check_pages=options.pages, ) context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager) diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py index 5c8175f3..4ef35d3c 100644 --- a/src/ocrmypdf/pdfinfo/info.py +++ b/src/ocrmypdf/pdfinfo/info.py @@ -554,7 +554,7 @@ def simplify_textboxes(miner, textbox_getter): yield TextboxInfo(box.bbox, visible, corrupt) -def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike): +def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, check_pages): pageinfo = {} pageinfo['pageno'] = pageno pageinfo['images'] = [] @@ -564,12 +564,18 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike): width_pt = mediabox[2] - mediabox[0] height_pt = mediabox[3] - mediabox[1] - pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') - miner = get_page_analysis(infile, pageno, pscript5_mode) - pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) - bboxes = (box.bbox for box in pageinfo['textboxes']) + check_this_page = not check_pages or pageno in check_pages - pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) + if check_this_page: + pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5') + miner = get_page_analysis(infile, pageno, pscript5_mode) + pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes)) + bboxes = (box.bbox for box in pageinfo['textboxes']) + + pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt) + else: + pageinfo['textboxes'] = [] + pageinfo['has_text'] = None userunit = page.get('/UserUnit', Decimal(1.0)) if not isinstance(userunit, Decimal): @@ -584,12 +590,16 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike): pageinfo['rotate'] = 0 userunit_shorthand = (userunit, 0, 0, userunit, 0, 0) - contentsinfo = [ - ci - for ci in _process_content_streams( - pdf=pdf, container=page, shorthand=userunit_shorthand - ) - ] + + if check_this_page: + contentsinfo = [ + ci + for ci in _process_content_streams( + pdf=pdf, container=page, shorthand=userunit_shorthand + ) + ] + else: + contentsinfo = [] pageinfo['has_vector'] = False if any(isinstance(ci, VectorInfo) for ci in contentsinfo): @@ -615,12 +625,12 @@ def _pdf_pageinfo_sync_init(infile): def _pdf_pageinfo_sync(args): global worker_pdf # pylint: disable=global-statement - pageno, infile = args - page = PageInfo(worker_pdf, pageno, infile) + pageno, infile, check_pages = args + page = PageInfo(worker_pdf, pageno, infile, check_pages) return page -def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers): +def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers, check_pages): pages = [None] * len(pdf.pages) def update_pageinfo(result, pbar): @@ -631,7 +641,8 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers): if max_workers is None: max_workers = available_cpu_count() - contexts = ((n, infile) for n in range(len(pdf.pages))) + total = len(pdf.pages) + contexts = ((n, infile, check_pages) for n in range(total)) use_threads = False # No performance gain if threaded due to GIL n_workers = min(1 + len(pages) // 4, max_workers) @@ -644,7 +655,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers): use_threads=use_threads, max_workers=n_workers, tqdm_kwargs=dict( - total=len(pdf.pages), desc="Scan", unit='page', disable=not progbar + total=total, desc="Searching for text", unit='page', disable=not progbar ), task_initializer=partial(_pdf_pageinfo_sync_init, infile), task=_pdf_pageinfo_sync, @@ -655,10 +666,10 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers): class PageInfo: - def __init__(self, pdf, pageno, infile): + def __init__(self, pdf, pageno, infile, check_pages): self._pageno = pageno self._infile = infile - self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile) + self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, check_pages) @property def pageno(self): @@ -755,20 +766,22 @@ class PageInfo: class PdfInfo: """Get summary information about a PDF""" - def __init__(self, infile, progbar=False, max_workers=None): + def __init__(self, infile, progbar=False, max_workers=None, check_pages=None): self._infile = infile with pikepdf.open(infile) as pdf: if pdf.is_encrypted: raise EncryptedPdfError() # Triggered by encryption with empty passwd - self._pages = _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers) - self._needs_rendering = pdf.root.get('/NeedsRendering', False) - self._has_acroform = False - if '/AcroForm' in pdf.root: - if len(pdf.root.AcroForm.get('/Fields', [])) > 0: - self._has_acroform = True - elif '/XFA' in pdf.root.AcroForm: - self._has_acroform = True + self._pages = _pdf_pageinfo_concurrent( + pdf, infile, progbar, max_workers, check_pages=check_pages + ) + self._needs_rendering = pdf.root.get('/NeedsRendering', False) + self._has_acroform = False + if '/AcroForm' in pdf.root: + if len(pdf.root.AcroForm.get('/Fields', [])) > 0: + self._has_acroform = True + elif '/XFA' in pdf.root.AcroForm: + self._has_acroform = True @property def pages(self):