From 8599400445d160c97bdb00ee1ae7d69ff4d0f5a2 Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <james@purplerock.ca>
Date: Wed, 10 Jun 2020 11:33:27 -0700
Subject: [PATCH] Only do page analysis on pages we will do OCR on

---
 src/ocrmypdf/_pipeline.py    |  9 +++--
 src/ocrmypdf/_sync.py        |  1 +
 src/ocrmypdf/pdfinfo/info.py | 69 +++++++++++++++++++++---------------
 3 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 689d2542..3789fdca 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -149,9 +149,14 @@ def triage(original_filename, input_file, output_file, options):
     return output_file
 
 
-def get_pdfinfo(input_file, progbar=False, max_workers=None):
+def get_pdfinfo(input_file, progbar=False, max_workers=None, check_pages=None):
     try:
-        return PdfInfo(input_file, progbar=progbar, max_workers=max_workers)
+        return PdfInfo(
+            input_file,
+            progbar=progbar,
+            max_workers=max_workers,
+            check_pages=check_pages,
+        )
     except pikepdf.PasswordError:
         raise EncryptedPdfError()
     except pikepdf.PdfError:
diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py
index 60eed5de..931cd320 100644
--- a/src/ocrmypdf/_sync.py
+++ b/src/ocrmypdf/_sync.py
@@ -309,6 +309,7 @@ def run_pipeline(options, *, plugin_manager, api=False):
             origin_pdf,
             progbar=options.progress_bar,
             max_workers=options.jobs if not options.use_threads else 1,  # To help debug
+            check_pages=options.pages,
         )
 
         context = PdfContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
diff --git a/src/ocrmypdf/pdfinfo/info.py b/src/ocrmypdf/pdfinfo/info.py
index 5c8175f3..4ef35d3c 100644
--- a/src/ocrmypdf/pdfinfo/info.py
+++ b/src/ocrmypdf/pdfinfo/info.py
@@ -554,7 +554,7 @@ def simplify_textboxes(miner, textbox_getter):
         yield TextboxInfo(box.bbox, visible, corrupt)
 
 
-def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
+def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike, check_pages):
     pageinfo = {}
     pageinfo['pageno'] = pageno
     pageinfo['images'] = []
@@ -564,12 +564,18 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
     width_pt = mediabox[2] - mediabox[0]
     height_pt = mediabox[3] - mediabox[1]
 
-    pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
-    miner = get_page_analysis(infile, pageno, pscript5_mode)
-    pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
-    bboxes = (box.bbox for box in pageinfo['textboxes'])
+    check_this_page = not check_pages or pageno in check_pages
 
-    pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
+    if check_this_page:
+        pscript5_mode = str(pdf.docinfo.get('/Creator')).startswith('PScript5')
+        miner = get_page_analysis(infile, pageno, pscript5_mode)
+        pageinfo['textboxes'] = list(simplify_textboxes(miner, get_text_boxes))
+        bboxes = (box.bbox for box in pageinfo['textboxes'])
+
+        pageinfo['has_text'] = _page_has_text(bboxes, width_pt, height_pt)
+    else:
+        pageinfo['textboxes'] = []
+        pageinfo['has_text'] = None
 
     userunit = page.get('/UserUnit', Decimal(1.0))
     if not isinstance(userunit, Decimal):
@@ -584,12 +590,16 @@ def _pdf_get_pageinfo(pdf, pageno: int, infile: PathLike):
         pageinfo['rotate'] = 0
 
     userunit_shorthand = (userunit, 0, 0, userunit, 0, 0)
-    contentsinfo = [
-        ci
-        for ci in _process_content_streams(
-            pdf=pdf, container=page, shorthand=userunit_shorthand
-        )
-    ]
+
+    if check_this_page:
+        contentsinfo = [
+            ci
+            for ci in _process_content_streams(
+                pdf=pdf, container=page, shorthand=userunit_shorthand
+            )
+        ]
+    else:
+        contentsinfo = []
 
     pageinfo['has_vector'] = False
     if any(isinstance(ci, VectorInfo) for ci in contentsinfo):
@@ -615,12 +625,12 @@ def _pdf_pageinfo_sync_init(infile):
 
 def _pdf_pageinfo_sync(args):
     global worker_pdf  # pylint: disable=global-statement
-    pageno, infile = args
-    page = PageInfo(worker_pdf, pageno, infile)
+    pageno, infile, check_pages = args
+    page = PageInfo(worker_pdf, pageno, infile, check_pages)
     return page
 
 
-def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers):
+def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers, check_pages):
     pages = [None] * len(pdf.pages)
 
     def update_pageinfo(result, pbar):
@@ -631,7 +641,8 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers):
     if max_workers is None:
         max_workers = available_cpu_count()
 
-    contexts = ((n, infile) for n in range(len(pdf.pages)))
+    total = len(pdf.pages)
+    contexts = ((n, infile, check_pages) for n in range(total))
 
     use_threads = False  # No performance gain if threaded due to GIL
     n_workers = min(1 + len(pages) // 4, max_workers)
@@ -644,7 +655,7 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers):
         use_threads=use_threads,
         max_workers=n_workers,
         tqdm_kwargs=dict(
-            total=len(pdf.pages), desc="Scan", unit='page', disable=not progbar
+            total=total, desc="Searching for text", unit='page', disable=not progbar
         ),
         task_initializer=partial(_pdf_pageinfo_sync_init, infile),
         task=_pdf_pageinfo_sync,
@@ -655,10 +666,10 @@ def _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers):
 
 
 class PageInfo:
-    def __init__(self, pdf, pageno, infile):
+    def __init__(self, pdf, pageno, infile, check_pages):
         self._pageno = pageno
         self._infile = infile
-        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile)
+        self._pageinfo = _pdf_get_pageinfo(pdf, pageno, infile, check_pages)
 
     @property
     def pageno(self):
@@ -755,20 +766,22 @@ class PageInfo:
 class PdfInfo:
     """Get summary information about a PDF"""
 
-    def __init__(self, infile, progbar=False, max_workers=None):
+    def __init__(self, infile, progbar=False, max_workers=None, check_pages=None):
         self._infile = infile
 
         with pikepdf.open(infile) as pdf:
             if pdf.is_encrypted:
                 raise EncryptedPdfError()  # Triggered by encryption with empty passwd
-            self._pages = _pdf_pageinfo_concurrent(pdf, infile, progbar, max_workers)
-        self._needs_rendering = pdf.root.get('/NeedsRendering', False)
-        self._has_acroform = False
-        if '/AcroForm' in pdf.root:
-            if len(pdf.root.AcroForm.get('/Fields', [])) > 0:
-                self._has_acroform = True
-            elif '/XFA' in pdf.root.AcroForm:
-                self._has_acroform = True
+            self._pages = _pdf_pageinfo_concurrent(
+                pdf, infile, progbar, max_workers, check_pages=check_pages
+            )
+            self._needs_rendering = pdf.root.get('/NeedsRendering', False)
+            self._has_acroform = False
+            if '/AcroForm' in pdf.root:
+                if len(pdf.root.AcroForm.get('/Fields', [])) > 0:
+                    self._has_acroform = True
+                elif '/XFA' in pdf.root.AcroForm:
+                    self._has_acroform = True
 
     @property
     def pages(self):