From 9ec4aa039dbf512ea91ca4f7dac0d48ca2b122ee Mon Sep 17 00:00:00 2001
From: "James R. Barlow" <jim@purplerock.ca>
Date: Thu, 17 Dec 2015 12:52:12 -0800
Subject: [PATCH] Add tesseract caching to speed up tests

---
 .gitignore                     |  1 +
 tests/spoof/tesseract_cache.py | 73 ++++++++++++++++++++++++++++++++++
 tests/test_main.py             | 26 ++++++++----
 3 files changed, 93 insertions(+), 7 deletions(-)
 create mode 100755 tests/spoof/tesseract_cache.py

diff --git a/.gitignore b/.gitignore
index 91f76b5d..d5b776f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ htmlcov/
 .coverage
 .cache/
 .ipynb_checkpoints/
+tests/cache/
\ No newline at end of file
diff --git a/tests/spoof/tesseract_cache.py b/tests/spoof/tesseract_cache.py
new file mode 100755
index 00000000..14889f08
--- /dev/null
+++ b/tests/spoof/tesseract_cache.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+import sys
+import os
+import hashlib
+import shutil
+import subprocess
+
+
+CACHE_PATH = os.path.abspath(  # absolute path of ../cache beside this dir
+    os.path.join(os.path.dirname(__file__), os.pardir, 'cache'))
+
+
+def main():
+    """Run tesseract, serving hocr/pdf output from an on-disk cache.
+
+    The cache key is a SHA-1 over the tesseract version output, the
+    operation, the ``-l`` language (if given) and the input file bytes.
+    Any operation other than hocr or pdf goes to the real tesseract.
+    """
+    operation = sys.argv[-1]
+    # For anything except a hocr or pdf, defer to real tesseract
+    if operation not in ('hocr', 'pdf'):
+        tess_args = ['tesseract'] + sys.argv[1:]
+        os.execvp("tesseract", tess_args)
+        return  # Not reachable
+
+    os.makedirs(CACHE_PATH, exist_ok=True)
+    m = hashlib.sha1()
+
+    # check_output returns bytes here (no text mode requested), so hash it
+    # directly; calling .encode() on bytes raises AttributeError.
+    version = subprocess.check_output(
+        ['tesseract', '--version'],
+        stderr=subprocess.STDOUT)
+    m.update(version)
+    m.update(operation.encode())
+
+    try:
+        lang = sys.argv[sys.argv.index('-l') + 1]
+        m.update(lang.encode())
+    except ValueError:
+        pass  # No -l option given; the language is left out of the key
+
+    # Arguments are read from the end: input, output base, operation
+    input_file = sys.argv[-3]
+    # Output path is the given base plus '.hocr' or '.pdf'
+    output_file = sys.argv[-2] + '.' + operation
+
+    with open(input_file, 'rb') as f:
+        m.update(f.read())
+
+    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
+    if os.path.exists(cache_name):
+        # Cache hit
+        print("Tesseract cache hit", file=sys.stderr)
+        shutil.copy(cache_name, output_file)
+        sys.exit(0)
+
+    # Cache miss
+    print("Tesseract cache miss", file=sys.stderr)
+
+    # Call tesseract
+    subprocess.check_call(['tesseract'] + sys.argv[1:])
+
+    # Insert file into cache
+    if os.path.exists(output_file):
+        shutil.copy(output_file, cache_name)
+    else:
+        print("Could not find output file", file=sys.stderr)
+
+
+if __name__ == '__main__':  # run only when executed as a script
+    main()
diff --git a/tests/test_main.py b/tests/test_main.py
index ec6bfb47..d45b3da1 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -87,6 +87,15 @@ def spoof_tesseract_noop():
     return env
 
 
+@pytest.fixture
+def spoof_tesseract_cache():
+    """Environment whose OCRMYPDF_TESSERACT is the caching spoof script."""
+    spoof_script = os.path.join(SPOOF_PATH, "tesseract_cache.py")
+    # Make sure the script is executable before pointing the env at it
+    check_call(['chmod', '+x', spoof_script])
+    return dict(os.environ, OCRMYPDF_TESSERACT=spoof_script)
+
+
 def test_quick(spoof_tesseract_noop):
     check_ocrmypdf('c02-22.pdf', 'test_quick.pdf', env=spoof_tesseract_noop)
 
@@ -143,7 +152,8 @@ def test_exotic_image():
 def test_preserve_metadata(spoof_tesseract_noop):
     pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))
 
-    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')
+    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
+                            env=spoof_tesseract_noop)
 
     pdf_after = pypdf.PdfFileReader(output)
 
@@ -204,14 +214,16 @@ def test_repeat_ocr():
     assert sh.returncode != 0
 
 
-def test_force_ocr():
-    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
+def test_force_ocr(spoof_tesseract_cache):
+    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f',
+                         env=spoof_tesseract_cache)
     pdfinfo = pdf_get_all_pageinfo(out)
     assert pdfinfo[0]['has_text']
 
 
-def test_skip_ocr():
-    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')
+def test_skip_ocr(spoof_tesseract_cache):
+    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s',
+                   env=spoof_tesseract_cache)
 
 
 def test_argsfile(spoof_tesseract_noop):
@@ -235,9 +247,9 @@ def test_ocr_timeout():
     yield check_ocr_timeout, 'tesseract'
 
 
-def test_skip_big():
+def test_skip_big(spoof_tesseract_cache):
     out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
-                         '--skip-big', '10')
+                         '--skip-big', '10', env=spoof_tesseract_cache)
     pdfinfo = pdf_get_all_pageinfo(out)
     assert pdfinfo[0]['has_text'] == False