Add tesseract caching to speed up tests

2026-02-15 08:42:25 -05:00 · 2015-12-17 12:52:12 -08:00
parent ecebe2f24b
commit 9ec4aa039d
3 changed files with 93 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ htmlcov/
 .coverage
 .cache/
 .ipynb_checkpoints/
+tests/cache/
--- a/tests/spoof/tesseract_cache.py
+++ b/tests/spoof/tesseract_cache.py
@@ -0,0 +1,73 @@
+#!/usr/bin/env python3
+import sys
+import os
+import hashlib
+import shutil
+import subprocess
+
+# Absolute path of the 'cache' directory one level above this script's folder
+CACHE_PATH = os.path.abspath(os.path.join(
+        os.path.dirname(__file__), '..', 'cache'))
+
+
+def main():
+    """Run tesseract through an on-disk cache of hocr/pdf outputs.
+
+    Reads ``sys.argv`` as ``[-l LANG ...] INPUT OUTPUT_BASE OPERATION``.
+    Any operation other than 'hocr' or 'pdf' replaces this process with
+    the real tesseract.  Otherwise the result is looked up in CACHE_PATH
+    under a SHA-1 of the tesseract version banner, the operation, the
+    ``-l`` language (if given) and the input file's bytes; on a miss the
+    real tesseract runs and its output file is copied into the cache.
+
+    Raises:
+        subprocess.CalledProcessError: if a tesseract call exits non-zero.
+    """
+    tess_args = ['tesseract'] + sys.argv[1:]
+    operation = sys.argv[-1]
+    if operation not in ('hocr', 'pdf'):
+        # Nothing to cache; execvp does not return on success
+        os.execvp("tesseract", tess_args)
+
+    os.makedirs(CACHE_PATH, exist_ok=True)
+
+    m = hashlib.sha1()
+    # check_output returns bytes, so the banner is hashed as-is
+    version = subprocess.check_output(
+        ['tesseract', '--version'],
+        stderr=subprocess.STDOUT)
+    m.update(version)
+    m.update(operation.encode())
+
+    try:
+        lang = sys.argv[sys.argv.index('-l') + 1]
+    except ValueError:
+        pass  # No -l option given; language is left out of the key
+    else:
+        m.update(lang.encode())
+
+    input_file = sys.argv[-3]
+    # Expected output name is the base name plus '.hocr' or '.pdf'
+    output_file = sys.argv[-2] + '.' + operation
+
+    with open(input_file, 'rb') as f:
+        m.update(f.read())
+
+    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
+    if os.path.exists(cache_name):
+        print("Tesseract cache hit", file=sys.stderr)
+        shutil.copy(cache_name, output_file)
+        sys.exit(0)
+
+    print("Tesseract cache miss", file=sys.stderr)
+    subprocess.check_call(tess_args)
+
+    # Best effort: a missing output file is reported but is not an error
+    if os.path.exists(output_file):
+        shutil.copy(output_file, cache_name)
+    else:
+        print("Could not find output file", file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -87,6 +87,15 @@ def spoof_tesseract_noop():
    return env


+@pytest.fixture
+def spoof_tesseract_cache():
+    """Return a copy of os.environ with OCRMYPDF_TESSERACT set to the
+    tesseract_cache.py script under SPOOF_PATH (made executable first)."""
+    cache_script = os.path.join(SPOOF_PATH, "tesseract_cache.py")
+    check_call(['chmod', '+x', cache_script])
+    return dict(os.environ, OCRMYPDF_TESSERACT=cache_script)
+
+
 def test_quick(spoof_tesseract_noop):
    check_ocrmypdf('c02-22.pdf', 'test_quick.pdf', env=spoof_tesseract_noop)

@@ -143,7 +152,8 @@ def test_exotic_image():
 def test_preserve_metadata(spoof_tesseract_noop):
    pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))

-    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')
+    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
+                            env=spoof_tesseract_noop)

    pdf_after = pypdf.PdfFileReader(output)

@@ -204,14 +214,16 @@ def test_repeat_ocr():
    assert sh.returncode != 0


-def test_force_ocr():
-    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
+def test_force_ocr(spoof_tesseract_cache):
+    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f',
+                         env=spoof_tesseract_cache)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert pdfinfo[0]['has_text']


-def test_skip_ocr():
-    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')
+def test_skip_ocr(spoof_tesseract_cache):
+    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s',
+                   env=spoof_tesseract_cache)


 def test_argsfile(spoof_tesseract_noop):
@@ -235,9 +247,9 @@ def test_ocr_timeout():
    yield check_ocr_timeout, 'tesseract'


-def test_skip_big():
+def test_skip_big(spoof_tesseract_cache):
    out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
-                         '--skip-big', '10')
+                         '--skip-big', '10', env=spoof_tesseract_cache)
    pdfinfo = pdf_get_all_pageinfo(out)
    assert pdfinfo[0]['has_text'] == False