From 9ec4aa039dbf512ea91ca4f7dac0d48ca2b122ee Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Thu, 17 Dec 2015 12:52:12 -0800
Subject: [PATCH] Add tesseract caching to speed up tests

---
 .gitignore                     |  1 +
 tests/spoof/tesseract_cache.py | 88 ++++++++++++++++++++++++++++++++++
 tests/test_main.py             | 26 ++++++++----
 3 files changed, 108 insertions(+), 7 deletions(-)
 create mode 100755 tests/spoof/tesseract_cache.py

diff --git a/.gitignore b/.gitignore
index 91f76b5d..d5b776f2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,3 +21,4 @@ htmlcov/
 .coverage
 .cache/
 .ipynb_checkpoints/
+tests/cache/
\ No newline at end of file
diff --git a/tests/spoof/tesseract_cache.py b/tests/spoof/tesseract_cache.py
new file mode 100755
index 00000000..14889f08
--- /dev/null
+++ b/tests/spoof/tesseract_cache.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""Caching stand-in for the tesseract executable, used by the test suite.
+
+Invoked with the same arguments as tesseract. Results of 'hocr' and 'pdf'
+runs are stored under CACHE_PATH so that a repeated run on the same input
+copies the stored output instead of running OCR again.
+"""
+import sys
+import os
+import hashlib
+import shutil
+import subprocess
+
+
+# Cache directory: 'cache', alongside the directory containing this script
+CACHE_PATH = os.path.abspath(os.path.join(
+    os.path.dirname(__file__), '..', 'cache'))
+
+
+def main():
+    """Serve a request from the cache, or run tesseract and cache the result.
+
+    Expects the command line to end with: input_file output_base operation.
+    The cache key is a SHA-1 over the output of `tesseract --version`, the
+    operation, the `-l` language argument (when given) and the input file's
+    bytes; any other options are not part of the key.
+    """
+    operation = sys.argv[-1]
+    # For anything except a hocr or pdf, defer to real tesseract
+    if operation != 'hocr' and operation != 'pdf':
+        tess_args = ['tesseract'] + sys.argv[1:]
+        os.execvp("tesseract", tess_args)
+        return  # Not reachable
+
+    os.makedirs(CACHE_PATH, exist_ok=True)
+
+    m = hashlib.sha1()
+
+    # check_output() returns bytes, which is what the hash needs; bytes has
+    # no .encode(), so the output is hashed as-is
+    version = subprocess.check_output(
+        ['tesseract', '--version'],
+        stderr=subprocess.STDOUT)
+
+    m.update(version)
+    m.update(operation.encode())
+
+    try:
+        lang = sys.argv[sys.argv.index('-l') + 1]
+        m.update(lang.encode())
+    except ValueError:
+        # No -l argument was given; language is left out of the key
+        pass
+
+    input_file = sys.argv[-3]
+    output_file = sys.argv[-2]
+
+    # Derive the output file name by appending the operation's extension
+    if operation == 'hocr':
+        output_file += '.hocr'
+    elif operation == 'pdf':
+        output_file += '.pdf'
+
+    with open(input_file, 'rb') as f:
+        m.update(f.read())
+
+    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
+    if os.path.exists(cache_name):
+        # Cache hit
+        print("Tesseract cache hit", file=sys.stderr)
+        shutil.copy(cache_name, output_file)
+        sys.exit(0)
+
+    # Cache miss
+    print("Tesseract cache miss", file=sys.stderr)
+
+    # Call tesseract
+    subprocess.check_call(['tesseract'] + sys.argv[1:])
+
+    # Insert file into cache
+    if os.path.exists(output_file):
+        shutil.copy(output_file, cache_name)
+    else:
+        print("Could not find output file", file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/test_main.py b/tests/test_main.py
index ec6bfb47..d45b3da1 100644
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -87,6 +87,15 @@ def spoof_tesseract_noop():
     return env
 
 
+@pytest.fixture
+def spoof_tesseract_cache():
+    env = os.environ.copy()
+    program = os.path.join(SPOOF_PATH, "tesseract_cache.py")
+    check_call(['chmod', '+x', program])
+    env['OCRMYPDF_TESSERACT'] = program
+    return env
+
+
 def test_quick(spoof_tesseract_noop):
     check_ocrmypdf('c02-22.pdf', 'test_quick.pdf', env=spoof_tesseract_noop)
 
@@ -143,7 +152,8 @@ def test_exotic_image():
 def test_preserve_metadata(spoof_tesseract_noop):
     pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))
 
-    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')
+    output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
+                            env=spoof_tesseract_noop)
 
     pdf_after = pypdf.PdfFileReader(output)
 
@@ -204,14 +214,16 @@ def test_repeat_ocr():
     assert sh.returncode != 0
 
 
-def test_force_ocr():
-    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
+def test_force_ocr(spoof_tesseract_cache):
+    out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f',
+                         env=spoof_tesseract_cache)
     pdfinfo = pdf_get_all_pageinfo(out)
     assert pdfinfo[0]['has_text']
 
 
-def test_skip_ocr():
-    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')
+def test_skip_ocr(spoof_tesseract_cache):
+    check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s',
+                   env=spoof_tesseract_cache)
 
 
 def test_argsfile(spoof_tesseract_noop):
@@ -235,9 +247,9 @@ def test_ocr_timeout():
     yield check_ocr_timeout, 'tesseract'
 
 
-def test_skip_big():
+def test_skip_big(spoof_tesseract_cache):
     out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
-                         '--skip-big', '10')
+                         '--skip-big', '10', env=spoof_tesseract_cache)
     pdfinfo = pdf_get_all_pageinfo(out)
     assert pdfinfo[0]['has_text'] == False
 