Add tesseract caching to speed up tests

This commit is contained in:
James R. Barlow
2015-12-17 12:52:12 -08:00
parent ecebe2f24b
commit 9ec4aa039d
3 changed files with 93 additions and 7 deletions

1
.gitignore vendored
View File

@@ -21,3 +21,4 @@ htmlcov/
.coverage
.cache/
.ipynb_checkpoints/
tests/cache/

73
tests/spoof/tesseract_cache.py Executable file
View File

@@ -0,0 +1,73 @@
#!/usr/bin/env python3
import sys
import os
import hashlib
import shutil
import subprocess
# Absolute path of the cache directory: a folder named "cache" located one
# level above the directory that contains this script.
CACHE_PATH = os.path.abspath(
    os.path.join(os.path.dirname(__file__), os.pardir, 'cache'))
def main():
    """Stand in for ``tesseract``, caching its ``hocr`` and ``pdf`` output.

    The command line is read from ``sys.argv`` and is expected to end with
    ``<input_file> <output_base> <operation>``.  When the operation is
    ``hocr`` or ``pdf``, a SHA-1 key is computed from the output of
    ``tesseract --version``, the operation, the ``-l`` language argument
    (if present) and the bytes of the input file.  If a file with that key
    exists in ``CACHE_PATH`` it is copied to the expected output path and
    the process exits with status 0 without running tesseract.  Otherwise
    the real tesseract is run with the original arguments and its output
    file is copied into the cache.

    Any other invocation replaces this process with the real ``tesseract``.

    Raises:
        subprocess.CalledProcessError: if ``tesseract`` exits with a
            non-zero status.
        OSError: if the input file or the cache cannot be read or written.
    """
    operation = sys.argv[-1]

    # For anything except a hocr or pdf, defer to real tesseract.  The same
    # applies when there are too few arguments to contain an input file and
    # an output base: indexing argv[-3] below would otherwise fail or pick
    # up this script's own path as the "input".
    if operation not in ('hocr', 'pdf') or len(sys.argv) < 4:
        tess_args = ['tesseract'] + sys.argv[1:]
        os.execvp("tesseract", tess_args)
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    m = hashlib.sha1()

    # check_output() returns bytes, which the hash accepts directly; calling
    # .encode() on it would raise AttributeError.  Hashing the version text
    # gives a different key whenever that text differs.
    version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    m.update(version)

    m.update(operation.encode())

    # The language is optional; only the lookup itself is guarded.
    try:
        lang = sys.argv[sys.argv.index('-l') + 1]
    except ValueError:
        pass
    else:
        m.update(lang.encode())

    input_file = sys.argv[-3]
    # The output argument is a base name; the file that is cached and
    # restored carries an extension matching the operation.
    output_file = sys.argv[-2]
    if operation == 'hocr':
        output_file += '.hocr'
    elif operation == 'pdf':
        output_file += '.pdf'

    with open(input_file, 'rb') as f:
        m.update(f.read())

    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        shutil.copy(cache_name, output_file)
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    subprocess.check_call(['tesseract'] + sys.argv[1:])

    # Insert file into cache
    if os.path.exists(output_file):
        shutil.copy(output_file, cache_name)
    else:
        print("Could not find output file", file=sys.stderr)
# Run the caching wrapper only when this file is executed as a script.
if __name__ == "__main__":
    main()

View File

@@ -87,6 +87,15 @@ def spoof_tesseract_noop():
return env
@pytest.fixture
def spoof_tesseract_cache():
    """Return a copy of the environment that selects the caching spoof.

    The ``tesseract_cache.py`` script under ``SPOOF_PATH`` is marked
    executable, and its path is stored in the ``OCRMYPDF_TESSERACT``
    variable of the returned mapping.  ``os.environ`` itself is not
    modified.
    """
    cache_script = os.path.join(SPOOF_PATH, "tesseract_cache.py")
    check_call(['chmod', '+x', cache_script])

    spoofed_env = os.environ.copy()
    spoofed_env['OCRMYPDF_TESSERACT'] = cache_script
    return spoofed_env
def test_quick(spoof_tesseract_noop):
check_ocrmypdf('c02-22.pdf', 'test_quick.pdf', env=spoof_tesseract_noop)
@@ -143,7 +152,8 @@ def test_exotic_image():
def test_preserve_metadata(spoof_tesseract_noop):
pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))
output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')
output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
env=spoof_tesseract_noop)
pdf_after = pypdf.PdfFileReader(output)
@@ -204,14 +214,16 @@ def test_repeat_ocr():
assert sh.returncode != 0
def test_force_ocr():
out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
def test_force_ocr(spoof_tesseract_cache):
out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f',
env=spoof_tesseract_cache)
pdfinfo = pdf_get_all_pageinfo(out)
assert pdfinfo[0]['has_text']
def test_skip_ocr():
check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')
def test_skip_ocr(spoof_tesseract_cache):
check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s',
env=spoof_tesseract_cache)
def test_argsfile(spoof_tesseract_noop):
@@ -235,9 +247,9 @@ def test_ocr_timeout():
yield check_ocr_timeout, 'tesseract'
def test_skip_big():
def test_skip_big(spoof_tesseract_cache):
out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
'--skip-big', '10')
'--skip-big', '10', env=spoof_tesseract_cache)
pdfinfo = pdf_get_all_pageinfo(out)
assert pdfinfo[0]['has_text'] == False