mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-15 08:42:25 -05:00
Add tesseract caching to speed up tests
This commit is contained in:
1
.gitignore
vendored
1
.gitignore
vendored
@@ -21,3 +21,4 @@ htmlcov/
|
||||
.coverage
|
||||
.cache/
|
||||
.ipynb_checkpoints/
|
||||
tests/cache/
|
||||
73
tests/spoof/tesseract_cache.py
Executable file
73
tests/spoof/tesseract_cache.py
Executable file
@@ -0,0 +1,73 @@
|
||||
#!/usr/bin/env python3
|
||||
import sys
|
||||
import os
|
||||
import hashlib
|
||||
import shutil
|
||||
import subprocess
|
||||
|
||||
|
||||
# Absolute path of the "cache" directory located one level above the
# directory that contains this script.
_SCRIPT_DIR = os.path.dirname(__file__)
CACHE_PATH = os.path.abspath(os.path.join(_SCRIPT_DIR, '..', 'cache'))
|
||||
|
||||
|
||||
def main():
    """Run tesseract, reusing a cached output file when one is available.

    The last command-line argument is treated as the operation.  For
    ``hocr`` and ``pdf`` the third- and second-to-last arguments are taken
    as the input file and the output base name; any other invocation is
    handed over to the real ``tesseract`` unchanged.

    The cache key is a SHA-1 digest over the ``tesseract --version``
    output, the operation, the ``-l`` language argument (when present) and
    the bytes of the input file.  No other command-line option contributes
    to the key.

    On a cache hit the cached file is copied to the output location and
    the process exits with status 0.  On a miss the real tesseract is run
    and its output file, if it exists, is copied into the cache.

    Raises:
        subprocess.CalledProcessError: if a tesseract subprocess exits
            with a non-zero status.
    """
    operation = sys.argv[-1]

    # For anything except a hocr or pdf, defer to real tesseract.  The same
    # applies to a call too short to carry both an input and an output
    # argument, so that tesseract itself reports the usage error.
    if operation not in ('hocr', 'pdf') or len(sys.argv) < 4:
        tess_args = ['tesseract'] + sys.argv[1:]
        os.execvp("tesseract", tess_args)
        return  # Not reachable

    os.makedirs(CACHE_PATH, exist_ok=True)

    m = hashlib.sha1()

    # check_output returns bytes, so the version banner is hashed as-is.
    version = subprocess.check_output(
        ['tesseract', '--version'],
        stderr=subprocess.STDOUT)
    m.update(version)
    m.update(operation.encode())

    try:
        lang = sys.argv[sys.argv.index('-l') + 1]
    except ValueError:
        # No -l option on the command line; nothing to add to the key.
        pass
    else:
        m.update(lang.encode())

    input_file = sys.argv[-3]
    # The output file is the given base name plus ".hocr" or ".pdf".
    output_file = sys.argv[-2] + '.' + operation

    with open(input_file, 'rb') as f:
        m.update(f.read())

    cache_name = os.path.join(CACHE_PATH, m.hexdigest())
    if os.path.exists(cache_name):
        # Cache hit
        print("Tesseract cache hit", file=sys.stderr)
        shutil.copy(cache_name, output_file)
        sys.exit(0)

    # Cache miss
    print("Tesseract cache miss", file=sys.stderr)

    # Call tesseract
    subprocess.check_call(['tesseract'] + sys.argv[1:])

    # Insert file into cache
    if os.path.exists(output_file):
        shutil.copy(output_file, cache_name)
    else:
        print("Could not find output file", file=sys.stderr)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -87,6 +87,15 @@ def spoof_tesseract_noop():
|
||||
return env
|
||||
|
||||
|
||||
@pytest.fixture
def spoof_tesseract_cache():
    """Build an environment that points OCRMYPDF_TESSERACT at the cache script.

    The script ``tesseract_cache.py`` under ``SPOOF_PATH`` is marked
    executable, and a copy of the current environment is returned with
    ``OCRMYPDF_TESSERACT`` set to that script's path.
    """
    cache_script = os.path.join(SPOOF_PATH, "tesseract_cache.py")
    # Make sure the script can be launched directly as a program.
    check_call(['chmod', '+x', cache_script])

    spoofed_env = os.environ.copy()
    spoofed_env['OCRMYPDF_TESSERACT'] = cache_script
    return spoofed_env
|
||||
|
||||
|
||||
def test_quick(spoof_tesseract_noop):
    """Process c02-22.pdf with the environment from spoof_tesseract_noop."""
    check_ocrmypdf(
        'c02-22.pdf', 'test_quick.pdf',
        env=spoof_tesseract_noop)
|
||||
|
||||
@@ -143,7 +152,8 @@ def test_exotic_image():
|
||||
def test_preserve_metadata(spoof_tesseract_noop):
|
||||
pdf_before = pypdf.PdfFileReader(_make_input('graph.pdf'))
|
||||
|
||||
output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf')
|
||||
output = check_ocrmypdf('graph.pdf', 'test_metadata_preserve.pdf',
|
||||
env=spoof_tesseract_noop)
|
||||
|
||||
pdf_after = pypdf.PdfFileReader(output)
|
||||
|
||||
@@ -204,14 +214,16 @@ def test_repeat_ocr():
|
||||
assert sh.returncode != 0
|
||||
|
||||
|
||||
def test_force_ocr():
|
||||
out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f')
|
||||
def test_force_ocr(spoof_tesseract_cache):
|
||||
out = check_ocrmypdf('graph_ocred.pdf', 'test_force.pdf', '-f',
|
||||
env=spoof_tesseract_cache)
|
||||
pdfinfo = pdf_get_all_pageinfo(out)
|
||||
assert pdfinfo[0]['has_text']
|
||||
|
||||
|
||||
def test_skip_ocr():
|
||||
check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s')
|
||||
def test_skip_ocr(spoof_tesseract_cache):
|
||||
check_ocrmypdf('graph_ocred.pdf', 'test_skip.pdf', '-s',
|
||||
env=spoof_tesseract_cache)
|
||||
|
||||
|
||||
def test_argsfile(spoof_tesseract_noop):
|
||||
@@ -235,9 +247,9 @@ def test_ocr_timeout():
|
||||
yield check_ocr_timeout, 'tesseract'
|
||||
|
||||
|
||||
def test_skip_big():
|
||||
def test_skip_big(spoof_tesseract_cache):
|
||||
out = check_ocrmypdf('enormous.pdf', 'test_enormous.pdf',
|
||||
'--skip-big', '10')
|
||||
'--skip-big', '10', env=spoof_tesseract_cache)
|
||||
pdfinfo = pdf_get_all_pageinfo(out)
|
||||
assert pdfinfo[0]['has_text'] == False
|
||||
|
||||
|
||||
Reference in New Issue
Block a user