From 41eb54cc0a59855aaa2d5d03001f80507d2fb2b6 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 14 May 2020 03:23:25 -0700 Subject: [PATCH] Standardize tesseract.generate_hocr and _pdf parameters --- src/ocrmypdf/_pipeline.py | 8 ++++---- src/ocrmypdf/_validation.py | 4 ++-- src/ocrmypdf/exec/tesseract.py | 35 +++++++++++++++++----------------- tests/conftest.py | 1 - tests/test_main.py | 2 +- tests/test_tesseract.py | 10 +++++----- 6 files changed, 29 insertions(+), 31 deletions(-) diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index 162677df..6350ea69 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -538,8 +538,8 @@ def ocr_tesseract_hocr(input_file, page_context): tesseract.generate_hocr( input_file=input_file, output_hocr=hocr_out, - output_sidecar=hocr_text_out, - language=options.language, + output_text=hocr_text_out, + languages=options.language, engine_mode=options.tesseract_oem, tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, @@ -615,10 +615,10 @@ def ocr_tesseract_textonly_pdf(input_image, page_context): output_text = page_context.get_path('ocr_tess.txt') options = page_context.options tesseract.generate_pdf( - input_image=input_image, + input_file=input_image, output_pdf=output_pdf, output_text=output_text, - language=options.language, + languages=options.language, engine_mode=options.tesseract_oem, tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index 2fc85928..4ef6fcdf 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -84,12 +84,12 @@ def check_options_languages(options): options.language = options.language[0].split('+') languages = set(options.language) - if not languages.issubset(tesseract.languages(options.tesseract_env)): + if not languages.issubset(tesseract.get_languages(options.tesseract_env)): msg = ( "The installed version of tesseract does not have language " "data for the following requested languages: \n" ) - for lang in languages - tesseract.languages(options.tesseract_env): + for lang in languages - tesseract.get_languages(options.tesseract_env): msg += lang + '\n' raise MissingDependencyError(msg) diff --git a/src/ocrmypdf/exec/tesseract.py b/src/ocrmypdf/exec/tesseract.py index 57450937..39046ce5 100644 --- a/src/ocrmypdf/exec/tesseract.py +++ b/src/ocrmypdf/exec/tesseract.py @@ -89,7 +89,8 @@ def has_textonly_pdf(tesseract_env=None, langs=None): params = proc.stdout except CalledProcessError as e: raise MissingDependencyError( - "Could not --print-parameters from tesseract" + "Could not --print-parameters from tesseract. This can happen if the " + "TESSDATA_PREFIX environment is not set to a valid tessdata folder. " ) from e if 'textonly_pdf' in params: return True @@ -105,7 +106,7 @@ def has_user_words(tesseract_env=None): return version(tesseract_env) >= '4.1' -def languages(tesseract_env=None): +def get_languages(tesseract_env=None): def lang_error(output): msg = ( "Tesseract failed to report available languages.\n" @@ -232,21 +233,21 @@ def page_timedout(timeout): log.warning("[tesseract] took too long to OCR - skipping") -def _generate_null_hocr(output_hocr, output_sidecar, image): +def _generate_null_hocr(output_hocr, output_text, image): """Produce a .hocr file that reports no text detected on a page that is the same size as the input image.""" with Image.open(image) as im: w, h = im.size output_hocr.write_text(HOCR_TEMPLATE.format(w, h), encoding='utf-8') - output_sidecar.write_text('[skipped page]', encoding='utf-8') + output_text.write_text('[skipped page]', encoding='utf-8') def generate_hocr( input_file: Path, output_hocr: Path, - output_sidecar: Path, - language: list, + output_text: Path, + languages: list, engine_mode, tessconfig: list, timeout: float, @@ -257,7 +258,7 @@ def generate_hocr( ): prefix = output_hocr.with_suffix('') - args_tesseract = tess_base_args(language, engine_mode) + args_tesseract = tess_base_args(languages, engine_mode) if pagesegmode is not None: args_tesseract.extend(['--psm', str(pagesegmode)]) @@ -286,11 +287,11 @@ def generate_hocr( # Temporary workaround to hocrTransform not being able to function if # it does not have a valid hOCR file. page_timedout(timeout) - _generate_null_hocr(output_hocr, output_sidecar, input_file) + _generate_null_hocr(output_hocr, output_text, input_file) except CalledProcessError as e: tesseract_log_output(e.output) if b'Image too large' in e.output: - _generate_null_hocr(output_hocr, output_sidecar, input_file) + _generate_null_hocr(output_hocr, output_text, input_file) return raise SubprocessOutputError() from e @@ -299,7 +300,7 @@ def generate_hocr( # The sidecar text file will get the suffix .txt; rename it to # whatever caller wants it named if prefix.with_suffix('.txt').exists(): - shutil.move(prefix.with_suffix('.txt'), output_sidecar) + shutil.move(prefix.with_suffix('.txt'), output_text) def use_skip_page(output_pdf, output_text): @@ -311,10 +312,10 @@ def use_skip_page(output_pdf, output_text): def generate_pdf( *, - input_image: Path, + input_file: Path, output_pdf: Path, output_text: Path, - language: List[str], + languages: List[str], engine_mode, tessconfig: List[str], timeout: float, @@ -325,22 +326,20 @@ def generate_pdf( ): """Use Tesseract to render a PDF. - input_image -- image to analyze + input_file -- image to analyze output_pdf -- file to generate output_text -- OCR text file - language -- list of languages to consider + languages -- list of languages to consider engine_mode -- engine mode argument for tess v4 tessconfig -- tesseract configuration timeout -- timeout (seconds) - log -- logger object """ - args_tesseract = tess_base_args(language, engine_mode) + args_tesseract = tess_base_args(languages, engine_mode) if pagesegmode is not None: args_tesseract.extend(['--psm', str(pagesegmode)]) - # has_textonly_pdf(tesseract_env=tesseract_env, langs=language) args_tesseract.extend(['-c', 'textonly_pdf=1']) if user_words: @@ -354,7 +353,7 @@ def generate_pdf( # Reminder: test suite tesseract spoofers might break after any changes # to the number of order parameters here - args_tesseract.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig) + args_tesseract.extend([input_file, prefix, 'pdf', 'txt'] + tessconfig) try: p = run( args_tesseract, diff --git a/tests/conftest.py b/tests/conftest.py index 6ea95e96..8eaede2a 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -241,7 +241,6 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None): [str(input_file), str(output_file)] + [str(arg) for arg in args if arg is not None] ) - if env: options.tesseract_env = env.copy() options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file) diff --git a/tests/test_main.py b/tests/test_main.py index 0fc11e76..c3a715ce 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -233,7 +233,7 @@ def test_german(spoof_tesseract_cache, resources, outdir): env=spoof_tesseract_cache, ) except MissingDependencyError: - if 'deu' not in tesseract.languages(): + if 'deu' not in tesseract.get_languages(): pytest.xfail(reason="tesseract-deu language pack not installed") raise diff --git a/tests/test_tesseract.py b/tests/test_tesseract.py index 9c37dde1..ffe3fe31 100644 --- a/tests/test_tesseract.py +++ b/tests/test_tesseract.py @@ -77,7 +77,7 @@ def test_no_languages(tmp_path): env['TESSDATA_PREFIX'] = fspath(tmp_path) with pytest.raises(MissingDependencyError): - tesseract.languages(tesseract_env=env) + tesseract.get_languages(tesseract_env=env) def test_image_too_large_hocr(monkeypatch, resources, outdir): @@ -88,8 +88,8 @@ def test_image_too_large_hocr(monkeypatch, resources, outdir): tesseract.generate_hocr( input_file=resources / 'crom.png', output_hocr=outdir / 'out.hocr', - output_sidecar=outdir / 'out.txt', - language=['eng'], + output_text=outdir / 'out.txt', + languages=['eng'], engine_mode=None, tessconfig=[], timeout=180.0, @@ -107,10 +107,10 @@ def test_image_too_large_pdf(monkeypatch, resources, outdir): monkeypatch.setattr(tesseract, 'run', dummy_run) tesseract.generate_pdf( - input_image=resources / 'crom.png', + input_file=resources / 'crom.png', output_pdf=outdir / 'pdf.pdf', output_text=outdir / 'txt.txt', - language=['eng'], + languages=['eng'], engine_mode=None, tessconfig=[], timeout=180.0,