Standardize tesseract.generate_hocr and tesseract.generate_pdf parameters

2026-05-19 20:14:53 -04:00 · 2020-05-14 03:23:25 -07:00
parent 12a2f78c4d
commit 41eb54cc0a
6 changed files with 29 additions and 31 deletions
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -538,8 +538,8 @@ def ocr_tesseract_hocr(input_file, page_context):
    tesseract.generate_hocr(
        input_file=input_file,
        output_hocr=hocr_out,
-        output_sidecar=hocr_text_out,
-        language=options.language,
+        output_text=hocr_text_out,
+        languages=options.language,
        engine_mode=options.tesseract_oem,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
@@ -615,10 +615,10 @@ def ocr_tesseract_textonly_pdf(input_image, page_context):
    output_text = page_context.get_path('ocr_tess.txt')
    options = page_context.options
    tesseract.generate_pdf(
-        input_image=input_image,
+        input_file=input_image,
        output_pdf=output_pdf,
        output_text=output_text,
-        language=options.language,
+        languages=options.language,
        engine_mode=options.tesseract_oem,
        tessconfig=options.tesseract_config,
        timeout=options.tesseract_timeout,
--- a/src/ocrmypdf/_validation.py
+++ b/src/ocrmypdf/_validation.py
@@ -84,12 +84,12 @@ def check_options_languages(options):
        options.language = options.language[0].split('+')

    languages = set(options.language)
-    if not languages.issubset(tesseract.languages(options.tesseract_env)):
+    if not languages.issubset(tesseract.get_languages(options.tesseract_env)):
        msg = (
            "The installed version of tesseract does not have language "
            "data for the following requested languages: \n"
        )
-        for lang in languages - tesseract.languages(options.tesseract_env):
+        for lang in languages - tesseract.get_languages(options.tesseract_env):
            msg += lang + '\n'
        raise MissingDependencyError(msg)

--- a/src/ocrmypdf/exec/tesseract.py
+++ b/src/ocrmypdf/exec/tesseract.py
@@ -89,7 +89,8 @@ def has_textonly_pdf(tesseract_env=None, langs=None):
        params = proc.stdout
    except CalledProcessError as e:
        raise MissingDependencyError(
-            "Could not --print-parameters from tesseract"
+            "Could not --print-parameters from tesseract. This can happen if the "
+            "TESSDATA_PREFIX environment is not set to a valid tessdata folder. "
        ) from e
    if 'textonly_pdf' in params:
        return True
@@ -105,7 +106,7 @@ def has_user_words(tesseract_env=None):
    return version(tesseract_env) >= '4.1'


-def languages(tesseract_env=None):
+def get_languages(tesseract_env=None):
    def lang_error(output):
        msg = (
            "Tesseract failed to report available languages.\n"
@@ -232,21 +233,21 @@ def page_timedout(timeout):
    log.warning("[tesseract] took too long to OCR - skipping")


-def _generate_null_hocr(output_hocr, output_sidecar, image):
+def _generate_null_hocr(output_hocr, output_text, image):
    """Produce a .hocr file that reports no text detected on a page that is
    the same size as the input image."""
    with Image.open(image) as im:
        w, h = im.size

    output_hocr.write_text(HOCR_TEMPLATE.format(w, h), encoding='utf-8')
-    output_sidecar.write_text('[skipped page]', encoding='utf-8')
+    output_text.write_text('[skipped page]', encoding='utf-8')


 def generate_hocr(
    input_file: Path,
    output_hocr: Path,
-    output_sidecar: Path,
-    language: list,
+    output_text: Path,
+    languages: list,
    engine_mode,
    tessconfig: list,
    timeout: float,
@@ -257,7 +258,7 @@ def generate_hocr(
 ):
    prefix = output_hocr.with_suffix('')

-    args_tesseract = tess_base_args(language, engine_mode)
+    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])
@@ -286,11 +287,11 @@ def generate_hocr(
        # Temporary workaround to hocrTransform not being able to function if
        # it does not have a valid hOCR file.
        page_timedout(timeout)
-        _generate_null_hocr(output_hocr, output_sidecar, input_file)
+        _generate_null_hocr(output_hocr, output_text, input_file)
    except CalledProcessError as e:
        tesseract_log_output(e.output)
        if b'Image too large' in e.output:
-            _generate_null_hocr(output_hocr, output_sidecar, input_file)
+            _generate_null_hocr(output_hocr, output_text, input_file)
            return

        raise SubprocessOutputError() from e
@@ -299,7 +300,7 @@ def generate_hocr(
        # The sidecar text file will get the suffix .txt; rename it to
        # whatever caller wants it named
        if prefix.with_suffix('.txt').exists():
-            shutil.move(prefix.with_suffix('.txt'), output_sidecar)
+            shutil.move(prefix.with_suffix('.txt'), output_text)


 def use_skip_page(output_pdf, output_text):
@@ -311,10 +312,10 @@ def use_skip_page(output_pdf, output_text):

 def generate_pdf(
    *,
-    input_image: Path,
+    input_file: Path,
    output_pdf: Path,
    output_text: Path,
-    language: List[str],
+    languages: List[str],
    engine_mode,
    tessconfig: List[str],
    timeout: float,
@@ -325,22 +326,20 @@ def generate_pdf(
 ):
    """Use Tesseract to render a PDF.

-    input_image -- image to analyze
+    input_file -- image to analyze
    output_pdf -- file to generate
    output_text -- OCR text file
-    language -- list of languages to consider
+    languages -- list of languages to consider
    engine_mode -- engine mode argument for tess v4
    tessconfig -- tesseract configuration
    timeout -- timeout (seconds)
-    log -- logger object
    """

-    args_tesseract = tess_base_args(language, engine_mode)
+    args_tesseract = tess_base_args(languages, engine_mode)

    if pagesegmode is not None:
        args_tesseract.extend(['--psm', str(pagesegmode)])

-    # has_textonly_pdf(tesseract_env=tesseract_env, langs=language)
    args_tesseract.extend(['-c', 'textonly_pdf=1'])

    if user_words:
@@ -354,7 +353,7 @@ def generate_pdf(
    # Reminder: test suite tesseract spoofers might break after any changes
    # to the number of order parameters here

-    args_tesseract.extend([input_image, prefix, 'pdf', 'txt'] + tessconfig)
+    args_tesseract.extend([input_file, prefix, 'pdf', 'txt'] + tessconfig)
    try:
        p = run(
            args_tesseract,
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -241,7 +241,6 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
        [str(input_file), str(output_file)]
        + [str(arg) for arg in args if arg is not None]
    )
-
    if env:
        options.tesseract_env = env.copy()
        options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -233,7 +233,7 @@ def test_german(spoof_tesseract_cache, resources, outdir):
            env=spoof_tesseract_cache,
        )
    except MissingDependencyError:
-        if 'deu' not in tesseract.languages():
+        if 'deu' not in tesseract.get_languages():
            pytest.xfail(reason="tesseract-deu language pack not installed")
        raise

--- a/tests/test_tesseract.py
+++ b/tests/test_tesseract.py
@@ -77,7 +77,7 @@ def test_no_languages(tmp_path):
    env['TESSDATA_PREFIX'] = fspath(tmp_path)

    with pytest.raises(MissingDependencyError):
-        tesseract.languages(tesseract_env=env)
+        tesseract.get_languages(tesseract_env=env)


 def test_image_too_large_hocr(monkeypatch, resources, outdir):
@@ -88,8 +88,8 @@ def test_image_too_large_hocr(monkeypatch, resources, outdir):
    tesseract.generate_hocr(
        input_file=resources / 'crom.png',
        output_hocr=outdir / 'out.hocr',
-        output_sidecar=outdir / 'out.txt',
-        language=['eng'],
+        output_text=outdir / 'out.txt',
+        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,
@@ -107,10 +107,10 @@ def test_image_too_large_pdf(monkeypatch, resources, outdir):

    monkeypatch.setattr(tesseract, 'run', dummy_run)
    tesseract.generate_pdf(
-        input_image=resources / 'crom.png',
+        input_file=resources / 'crom.png',
        output_pdf=outdir / 'pdf.pdf',
        output_text=outdir / 'txt.txt',
-        language=['eng'],
+        languages=['eng'],
        engine_mode=None,
        tessconfig=[],
        timeout=180.0,