Remove test_bad_utf8

Due to difficulties of getting this to work on Python 3.8, Windows, and high probability that this behavior is now gone from Tesseract 4.0+. Originally added in 2017.
2026-02-16 01:02:26 -05:00 · 2019-12-04 15:00:12 -08:00
parent d0301813cc
commit 9db01c7ff5
2 changed files with 2 additions and 23 deletions
--- a/src/ocrmypdf/exec/tesseract.py
+++ b/src/ocrmypdf/exec/tesseract.py
@@ -194,12 +194,7 @@ def tesseract_log_output(mainlog, stdout, input_file):
    try:
        text = stdout.decode()
    except UnicodeDecodeError:
-        log.error(
-            "command line output was not utf-8. "
-            + "This usually means Tesseract's language packs do not match "
-            "the installed version of Tesseract."
-        )
-        text = stdout.decode('utf-8', 'backslashreplace')
+        text = stdout.decode('utf-8', 'ignore')

    lines = text.splitlines()
    for line in lines:
--- a/tests/test_stdio.py
+++ b/tests/test_stdio.py
@@ -18,7 +18,7 @@
 import os
 import sys
 from pathlib import Path
-from subprocess import DEVNULL, PIPE, run, Popen
+from subprocess import DEVNULL, PIPE, run, Popen, CalledProcessError

 import pytest

@@ -115,22 +115,6 @@ def test_bad_locale():
    assert 'configured to use ASCII as encoding' in err, "should whine"


-@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])
-def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
-    p, out, err = run_ocrmypdf(
-        resources / 'ccitt.pdf',
-        no_outpdf,
-        '--pdf-renderer',
-        renderer,
-        env=spoof_tess_bad_utf8,
-    )
-
-    assert out == '', "stdout not clean"
-    assert p.returncode != 0
-    assert 'not utf-8' in err, "should whine about utf-8"
-    assert '\\x96' in err, 'should repeat backslash encoded output'
-
-
 def test_dev_null(spoof_tesseract_noop, resources):
    p, out, err = run_ocrmypdf(
        resources / 'trivial.pdf', os.devnull, '--force-ocr', env=spoof_tesseract_noop