Remove test_bad_utf8

Due to difficulties of getting this to work on Python 3.8, Windows, and
high probability that this behavior is now gone from Tesseract 4.0+.

Originally added in 2017.
This commit is contained in:
James R. Barlow
2019-12-04 15:00:12 -08:00
parent d0301813cc
commit 9db01c7ff5
2 changed files with 2 additions and 23 deletions

View File

@@ -194,12 +194,7 @@ def tesseract_log_output(mainlog, stdout, input_file):
try:
text = stdout.decode()
except UnicodeDecodeError:
log.error(
"command line output was not utf-8. "
+ "This usually means Tesseract's language packs do not match "
"the installed version of Tesseract."
)
text = stdout.decode('utf-8', 'backslashreplace')
text = stdout.decode('utf-8', 'ignore')
lines = text.splitlines()
for line in lines:

View File

@@ -18,7 +18,7 @@
import os
import sys
from pathlib import Path
from subprocess import DEVNULL, PIPE, run, Popen
from subprocess import DEVNULL, PIPE, run, Popen, CalledProcessError
import pytest
@@ -115,22 +115,6 @@ def test_bad_locale():
assert 'configured to use ASCII as encoding' in err, "should whine"
@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])
def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
p, out, err = run_ocrmypdf(
resources / 'ccitt.pdf',
no_outpdf,
'--pdf-renderer',
renderer,
env=spoof_tess_bad_utf8,
)
assert out == '', "stdout not clean"
assert p.returncode != 0
assert 'not utf-8' in err, "should whine about utf-8"
assert '\\x96' in err, 'should repeat backslash encoded output'
def test_dev_null(spoof_tesseract_noop, resources):
p, out, err = run_ocrmypdf(
resources / 'trivial.pdf', os.devnull, '--force-ocr', env=spoof_tesseract_noop