mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-16 01:02:26 -05:00
Remove test_bad_utf8
Due to difficulties of getting this to work on Python 3.8, Windows, and high probability that this behavior is now gone from Tesseract 4.0+. Originally added in 2017.
This commit is contained in:
@@ -194,12 +194,7 @@ def tesseract_log_output(mainlog, stdout, input_file):
|
||||
try:
|
||||
text = stdout.decode()
|
||||
except UnicodeDecodeError:
|
||||
log.error(
|
||||
"command line output was not utf-8. "
|
||||
+ "This usually means Tesseract's language packs do not match "
|
||||
"the installed version of Tesseract."
|
||||
)
|
||||
text = stdout.decode('utf-8', 'backslashreplace')
|
||||
text = stdout.decode('utf-8', 'ignore')
|
||||
|
||||
lines = text.splitlines()
|
||||
for line in lines:
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from subprocess import DEVNULL, PIPE, run, Popen
|
||||
from subprocess import DEVNULL, PIPE, run, Popen, CalledProcessError
|
||||
|
||||
import pytest
|
||||
|
||||
@@ -115,22 +115,6 @@ def test_bad_locale():
|
||||
assert 'configured to use ASCII as encoding' in err, "should whine"
|
||||
|
||||
|
||||
@pytest.mark.parametrize('renderer', ['hocr', 'sandwich'])
|
||||
def test_bad_utf8(spoof_tess_bad_utf8, renderer, resources, no_outpdf):
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'ccitt.pdf',
|
||||
no_outpdf,
|
||||
'--pdf-renderer',
|
||||
renderer,
|
||||
env=spoof_tess_bad_utf8,
|
||||
)
|
||||
|
||||
assert out == '', "stdout not clean"
|
||||
assert p.returncode != 0
|
||||
assert 'not utf-8' in err, "should whine about utf-8"
|
||||
assert '\\x96' in err, 'should repeat backslash encoded output'
|
||||
|
||||
|
||||
def test_dev_null(spoof_tesseract_noop, resources):
|
||||
p, out, err = run_ocrmypdf(
|
||||
resources / 'trivial.pdf', os.devnull, '--force-ocr', env=spoof_tesseract_noop
|
||||
|
||||
Reference in New Issue
Block a user