Don't UTF-8 decode tesseract --print-parameters output

The output of tesseract --print-parameters is not guaranteed to be UTF-8.

Fixes #543.
This commit is contained in:
James R. Barlow
2020-04-28 02:37:17 -07:00
parent b840b16c82
commit 17cd655752

View File

@@ -77,20 +77,14 @@ def has_textonly_pdf(tesseract_env=None, langs=None):
args_tess = tess_base_args(langs, engine_mode=None) + ['--print-parameters', 'pdf']
params = ''
try:
proc = run(
args_tess,
check=True,
universal_newlines=True,
stdout=PIPE,
stderr=STDOUT,
env=tesseract_env,
)
# print-parameters can return non-UTF8 if the parameters are so initialized
proc = run(args_tess, check=True, stdout=PIPE, stderr=STDOUT, env=tesseract_env)
params = proc.stdout
except CalledProcessError as e:
raise MissingDependencyError(
"Could not --print-parameters from tesseract"
) from e
if 'textonly_pdf' in params:
if b'textonly_pdf' in params:
return True
return False