From 17cd655752fba4e261a4a1260fa33bef22428b11 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Tue, 28 Apr 2020 02:37:17 -0700
Subject: [PATCH] Don't utf-8 decode tesseract --print-parameters

Output not guaranteed to be UTF-8. Fixes #543.
---
 src/ocrmypdf/exec/tesseract.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/src/ocrmypdf/exec/tesseract.py b/src/ocrmypdf/exec/tesseract.py
index abcbb2fa..c8a16f42 100644
--- a/src/ocrmypdf/exec/tesseract.py
+++ b/src/ocrmypdf/exec/tesseract.py
@@ -77,20 +77,14 @@ def has_textonly_pdf(tesseract_env=None, langs=None):
     args_tess = tess_base_args(langs, engine_mode=None) + ['--print-parameters', 'pdf']
     params = ''
     try:
-        proc = run(
-            args_tess,
-            check=True,
-            universal_newlines=True,
-            stdout=PIPE,
-            stderr=STDOUT,
-            env=tesseract_env,
-        )
+        # print-parameters can return non-UTF8 if the parameters are so initialized
+        proc = run(args_tess, check=True, stdout=PIPE, stderr=STDOUT, env=tesseract_env)
         params = proc.stdout
     except CalledProcessError as e:
         raise MissingDependencyError(
             "Could not --print-parameters from tesseract"
         ) from e
-    if 'textonly_pdf' in params:
+    if b'textonly_pdf' in params:
         return True
     return False
 