From 32326438094660ef6341ee37585f019fbdb7e79a Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 13 Jun 2017 10:18:08 -0700 Subject: [PATCH] =?UTF-8?q?Support=20=E2=80=9Ctextonly=20PDF=E2=80=9D=20re?= =?UTF-8?q?nderer=20in=20Tesseract=203.05.01?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrmypdf/__main__.py | 5 ++--- ocrmypdf/exec/tesseract.py | 40 ++++++++++++++++++++------------------ 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index c83508b7..ba1640ff 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -327,10 +327,9 @@ def check_options_output(options, log): " recommended for builds of tesseract 4.00.00alpha older than" " February 2017. Make sure you are using a recent build.") - if options.debug_rendering and options.pdf_renderer == 'tesseract': + if options.debug_rendering and options.pdf_renderer != 'hocr': log.info( - "Ignoring --debug-rendering because it is not supported with" - "--pdf-renderer=tesseract.") + "Ignoring --debug-rendering because it requires --pdf-renderer=hocr") lossless_reconstruction = False if options.pdf_renderer in ('hocr', 'tess4'): diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index b7a77c3a..cbc75e2a 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -63,26 +63,28 @@ def v4(): return (version() >= '4') +@lru_cache(maxsize=1) def has_textonly_pdf(): - if version() == '4.00.00alpha': - # textonly_pdf added during the 4.00.00alpha cycle, so we must test - # more carefully to see if it is present - args_tess = [ - get_program('tesseract'), - '--print-parameters' - ] - try: - params = check_output( - args_tess, close_fds=True, universal_newlines=True, - stderr=STDOUT) - except CalledProcessError as e: - print("Could not --print-parameters from tesseract", - file=sys.stderr) - raise MissingDependencyError from e - if 'textonly_pdf' in params: - return True - else: - return v4() + """Does Tesseract have textonly_pdf capability? + + Available in 3.05.01, and v4.00.00alpha since January 2017. Best to + parse the parameter list + """ + args_tess = [ + get_program('tesseract'), + '--print-parameters' + ] + try: + params = check_output( + args_tess, close_fds=True, universal_newlines=True, + stderr=STDOUT) + except CalledProcessError as e: + print("Could not --print-parameters from tesseract", + file=sys.stderr) + raise MissingDependencyError from e + if 'textonly_pdf' in params: + return True + return False def psm():