From 37ebcadfa16ffaacd1c1120d1b246e056efbf34d Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 9 May 2017 17:54:56 -0700 Subject: [PATCH] =?UTF-8?q?Implement=20=E2=80=94user-words,=20=E2=80=94use?= =?UTF-8?q?r-patterns?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- ocrmypdf/__main__.py | 9 +++++++++ ocrmypdf/exec/tesseract.py | 18 ++++++++++++++++-- ocrmypdf/pipeline.py | 4 ++++ 3 files changed, 29 insertions(+), 2 deletions(-) diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index dcb979a3..7cd29396 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -255,6 +255,15 @@ advanced.add_argument( "are applied to all pages, including those for which OCR was " "skipped. Not supported for --output-type=pdf ; that setting " "preserves the original compression of all images.") +advanced.add_argument( + '--user-words', metavar='FILE', + help="Specify the location of the Tesseract user words file. This is a " + "list of words Tesseract should consider while performing OCR in " + "addition to its standard language dictionaries. This can improve " + "OCR quality especially for specialized and technical documents.") +advanced.add_argument( + '--user-patterns', metavar='FILE', + help="Specify the location of the Tesseract user patterns file.") debugging = parser.add_argument_group( "Debugging", diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index 21e2cc4f..a3559abd 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -214,7 +214,8 @@ def _generate_null_hocr(output_hocr, image): def generate_hocr(input_file, output_hocr, language: list, engine_mode, tessconfig: list, - timeout: float, pagesegmode: int, log): + timeout: float, pagesegmode: int, user_words, user_patterns, + log): badxml = os.path.splitext(output_hocr)[0] + '.badxml' @@ -223,6 +224,12 @@ def generate_hocr(input_file, output_hocr, language: list, engine_mode, if pagesegmode is not None: args_tesseract.extend([psm(), str(pagesegmode)]) + if user_words: + args_tesseract.extend(['--user-words', user_words]) + + if user_patterns: + args_tesseract.extend(['--user-patterns', user_patterns]) + args_tesseract.extend([ input_file, badxml, @@ -293,7 +300,8 @@ def use_skip_page(text_only, skip_pdf, output_pdf): def generate_pdf(input_image, skip_pdf, output_pdf, language: list, engine_mode, text_only: bool, - tessconfig: list, timeout: float, pagesegmode: int, log): + tessconfig: list, timeout: float, pagesegmode: int, + user_words, user_patterns, log): '''Use Tesseract to render a PDF. input_image -- image to analyze @@ -315,6 +323,12 @@ def generate_pdf(input_image, skip_pdf, output_pdf, language: list, if text_only: args_tesseract.extend(['-c', 'textonly_pdf=1']) + if user_words: + args_tesseract.extend(['--user-words', user_words]) + + if user_patterns: + args_tesseract.extend(['--user-patterns', user_patterns]) + args_tesseract.extend([ input_image, os.path.splitext(output_pdf)[0], # Tesseract appends suffix diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index beafe1b4..c5832ed8 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -495,6 +495,8 @@ def ocr_tesseract_hocr( tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, pagesegmode=options.tesseract_pagesegmode, + user_words=options.user_words, + user_patterns=options.user_patterns, log=log ) @@ -703,6 +705,8 @@ def ocr_tesseract_and_render_pdf( tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, pagesegmode=options.tesseract_pagesegmode, + user_words=options.user_words, + user_patterns=options.user_patterns, log=log)