diff --git a/ocrmypdf/__main__.py b/ocrmypdf/__main__.py index 9ad7749b..b2da5b21 100755 --- a/ocrmypdf/__main__.py +++ b/ocrmypdf/__main__.py @@ -281,6 +281,15 @@ advanced.add_argument( "are applied to all pages, including those for which OCR was " "skipped. Not supported for --output-type=pdf ; that setting " "preserves the original compression of all images.") +advanced.add_argument( + '--user-words', metavar='FILE', + help="Specify the location of the Tesseract user words file. This is a " + "list of words Tesseract should consider while performing OCR in " + "addition to its standard language dictionaries. This can improve " + "OCR quality especially for specialized and technical documents.") +advanced.add_argument( + '--user-patterns', metavar='FILE', + help="Specify the location of the Tesseract user patterns file.") debugging = parser.add_argument_group( "Debugging", diff --git a/ocrmypdf/exec/tesseract.py b/ocrmypdf/exec/tesseract.py index cbc75e2a..87d655e4 100644 --- a/ocrmypdf/exec/tesseract.py +++ b/ocrmypdf/exec/tesseract.py @@ -213,7 +213,8 @@ def _generate_null_hocr(output_hocr, output_sidecar, image): def generate_hocr(input_file, output_files, language: list, engine_mode, tessconfig: list, - timeout: float, pagesegmode: int, log): + timeout: float, pagesegmode: int, user_words, user_patterns, + log): output_hocr = next(o for o in output_files if o.endswith('.hocr')) output_sidecar = next(o for o in output_files if o.endswith('.txt')) @@ -224,10 +225,17 @@ def generate_hocr(input_file, output_files, language: list, engine_mode, if pagesegmode is not None: args_tesseract.extend([psm(), str(pagesegmode)]) + if user_words: + args_tesseract.extend(['--user-words', user_words]) + + if user_patterns: + args_tesseract.extend(['--user-patterns', user_patterns]) + # Reminder: test suite tesseract spoofers will break after any changes # to the number of order parameters here # Tesseract 3.04 requires the order here to be "hocr txt" and will fail # on "txt hocr" + args_tesseract.extend([ input_file, prefix, @@ -285,7 +293,8 @@ def use_skip_page(text_only, skip_pdf, output_pdf, output_text): def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text, language: list, engine_mode, text_only: bool, - tessconfig: list, timeout: float, pagesegmode: int, log): + tessconfig: list, timeout: float, pagesegmode: int, + user_words, user_patterns, log): '''Use Tesseract to render a PDF. input_image -- image to analyze @@ -308,10 +317,17 @@ def generate_pdf(*, input_image, skip_pdf, output_pdf, output_text, if text_only: args_tesseract.extend(['-c', 'textonly_pdf=1']) + if user_words: + args_tesseract.extend(['--user-words', user_words]) + + if user_patterns: + args_tesseract.extend(['--user-patterns', user_patterns]) + prefix = os.path.splitext(output_pdf)[0] # Tesseract appends suffixes - # Reminder: test suite tesseract spoofers will break after any changes + # Reminder: test suite tesseract spoofers might break after any changes # to the number of order parameters here + args_tesseract.extend([ input_image, prefix, diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 32f0d753..891685b9 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -525,6 +525,8 @@ def ocr_tesseract_hocr( tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, pagesegmode=options.tesseract_pagesegmode, + user_words=options.user_words, + user_patterns=options.user_patterns, log=log ) @@ -755,6 +757,8 @@ def ocr_tesseract_and_render_pdf( tessconfig=options.tesseract_config, timeout=options.tesseract_timeout, pagesegmode=options.tesseract_pagesegmode, + user_words=options.user_words, + user_patterns=options.user_patterns, log=log)