From 4f964a3c8ad0a97b52fbd0a4e39108497f43cb29 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 23:20:31 -0800 Subject: [PATCH] Introduce --pdf-renderer auto Tesseract 3.03 has various quality problems, such as wrong DPI, that are fixed in Tesseract 3.04. The idea here is to introduce an option to let OCRmyPDF select the rendering backend based on the options and system. However, we're not ready for tesseract as the main renderer. Setting pdf-renderer to tesseract does not pass all test cases, mainly the one where --tesseract-timeout is triggered, and some others. --- ocrmypdf/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 0e249b56..26806f72 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -174,7 +174,7 @@ advanced.add_argument( '--tesseract-config', default=[], type=list, action='append', help="additional Tesseract configuration files") advanced.add_argument( - '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr', + '--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto', help='choose OCR PDF renderer') advanced.add_argument( '--tesseract-timeout', default=180.0, type=float, @@ -216,6 +216,8 @@ if not set(options.language).issubset(tesseract.languages()): # ---------- # Arguments +if options.pdf_renderer == 'auto': + options.pdf_renderer = 'hocr' if any((options.deskew, options.clean, options.clean_final)): try: