From 74059eecf1240961e64d84a2a34ae95091a369cf Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 01:48:10 -0800 Subject: [PATCH 1/5] Choose PDF/A-2b by default instead of A-1b --- RELEASE_NOTES.rst | 8 ++++++++ ocrmypdf/ghostscript.py | 2 +- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/RELEASE_NOTES.rst b/RELEASE_NOTES.rst index 83a6b004..c4a901f6 100644 --- a/RELEASE_NOTES.rst +++ b/RELEASE_NOTES.rst @@ -5,6 +5,14 @@ Please always read this file before installing the package Download software here: https://github.com/jbarlow83/OCRmyPDF/tags + +Latest: +======= + +- Default output format is now PDF/A-2b instead of PDF/A-1b +- Python 3.5 and OS X El Capitan are now supported platforms - no changes were + needed to implement support + v3.0: ===== diff --git a/ocrmypdf/ghostscript.py b/ocrmypdf/ghostscript.py index 638e9a3e..6ad06765 100644 --- a/ocrmypdf/ghostscript.py +++ b/ocrmypdf/ghostscript.py @@ -44,7 +44,7 @@ def generate_pdfa(pdf_pages, output_file, threads=1): "-sDEVICE=pdfwrite", "-sColorConversionStrategy=/RGB", "-sProcessColorModel=DeviceRGB", - "-dPDFA", + "-dPDFA=2", "-sPDFACompatibilityPolicy=2", "-sOutputICCProfile=srgb.icc", "-sOutputFile=" + gs_pdf.name, From 80d89b54208c911f56a3949a7f07b8f5cc56e7df Mon Sep 17 00:00:00 2001 From: "James R. 
Barlow" Date: Wed, 2 Dec 2015 02:19:39 -0800 Subject: [PATCH 2/5] Set /Creator metadata to OCRmyPDF with reference to Tess version and settings --- ocrmypdf/main.py | 11 +++++++---- ocrmypdf/pdfa.py | 2 ++ 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index a9642891..0e249b56 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -39,9 +39,7 @@ warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning) BASEDIR = os.path.dirname(os.path.realpath(__file__)) -JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove')) -JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar') -JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf') +VERSION = '3.0' # ------------- @@ -103,7 +101,7 @@ check_pil_encoder('zlib', 'PNG') parser = cmdline.get_argparse( prog="ocrmypdf", description="Generate searchable PDF file from an image-only PDF file.", - version='3.0', + version=VERSION, fromfile_prefix_chars='@', ignored_args=[ 'touch_files_only', 'recreate_database', 'checksum_file_name', @@ -758,6 +756,11 @@ def generate_postscript_stub( if options.subject: pdfmark['subject'] = options.subject + pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format( + parser.prog, VERSION, + '+PDF' if options.pdf_renderer == 'tesseract' else '', + tesseract.version()) + generate_pdfa_def(output_file, pdfmark) diff --git a/ocrmypdf/pdfa.py b/ocrmypdf/pdfa.py index 2eab70de..b7219fc8 100644 --- a/ocrmypdf/pdfa.py +++ b/ocrmypdf/pdfa.py @@ -27,6 +27,7 @@ def /Author <$author> /Subject <$subject> /Keywords <$keywords> + /Creator <$creator> /DOCINFO pdfmark % Define an ICC profile : @@ -86,6 +87,7 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark): title=pdfmark_utf16.get('title', ''), author=pdfmark_utf16.get('author', ''), subject=pdfmark_utf16.get('subject', ''), + creator=pdfmark_utf16.get('creator', ''), keywords=pdfmark_utf16.get('keywords', '')) return result From d6124c17878a99f8ed7dde24a7c83c35cca79899 Mon Sep 17 
00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 03:12:52 -0800 Subject: [PATCH 3/5] pageinfo: improve robustness of text test for Tesseract produced PDFs --- ocrmypdf/pageinfo.py | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index ff9b1452..7923ccc6 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -104,6 +104,24 @@ def _find_page_images(page, pageinfo): yield image +def _page_has_text(pdf, page): + # Simple test + text = page.extractText() + if text.strip() != '': + return True + + # More nuanced test to deal with quirks of Tesseract PDF generation + # Check if there's a Glyphless font + font = page['/Resources']['/Font'] + font_objects = list(font.keys()) + for font_object in font_objects: + basefont = font[font_object]['/BaseFont'] + if basefont.endswith('GlyphLessFont'): + return True + + return False + + def _pdf_get_pageinfo(infile, page: int): pageinfo = {} pageinfo['pageno'] = page @@ -112,8 +130,7 @@ def _pdf_get_pageinfo(infile, page: int): pdf = pypdf.PdfFileReader(infile) page = pdf.pages[page - 1] - text = page.extractText() - pageinfo['has_text'] = (text.strip() != '') + pageinfo['has_text'] = _page_has_text(pdf, page) width_pt = page['/MediaBox'][2] - page['/MediaBox'][0] height_pt = page['/MediaBox'][3] - page['/MediaBox'][1] From df1fda74388edd2125852db7cf2dde9840c3e7e7 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 23:16:36 -0800 Subject: [PATCH 4/5] pageinfo: workaround PyPDF extractText limitations on hidden text It appears that extractText() does not find all text. At a glance it may be that Tesseract's PDF renderer generates a font and uses glyphs that map to different Unicode code points than PyPDF expects, so it discards the content and finds nothing. As a proxy in lieu of better PDF parsing, assume that a "GlyphLessFont" means there is text there. 
I had previously found it does not work to check for the presence of a font on the page. Some PDF generators create a font resource entry even if the font is never called for. --- ocrmypdf/pageinfo.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index 7923ccc6..d1d76929 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -112,12 +112,16 @@ def _page_has_text(pdf, page): # More nuanced test to deal with quirks of Tesseract PDF generation # Check if there's a Glyphless font - font = page['/Resources']['/Font'] - font_objects = list(font.keys()) - for font_object in font_objects: - basefont = font[font_object]['/BaseFont'] - if basefont.endswith('GlyphLessFont'): - return True + try: + font = page['/Resources']['/Font'] + except KeyError: + pass + else: + font_objects = list(font.keys()) + for font_object in font_objects: + basefont = font[font_object]['/BaseFont'] + if basefont.endswith('GlyphLessFont'): + return True return False From 4f964a3c8ad0a97b52fbd0a4e39108497f43cb29 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Wed, 2 Dec 2015 23:20:31 -0800 Subject: [PATCH 5/5] Introduce --pdf-renderer auto Tess 3.03 has various quality problems like wrong DPI that are fixed in Tess 3.04. The idea here is to introduce an option to let OCRmyPDF select the rendering backend based on the options and system. However, we're not ready for tesseract as the main renderer. Setting pdf-renderer to tesseract does not pass all test cases, mainly the one where --tesseract-timeout is triggered, and some others. 
--- ocrmypdf/main.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index 0e249b56..26806f72 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -174,7 +174,7 @@ advanced.add_argument( '--tesseract-config', default=[], type=list, action='append', help="additional Tesseract configuration files") advanced.add_argument( - '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr', + '--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto', help='choose OCR PDF renderer') advanced.add_argument( '--tesseract-timeout', default=180.0, type=float, @@ -216,6 +216,8 @@ if not set(options.language).issubset(tesseract.languages()): # ---------- # Arguments +if options.pdf_renderer == 'auto': + options.pdf_renderer = 'hocr' if any((options.deskew, options.clean, options.clean_final)): try: