Merge branch 'feature/pdfa-2' into develop

2026-05-18 19:47:48 -04:00 · 2015-12-04 04:04:08 -08:00
parent ee7d97ae8c 4f964a3c8a
commit f1b2f1ae08
5 changed files with 44 additions and 8 deletions
--- a/RELEASE_NOTES.rst
+++ b/RELEASE_NOTES.rst
@@ -5,6 +5,14 @@ Please always read this file before installing the package

 Download software here: https://github.com/jbarlow83/OCRmyPDF/tags

+
+Latest:
+=======
+
+-  Default output format is now PDF/A-2b instead of PDF/A-1b
+-  Python 3.5 and OS X El Capitan are now supported platforms - no changes were
+   needed to implement support
+
 v3.0:
 =====

--- a/ocrmypdf/ghostscript.py
+++ b/ocrmypdf/ghostscript.py
@@ -44,7 +44,7 @@ def generate_pdfa(pdf_pages, output_file, threads=1):
            "-sDEVICE=pdfwrite",
            "-sColorConversionStrategy=/RGB",
            "-sProcessColorModel=DeviceRGB",
-            "-dPDFA",
+            "-dPDFA=2",
            "-sPDFACompatibilityPolicy=2",
            "-sOutputICCProfile=srgb.icc",
            "-sOutputFile=" + gs_pdf.name,
--- a/ocrmypdf/main.py
+++ b/ocrmypdf/main.py
@@ -39,9 +39,7 @@ warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)


 BASEDIR = os.path.dirname(os.path.realpath(__file__))
-JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove'))
-JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
-JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
+VERSION = '3.0'


 # -------------
@@ -103,7 +101,7 @@ check_pil_encoder('zlib', 'PNG')
 parser = cmdline.get_argparse(
    prog="ocrmypdf",
    description="Generate searchable PDF file from an image-only PDF file.",
-    version='3.0',
+    version=VERSION,
    fromfile_prefix_chars='@',
    ignored_args=[
        'touch_files_only', 'recreate_database', 'checksum_file_name',
@@ -176,7 +174,7 @@ advanced.add_argument(
    '--tesseract-config', default=[], type=list, action='append',
    help="additional Tesseract configuration files")
 advanced.add_argument(
-    '--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr',
+    '--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
    help='choose OCR PDF renderer')
 advanced.add_argument(
    '--tesseract-timeout', default=180.0, type=float,
@@ -218,6 +216,8 @@ if not set(options.language).issubset(tesseract.languages()):
 # ----------
 # Arguments

+if options.pdf_renderer == 'auto':
+    options.pdf_renderer = 'hocr'

 if any((options.deskew, options.clean, options.clean_final)):
    try:
@@ -756,6 +756,11 @@ def generate_postscript_stub(
    if options.subject:
        pdfmark['subject'] = options.subject

+    pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
+            parser.prog, VERSION,
+            '+PDF' if options.pdf_renderer == 'tesseract' else '',
+            tesseract.version())
+
    generate_pdfa_def(output_file, pdfmark)


--- a/ocrmypdf/pageinfo.py
+++ b/ocrmypdf/pageinfo.py
@@ -104,6 +104,28 @@ def _find_page_images(page, pageinfo):
        yield image


+def _page_has_text(pdf, page):  # NOTE(review): pdf is not used in this body
+    # Simple test: any non-whitespace output from extractText() counts as text
+    text = page.extractText()
+    if text.strip() != '':
+        return True
+
+    # More nuanced test to deal with quirks of Tesseract PDF generation:
+    # check if any font on the page has a /BaseFont ending in 'GlyphLessFont'
+    try:
+        font = page['/Resources']['/Font']
+    except KeyError:  # no /Resources or no /Font entry: nothing more to check
+        pass
+    else:
+        font_objects = list(font.keys())
+        for font_object in font_objects:
+            basefont = font[font_object]['/BaseFont']  # NOTE(review): assumes every font has /BaseFont - confirm
+            if basefont.endswith('GlyphLessFont'):
+                return True
+
+    return False  # neither extractable text nor a GlyphLessFont was found
+
+
 def _pdf_get_pageinfo(infile, page: int):
    pageinfo = {}
    pageinfo['pageno'] = page
@@ -112,8 +134,7 @@ def _pdf_get_pageinfo(infile, page: int):
    pdf = pypdf.PdfFileReader(infile)
    page = pdf.pages[page - 1]

-    text = page.extractText()
-    pageinfo['has_text'] = (text.strip() != '')
+    pageinfo['has_text'] = _page_has_text(pdf, page)

    width_pt = page['/MediaBox'][2] - page['/MediaBox'][0]
    height_pt = page['/MediaBox'][3] - page['/MediaBox'][1]
--- a/ocrmypdf/pdfa.py
+++ b/ocrmypdf/pdfa.py
@@ -27,6 +27,7 @@ def
  /Author <$author>
  /Subject <$subject>
  /Keywords <$keywords>
+  /Creator <$creator>
  /DOCINFO pdfmark

 % Define an ICC profile :
@@ -86,6 +87,7 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
                          title=pdfmark_utf16.get('title', ''),
                          author=pdfmark_utf16.get('author', ''),
                          subject=pdfmark_utf16.get('subject', ''),
+                          creator=pdfmark_utf16.get('creator', ''),
                          keywords=pdfmark_utf16.get('keywords', ''))
    return result