Merge branch 'feature/pdfa-2' into develop

This commit is contained in:
James R. Barlow
2015-12-04 04:04:08 -08:00
5 changed files with 44 additions and 8 deletions

View File

@@ -5,6 +5,14 @@ Please always read this file before installing the package
Download software here: https://github.com/jbarlow83/OCRmyPDF/tags
Latest:
=======
- Default output format is now PDF/A-2b instead of PDF/A-1b
- Python 3.5 and OS X El Capitan are now supported platforms - no changes were
needed to implement support
v3.0:
=====

View File

@@ -44,7 +44,7 @@ def generate_pdfa(pdf_pages, output_file, threads=1):
"-sDEVICE=pdfwrite",
"-sColorConversionStrategy=/RGB",
"-sProcessColorModel=DeviceRGB",
"-dPDFA",
"-dPDFA=2",
"-sPDFACompatibilityPolicy=2",
"-sOutputICCProfile=srgb.icc",
"-sOutputFile=" + gs_pdf.name,

View File

@@ -39,9 +39,7 @@ warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
BASEDIR = os.path.dirname(os.path.realpath(__file__))
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove'))
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
VERSION = '3.0'
# -------------
@@ -103,7 +101,7 @@ check_pil_encoder('zlib', 'PNG')
parser = cmdline.get_argparse(
prog="ocrmypdf",
description="Generate searchable PDF file from an image-only PDF file.",
version='3.0',
version=VERSION,
fromfile_prefix_chars='@',
ignored_args=[
'touch_files_only', 'recreate_database', 'checksum_file_name',
@@ -176,7 +174,7 @@ advanced.add_argument(
'--tesseract-config', default=[], type=list, action='append',
help="additional Tesseract configuration files")
advanced.add_argument(
'--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr',
'--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
help='choose OCR PDF renderer')
advanced.add_argument(
'--tesseract-timeout', default=180.0, type=float,
@@ -218,6 +216,8 @@ if not set(options.language).issubset(tesseract.languages()):
# ----------
# Arguments
if options.pdf_renderer == 'auto':
options.pdf_renderer = 'hocr'
if any((options.deskew, options.clean, options.clean_final)):
try:
@@ -756,6 +756,11 @@ def generate_postscript_stub(
if options.subject:
pdfmark['subject'] = options.subject
pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
parser.prog, VERSION,
'+PDF' if options.pdf_renderer == 'tesseract' else '',
tesseract.version())
generate_pdfa_def(output_file, pdfmark)

View File

@@ -104,6 +104,28 @@ def _find_page_images(page, pageinfo):
yield image
def _page_has_text(pdf, page):
# Simple test
text = page.extractText()
if text.strip() != '':
return True
# More nuanced test to deal with quirks of Tesseract PDF generation
# Check if there's a Glyphless font
try:
font = page['/Resources']['/Font']
except KeyError:
pass
else:
font_objects = list(font.keys())
for font_object in font_objects:
basefont = font[font_object]['/BaseFont']
if basefont.endswith('GlyphLessFont'):
return True
return False
def _pdf_get_pageinfo(infile, page: int):
pageinfo = {}
pageinfo['pageno'] = page
@@ -112,8 +134,7 @@ def _pdf_get_pageinfo(infile, page: int):
pdf = pypdf.PdfFileReader(infile)
page = pdf.pages[page - 1]
text = page.extractText()
pageinfo['has_text'] = (text.strip() != '')
pageinfo['has_text'] = _page_has_text(pdf, page)
width_pt = page['/MediaBox'][2] - page['/MediaBox'][0]
height_pt = page['/MediaBox'][3] - page['/MediaBox'][1]

View File

@@ -27,6 +27,7 @@ def
/Author <$author>
/Subject <$subject>
/Keywords <$keywords>
/Creator <$creator>
/DOCINFO pdfmark
% Define an ICC profile :
@@ -86,6 +87,7 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
title=pdfmark_utf16.get('title', ''),
author=pdfmark_utf16.get('author', ''),
subject=pdfmark_utf16.get('subject', ''),
creator=pdfmark_utf16.get('creator', ''),
keywords=pdfmark_utf16.get('keywords', ''))
return result