mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-18 19:47:48 -04:00
Merge branch 'feature/pdfa-2' into develop
This commit is contained in:
@@ -5,6 +5,14 @@ Please always read this file before installing the package
|
||||
|
||||
Download software here: https://github.com/jbarlow83/OCRmyPDF/tags
|
||||
|
||||
|
||||
Latest:
|
||||
=======
|
||||
|
||||
- Default output format is now PDF/A-2b instead of PDF/A-1b
|
||||
- Python 3.5 and OS X El Capitan are now supported platforms - no changes were
|
||||
needed to implement support
|
||||
|
||||
v3.0:
|
||||
=====
|
||||
|
||||
|
||||
@@ -44,7 +44,7 @@ def generate_pdfa(pdf_pages, output_file, threads=1):
|
||||
"-sDEVICE=pdfwrite",
|
||||
"-sColorConversionStrategy=/RGB",
|
||||
"-sProcessColorModel=DeviceRGB",
|
||||
"-dPDFA",
|
||||
"-dPDFA=2",
|
||||
"-sPDFACompatibilityPolicy=2",
|
||||
"-sOutputICCProfile=srgb.icc",
|
||||
"-sOutputFile=" + gs_pdf.name,
|
||||
|
||||
@@ -39,9 +39,7 @@ warnings.simplefilter('ignore', pypdf.utils.PdfReadWarning)
|
||||
|
||||
|
||||
BASEDIR = os.path.dirname(os.path.realpath(__file__))
|
||||
JHOVE_PATH = os.path.realpath(os.path.join(BASEDIR, 'jhove'))
|
||||
JHOVE_JAR = os.path.join(JHOVE_PATH, 'bin', 'JhoveApp.jar')
|
||||
JHOVE_CFG = os.path.join(JHOVE_PATH, 'conf', 'jhove.conf')
|
||||
VERSION = '3.0'
|
||||
|
||||
|
||||
# -------------
|
||||
@@ -103,7 +101,7 @@ check_pil_encoder('zlib', 'PNG')
|
||||
parser = cmdline.get_argparse(
|
||||
prog="ocrmypdf",
|
||||
description="Generate searchable PDF file from an image-only PDF file.",
|
||||
version='3.0',
|
||||
version=VERSION,
|
||||
fromfile_prefix_chars='@',
|
||||
ignored_args=[
|
||||
'touch_files_only', 'recreate_database', 'checksum_file_name',
|
||||
@@ -176,7 +174,7 @@ advanced.add_argument(
|
||||
'--tesseract-config', default=[], type=list, action='append',
|
||||
help="additional Tesseract configuration files")
|
||||
advanced.add_argument(
|
||||
'--pdf-renderer', choices=['tesseract', 'hocr'], default='hocr',
|
||||
'--pdf-renderer', choices=['auto', 'tesseract', 'hocr'], default='auto',
|
||||
help='choose OCR PDF renderer')
|
||||
advanced.add_argument(
|
||||
'--tesseract-timeout', default=180.0, type=float,
|
||||
@@ -218,6 +216,8 @@ if not set(options.language).issubset(tesseract.languages()):
|
||||
# ----------
|
||||
# Arguments
|
||||
|
||||
if options.pdf_renderer == 'auto':
|
||||
options.pdf_renderer = 'hocr'
|
||||
|
||||
if any((options.deskew, options.clean, options.clean_final)):
|
||||
try:
|
||||
@@ -756,6 +756,11 @@ def generate_postscript_stub(
|
||||
if options.subject:
|
||||
pdfmark['subject'] = options.subject
|
||||
|
||||
pdfmark['creator'] = '{0} {1} / Tesseract OCR{2} {3}'.format(
|
||||
parser.prog, VERSION,
|
||||
'+PDF' if options.pdf_renderer == 'tesseract' else '',
|
||||
tesseract.version())
|
||||
|
||||
generate_pdfa_def(output_file, pdfmark)
|
||||
|
||||
|
||||
|
||||
@@ -104,6 +104,28 @@ def _find_page_images(page, pageinfo):
|
||||
yield image
|
||||
|
||||
|
||||
def _page_has_text(pdf, page):
|
||||
# Simple test
|
||||
text = page.extractText()
|
||||
if text.strip() != '':
|
||||
return True
|
||||
|
||||
# More nuanced test to deal with quirks of Tesseract PDF generation
|
||||
# Check if there's a Glyphless font
|
||||
try:
|
||||
font = page['/Resources']['/Font']
|
||||
except KeyError:
|
||||
pass
|
||||
else:
|
||||
font_objects = list(font.keys())
|
||||
for font_object in font_objects:
|
||||
basefont = font[font_object]['/BaseFont']
|
||||
if basefont.endswith('GlyphLessFont'):
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def _pdf_get_pageinfo(infile, page: int):
|
||||
pageinfo = {}
|
||||
pageinfo['pageno'] = page
|
||||
@@ -112,8 +134,7 @@ def _pdf_get_pageinfo(infile, page: int):
|
||||
pdf = pypdf.PdfFileReader(infile)
|
||||
page = pdf.pages[page - 1]
|
||||
|
||||
text = page.extractText()
|
||||
pageinfo['has_text'] = (text.strip() != '')
|
||||
pageinfo['has_text'] = _page_has_text(pdf, page)
|
||||
|
||||
width_pt = page['/MediaBox'][2] - page['/MediaBox'][0]
|
||||
height_pt = page['/MediaBox'][3] - page['/MediaBox'][1]
|
||||
|
||||
@@ -27,6 +27,7 @@ def
|
||||
/Author <$author>
|
||||
/Subject <$subject>
|
||||
/Keywords <$keywords>
|
||||
/Creator <$creator>
|
||||
/DOCINFO pdfmark
|
||||
|
||||
% Define an ICC profile :
|
||||
@@ -86,6 +87,7 @@ def _get_pdfa_def(icc_profile, icc_identifier, pdfmark):
|
||||
title=pdfmark_utf16.get('title', ''),
|
||||
author=pdfmark_utf16.get('author', ''),
|
||||
subject=pdfmark_utf16.get('subject', ''),
|
||||
creator=pdfmark_utf16.get('creator', ''),
|
||||
keywords=pdfmark_utf16.get('keywords', ''))
|
||||
return result
|
||||
|
||||
|
||||
Reference in New Issue
Block a user