Replace GPLv3-derived PDF/A template with PostScript generator

This commit is contained in:
James R. Barlow
2020-08-05 01:30:31 -07:00
parent aa0ec40102
commit 8c90f7c972

View File

@@ -1,4 +1,4 @@
# © 2015 James R. Barlow: github.com/jbarlow83
# © 2020 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,8 +11,7 @@ Utilities for PDF/A production and confirmation with Ghostscript.
import base64
from pathlib import Path
from string import Template
from typing import Dict, Union
from typing import Dict, Iterator, Union
import pikepdf
import pkg_resources
@@ -22,29 +21,56 @@ ICC_PROFILE_RELPATH = 'data/sRGB.icc'
SRGB_ICC_PROFILE = pkg_resources.resource_filename('ocrmypdf', ICC_PROFILE_RELPATH)
# This is a template written in PostScript which is needed to create PDF/A
# files, from the Ghostscript documentation. Lines beginning with % are
# comments. Python substitution variables have a '$' prefix.
pdfa_def_template = u"""%!
% Define an ICC profile :
/ICCProfile $icc_profile
def
def _postscript_objdef(
alias: str,
dictionary: Dict[str, str],
*,
stream_name: str = None,
stream_data: bytes = None,
) -> Iterator[str]:
assert (stream_name is None) == (stream_data is None)
[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
[{icc_PDFA} << /N 3 >> /PUT pdfmark
[{icc_PDFA} ICCProfile /PUT pdfmark
objtype = '/stream' if stream_name else '/dict'
% Define the output intent dictionary :
if stream_name:
a85_data = base64.a85encode(stream_data, adobe=True).decode('ascii')
yield f'{stream_name} ' + a85_data
yield 'def'
[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
[{OutputIntent_PDFA} <<
/Type /OutputIntent % Must be so (the standard requires).
/S /GTS_PDFA1 % Must be so (the standard requires).
/DestOutputProfile {icc_PDFA} % Must be so (see above).
/OutputConditionIdentifier ($icc_identifier)
>> /PUT pdfmark
[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
"""
if alias != '{Catalog}': # Catalog needs no definition
yield f'[/_objdef {alias} /type {objtype} /OBJ pdfmark'
yield f'[{alias} <<'
for key, val in dictionary.items():
yield f' {key} {val}'
yield '>> /PUT pdfmark'
if stream_name:
yield f'[{alias} {stream_name[1:]} /PUT pdfmark'
def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[str]:
    """Yield the lines of a PostScript prologue declaring a PDF/A output intent.

    The output is a ``%!`` header line followed by three pdfmark sections,
    each produced by ``_postscript_objdef`` and separated by an empty line:

    1. the ``{icc_PDFA}`` stream object, given ``icc_data`` as its stream
       data and ``/N`` set to ``colors``;
    2. the ``{OutputIntent_PDFA}`` dictionary, which refers to that stream
       and carries ``icc_name`` as its ``/OutputConditionIdentifier``;
    3. an update to ``{Catalog}`` that lists the output intent.

    Args:
        icc_name: Identifier of the ICC profile; emitted inside parentheses.
        icc_data: Raw bytes of the ICC profile.
        colors: Value written for the ``/N`` key of the ICC stream.

    Yields:
        One line of PostScript at a time, without trailing newlines.
    """
    # The braces in the alias strings are literal PostScript syntax, which is
    # why those are plain strings. Only the identifier is interpolated.
    icc_stream = _postscript_objdef(
        '{icc_PDFA}',
        {'/N': str(colors)},
        stream_name='/ICCProfile',
        stream_data=icc_data,
    )
    output_intent = _postscript_objdef(
        '{OutputIntent_PDFA}',
        {
            '/Type': '/OutputIntent',
            '/S': '/GTS_PDFA1',
            '/DestOutputProfile': '{icc_PDFA}',
            '/OutputConditionIdentifier': f'({icc_name})',
        },
    )
    catalog = _postscript_objdef(
        '{Catalog}', {'/OutputIntents': '[ {OutputIntent_PDFA} ]'}
    )

    yield '%!'
    yield from icc_stream
    # Each later section is preceded by an empty separator line.
    for section in (output_intent, catalog):
        yield ''
        yield from section
def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
@@ -75,13 +101,8 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
else:
raise NotImplementedError("Only supporting sRGB")
# Read the ICC profile, encode as ASCII85 and convert to a string which we
# will insert in the .ps file
bytes_icc_profile = Path(icc_profile).read_bytes()
icc_profile = base64.a85encode(bytes_icc_profile, adobe=True).decode('ascii')
t = Template(pdfa_def_template)
ps = t.substitute(icc_profile=icc_profile, icc_identifier=icc)
ps = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))
# We should have encoded everything to pure ASCII by this point, and
# to be safe, only allow ASCII in PostScript