Replace GPLv3-derived PDF/A template with PostScript generator

2026-02-15 08:42:25 -05:00 · 2020-08-05 01:30:31 -07:00
parent aa0ec40102
commit 8c90f7c972
1 changed files with 50 additions and 29 deletions
--- a/src/ocrmypdf/pdfa.py
+++ b/src/ocrmypdf/pdfa.py
@@ -1,4 +1,4 @@
-# © 2015 James R. Barlow: github.com/jbarlow83
+# © 2020 James R. Barlow: github.com/jbarlow83
 #
 # This Source Code Form is subject to the terms of the Mozilla Public
 # License, v. 2.0. If a copy of the MPL was not distributed with this
@@ -11,8 +11,7 @@ Utilities for PDF/A production and confirmation with Ghostscript.

 import base64
 from pathlib import Path
-from string import Template
-from typing import Dict, Union
+from typing import Dict, Iterator, Union

 import pikepdf
 import pkg_resources
@@ -22,29 +21,56 @@ ICC_PROFILE_RELPATH = 'data/sRGB.icc'
 SRGB_ICC_PROFILE = pkg_resources.resource_filename('ocrmypdf', ICC_PROFILE_RELPATH)


-# This is a template written in PostScript which is needed to create PDF/A
-# files, from the Ghostscript documentation. Lines beginning with % are
-# comments. Python substitution variables have a '$' prefix.
-pdfa_def_template = u"""%!
-% Define an ICC profile :
-/ICCProfile $icc_profile
-def
+def _postscript_objdef(
+    alias: str,
+    dictionary: Dict[str, str],
+    *,
+    stream_name: str = None,
+    stream_data: bytes = None,
+) -> Iterator[str]:
+    assert (stream_name is None) == (stream_data is None)

-[/_objdef {icc_PDFA} /type /stream /OBJ pdfmark
-[{icc_PDFA} << /N 3 >> /PUT pdfmark
-[{icc_PDFA} ICCProfile /PUT pdfmark
+    objtype = '/stream' if stream_name else '/dict'

-% Define the output intent dictionary :
+    if stream_name:
+        a85_data = base64.a85encode(stream_data, adobe=True).decode('ascii')
+        yield f'{stream_name} ' + a85_data
+        yield 'def'

-[/_objdef {OutputIntent_PDFA} /type /dict /OBJ pdfmark
-[{OutputIntent_PDFA} <<
-  /Type /OutputIntent             % Must be so (the standard requires).
-  /S /GTS_PDFA1                   % Must be so (the standard requires).
-  /DestOutputProfile {icc_PDFA}            % Must be so (see above).
-  /OutputConditionIdentifier ($icc_identifier)
->> /PUT pdfmark
-[{Catalog} <</OutputIntents [ {OutputIntent_PDFA} ]>> /PUT pdfmark
-"""
+    if alias != '{Catalog}':  # Catalog needs no definition
+        yield f'[/_objdef {alias} /type {objtype} /OBJ pdfmark'
+
+    yield f'[{alias} <<'
+    for key, val in dictionary.items():
+        yield f'  {key} {val}'
+    yield '>> /PUT pdfmark'
+
+    if stream_name:
+        yield f'[{alias} {stream_name[1:]} /PUT pdfmark'
+
+
+def _make_postscript(icc_name: str, icc_data: bytes, colors: int) -> Iterator[str]:
+    yield '%!'
+    yield from _postscript_objdef(
+        '{icc_PDFA}',  # Not an f-string
+        {'/N': str(colors)},
+        stream_name='/ICCProfile',
+        stream_data=icc_data,
+    )
+    yield ''
+    yield from _postscript_objdef(
+        '{OutputIntent_PDFA}',
+        {
+            '/Type': '/OutputIntent',
+            '/S': '/GTS_PDFA1',
+            '/DestOutputProfile': '{icc_PDFA}',
+            '/OutputConditionIdentifier': f'({icc_name})',  # Only f-string
+        },
+    )
+    yield ''
+    yield from _postscript_objdef(
+        '{Catalog}', {'/OutputIntents': '[ {OutputIntent_PDFA} ]'}
+    )


 def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
@@ -75,13 +101,8 @@ def generate_pdfa_ps(target_filename: Path, icc: str = 'sRGB'):
    else:
        raise NotImplementedError("Only supporting sRGB")

-    # Read the ICC profile, encode as ASCII85 and convert to a string which we
-    # will insert in the .ps file
    bytes_icc_profile = Path(icc_profile).read_bytes()
-    icc_profile = base64.a85encode(bytes_icc_profile, adobe=True).decode('ascii')
-
-    t = Template(pdfa_def_template)
-    ps = t.substitute(icc_profile=icc_profile, icc_identifier=icc)
+    ps = '\n'.join(_make_postscript(icc, bytes_icc_profile, 3))

    # We should have encoded everything to pure ASCII by this point, and
    # to be safe, only allow ASCII in PostScript