Don't generate PDF/A-1b with object streams

Acrobat insists that PDF/A-1b files must not contain object streams. Other validators, such as veraPDF, do not enforce this restriction, but we can accommodate Acrobat, so we do. Also add tests that check object stream usage and PDF/A status for each PDF/A level.
2026-05-04 12:48:02 -04:00 · 2021-02-26 00:23:57 -08:00
parent a23c22b0e8
commit 4124889f36
2 changed files with 54 additions and 6 deletions
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -739,6 +739,24 @@ def should_linearize(working_file: Path, context: PdfContext):
    return False


+def get_pdf_save_settings(output_type: str):
+    if output_type == 'pdfa-1':
+        # Disable object streams and trigger recompression so that none remain,
+        # because Acrobat rejects object streams when validating PDF/A-1b.
+        return dict(
+            preserve_pdfa=True,
+            compress_streams=True,
+            stream_decode_level=pikepdf.StreamDecodeLevel.generalized,
+            object_stream_mode=pikepdf.ObjectStreamMode.disable,
+        )
+    else:
+        return dict(
+            preserve_pdfa=True,
+            compress_streams=True,
+            object_stream_mode=(pikepdf.ObjectStreamMode.generate),
+        )
+
+
 def metadata_fixup(working_file: Path, context: PdfContext):
    output_file = context.get_path('metafix.pdf')
    options = context.options
@@ -783,9 +801,7 @@ def metadata_fixup(working_file: Path, context: PdfContext):

        pdf.save(
            output_file,
-            compress_streams=True,
-            preserve_pdfa=True,
-            object_stream_mode=pikepdf.ObjectStreamMode.generate,
+            **get_pdf_save_settings(options.output_type),
            linearize=(  # Don't linearize if optimize() will be linearizing too
                should_linearize(working_file, context)
                if options.optimize == 0
@@ -799,10 +815,8 @@ def metadata_fixup(working_file: Path, context: PdfContext):
 def optimize_pdf(input_file: Path, context: PdfContext):
    output_file = context.get_path('optimize.pdf')
    save_settings = dict(
-        compress_streams=True,
-        preserve_pdfa=True,
-        object_stream_mode=pikepdf.ObjectStreamMode.generate,
        linearize=should_linearize(input_file, context),
+        **get_pdf_save_settings(context.options.output_type),
    )
    optimize(input_file, output_file, context, save_settings)
    return output_file
--- a/tests/test_pdfa.py
+++ b/tests/test_pdfa.py
@@ -0,0 +1,34 @@
+# © 2021 James R. Barlow: github.com/jbarlow83
+#
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+import pikepdf
+import pytest
+
+check_ocrmypdf = pytest.helpers.check_ocrmypdf
+
+
+@pytest.mark.parametrize('optimize', (0, 3))
+@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
+def test_pdfa(resources, outpdf, optimize, pdfa_level):
+    check_ocrmypdf(
+        resources / 'francais.pdf',
+        outpdf,
+        '--plugin',
+        'tests/plugins/tesseract_noop.py',
+        f'--output-type=pdfa-{pdfa_level}',
+        f'--optimize={optimize}',
+    )
+    if pdfa_level in (2, 3):
+        # PDF/A-2 allows ObjStm; level 2 and 3 output is expected to contain it
+        assert b'/ObjStm' in outpdf.read_bytes()
+    elif pdfa_level == 1:
+        # PDF/A-1 might allow ObjStm, but Acrobat rejects it during PDF/A-1b
+        # validation, so the output must not contain any
+        assert b'/ObjStm' not in outpdf.read_bytes()
+
+    with pikepdf.open(outpdf) as pdf:
+        with pdf.open_metadata() as m:
+            assert m.pdfa_status == f'{pdfa_level}B'