Don't generate PDF/A-1b with object streams

Acrobat insists that PDF/A-1b should not have object streams.
Other programs like veraPDF disagree with this restriction, but
we can accommodate Acrobat so we will.

Also add more tests around this.
This commit is contained in:
James R. Barlow
2021-02-26 00:23:57 -08:00
parent a23c22b0e8
commit 4124889f36
2 changed files with 54 additions and 6 deletions

View File

@@ -739,6 +739,24 @@ def should_linearize(working_file: Path, context: PdfContext):
return False
def get_pdf_save_settings(output_type: str):
    """Build the keyword arguments used when saving a PDF for *output_type*.

    Every output type preserves PDF/A markers and compresses streams. Only
    the treatment of object streams differs:

    * ``'pdfa-1'``: object streams are disabled, and streams are decoded at
      the ``generalized`` level so that they get recompressed.
    * anything else: object streams are generated.

    Returns:
        A dict intended to be splatted (``**``) into the save call.
    """
    settings = dict(preserve_pdfa=True, compress_streams=True)

    if output_type != 'pdfa-1':
        settings['object_stream_mode'] = pikepdf.ObjectStreamMode.generate
        return settings

    # Acrobat complains about object streams during PDF/A-1b validation, so
    # force recompression to make sure any existing ones are removed.
    settings['stream_decode_level'] = pikepdf.StreamDecodeLevel.generalized
    settings['object_stream_mode'] = pikepdf.ObjectStreamMode.disable
    return settings
def metadata_fixup(working_file: Path, context: PdfContext):
output_file = context.get_path('metafix.pdf')
options = context.options
@@ -783,9 +801,7 @@ def metadata_fixup(working_file: Path, context: PdfContext):
pdf.save(
output_file,
compress_streams=True,
preserve_pdfa=True,
object_stream_mode=pikepdf.ObjectStreamMode.generate,
**get_pdf_save_settings(options.output_type),
linearize=( # Don't linearize if optimize() will be linearizing too
should_linearize(working_file, context)
if options.optimize == 0
@@ -799,10 +815,8 @@ def metadata_fixup(working_file: Path, context: PdfContext):
def optimize_pdf(input_file: Path, context: PdfContext):
    """Optimize *input_file* and return the path of the optimized PDF.

    The output is written to the context's ``optimize.pdf`` path. Save
    options come from ``get_pdf_save_settings`` for the requested output
    type, plus the linearization decision from ``should_linearize``.

    Args:
        input_file: PDF to optimize.
        context: Pipeline context supplying options and working paths.

    Returns:
        Path of the optimized output file.
    """
    output_file = context.get_path('optimize.pdf')
    # Do not hard-code compress_streams / preserve_pdfa / object_stream_mode
    # here: get_pdf_save_settings() already supplies them, and repeating a
    # keyword alongside the ** expansion raises TypeError (and would pin
    # object streams on even for PDF/A-1 output).
    save_settings = dict(
        linearize=should_linearize(input_file, context),
        **get_pdf_save_settings(context.options.output_type),
    )
    optimize(input_file, output_file, context, save_settings)
    return output_file

34
tests/test_pdfa.py Normal file
View File

@@ -0,0 +1,34 @@
# © 2021 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
import pikepdf
import pytest
# Module-level shorthand for the check_ocrmypdf helper exposed on
# pytest.helpers (presumably registered by the project's conftest — verify).
check_ocrmypdf = pytest.helpers.check_ocrmypdf
@pytest.mark.parametrize('optimize', (0, 3))
@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Check object-stream usage and declared PDF/A status of the output.

    Processes ``francais.pdf`` through the ``check_ocrmypdf`` helper with the
    no-op Tesseract plugin for each combination of PDF/A level and optimize
    setting, then inspects the resulting file.
    """
    cli_args = (
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        f'--output-type=pdfa-{pdfa_level}',
        f'--optimize={optimize}',
    )
    check_ocrmypdf(resources / 'francais.pdf', outpdf, *cli_args)

    has_object_streams = b'/ObjStm' in outpdf.read_bytes()
    if pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve of it,
        # so the output must not contain any.
        assert not has_object_streams
    elif pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert has_object_streams

    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as meta:
        assert meta.pdfa_status == f'{pdfa_level}B'