def get_pdf_save_settings(output_type: str):
    """Build the keyword arguments passed to ``pdf.save`` for *output_type*.

    Every output type preserves PDF/A metadata and compresses streams. The
    object stream handling depends on the requested type:

    * ``'pdfa-1'`` disables object streams and sets a generalized stream
      decode level.
    * anything else generates object streams.

    Returns a new ``dict`` on every call, so callers may extend it freely.
    """
    settings = dict(preserve_pdfa=True, compress_streams=True)
    if output_type != 'pdfa-1':
        settings['object_stream_mode'] = pikepdf.ObjectStreamMode.generate
        return settings

    # Trigger recompression to ensure object streams are removed, because
    # Acrobat complains about them in PDF/A-1b validation.
    settings['stream_decode_level'] = pikepdf.StreamDecodeLevel.generalized
    settings['object_stream_mode'] = pikepdf.ObjectStreamMode.disable
    return settings


def optimize_pdf(input_file: Path, context: PdfContext):
    """Optimize *input_file* and return the path of the optimized PDF.

    The output goes to the context's ``optimize.pdf`` path. Save options are
    the output-type settings from ``get_pdf_save_settings`` plus a
    ``linearize`` flag decided by ``should_linearize``.
    """
    optimized_path = context.get_path('optimize.pdf')
    # Evaluate the linearize decision first, then the output-type settings,
    # matching the order in which the keyword arguments were built before.
    linearize = should_linearize(input_file, context)
    save_options = dict(
        linearize=linearize,
        **get_pdf_save_settings(context.options.output_type),
    )
    optimize(input_file, optimized_path, context, save_options)
    return optimized_path
# © 2021 James R. Barlow: github.com/jbarlow83
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

import pikepdf
import pytest

check_ocrmypdf = pytest.helpers.check_ocrmypdf


@pytest.mark.parametrize('optimize', (0, 3))
@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Check object stream usage and PDF/A status for each PDF/A level.

    Runs ocrmypdf on ``francais.pdf`` for every combination of optimization
    setting and PDF/A level. It then verifies that ``/ObjStm`` is absent for
    PDF/A-1 and present for PDF/A-2 and PDF/A-3, and that the output metadata
    reports the requested conformance level.
    """
    cli_args = (
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        f'--output-type=pdfa-{pdfa_level}',
        f'--optimize={optimize}',
    )
    check_ocrmypdf(resources / 'francais.pdf', outpdf, *cli_args)

    if pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert b'/ObjStm' not in outpdf.read_bytes()
    elif pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert b'/ObjStm' in outpdf.read_bytes()

    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as meta:
        assert meta.pdfa_status == f'{pdfa_level}B'