mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-04 12:48:02 -04:00
Don't generate PDF/A-1b with object streams
Acrobat insists that PDF/A-1b files must not contain object streams. Other programs, such as veraPDF, disagree with this restriction, but we can accommodate Acrobat, so we will. Also add more tests around this.
This commit is contained in:
@@ -739,6 +739,24 @@ def should_linearize(working_file: Path, context: PdfContext):
|
||||
return False
|
||||
|
||||
|
||||
def get_pdf_save_settings(output_type: str):
    """Return the keyword arguments to use when saving the output PDF.

    Args:
        output_type: The requested output type string, e.g. ``'pdfa-1'``.

    Returns:
        A dict of save settings. For ``'pdfa-1'`` object streams are
        disabled; for every other output type they are generated.
    """
    if output_type != 'pdfa-1':
        return {
            'preserve_pdfa': True,
            'compress_streams': True,
            'object_stream_mode': pikepdf.ObjectStreamMode.generate,
        }

    # Acrobat complains about object streams in PDF/A-1b validation, so
    # trigger recompression to ensure they are removed.
    return {
        'preserve_pdfa': True,
        'compress_streams': True,
        'stream_decode_level': pikepdf.StreamDecodeLevel.generalized,
        'object_stream_mode': pikepdf.ObjectStreamMode.disable,
    }
|
||||
|
||||
|
||||
def metadata_fixup(working_file: Path, context: PdfContext):
|
||||
output_file = context.get_path('metafix.pdf')
|
||||
options = context.options
|
||||
@@ -783,9 +801,7 @@ def metadata_fixup(working_file: Path, context: PdfContext):
|
||||
|
||||
pdf.save(
|
||||
output_file,
|
||||
compress_streams=True,
|
||||
preserve_pdfa=True,
|
||||
object_stream_mode=pikepdf.ObjectStreamMode.generate,
|
||||
**get_pdf_save_settings(options.output_type),
|
||||
linearize=( # Don't linearize if optimize() will be linearizing too
|
||||
should_linearize(working_file, context)
|
||||
if options.optimize == 0
|
||||
@@ -799,10 +815,8 @@ def metadata_fixup(working_file: Path, context: PdfContext):
|
||||
def optimize_pdf(input_file: Path, context: PdfContext):
    """Optimize ``input_file`` and return the path of the optimized PDF.

    Args:
        input_file: Path of the PDF to optimize.
        context: Processing context; supplies the output path and the
            options (``options.output_type``) that select save settings.

    Returns:
        The path obtained from ``context.get_path('optimize.pdf')``.
    """
    output_file = context.get_path('optimize.pdf')
    # Compression, PDF/A preservation and object stream mode all come from
    # get_pdf_save_settings() so they follow the requested output type.
    # Passing them explicitly as well would duplicate those keyword
    # arguments and hard-code object stream generation.
    save_settings = {
        'linearize': should_linearize(input_file, context),
        **get_pdf_save_settings(context.options.output_type),
    }
    optimize(input_file, output_file, context, save_settings)
    return output_file
|
||||
|
||||
34
tests/test_pdfa.py
Normal file
34
tests/test_pdfa.py
Normal file
@@ -0,0 +1,34 @@
|
||||
# © 2021 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This Source Code Form is subject to the terms of the Mozilla Public
|
||||
# License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
|
||||
|
||||
import pikepdf
|
||||
import pytest
|
||||
|
||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||
|
||||
|
||||
@pytest.mark.parametrize('optimize', (0, 3))
@pytest.mark.parametrize('pdfa_level', (1, 2, 3))
def test_pdfa(resources, outpdf, optimize, pdfa_level):
    """Check object stream usage and PDF/A status for each PDF/A level."""
    cli_args = (
        '--plugin',
        'tests/plugins/tesseract_noop.py',
        f'--output-type=pdfa-{pdfa_level}',
        f'--optimize={optimize}',
    )
    check_ocrmypdf(resources / 'francais.pdf', outpdf, *cli_args)

    objstm_marker = b'/ObjStm'
    if pdfa_level == 1:
        # PDF/A-1 might allow ObjStm, but Acrobat does not approve it, so
        # we don't use it
        assert objstm_marker not in outpdf.read_bytes()
    elif pdfa_level in (2, 3):
        # PDF/A-2 allows ObjStm
        assert objstm_marker in outpdf.read_bytes()

    expected_status = f'{pdfa_level}B'
    with pikepdf.open(outpdf) as pdf, pdf.open_metadata() as meta:
        assert meta.pdfa_status == expected_status
|
||||
Reference in New Issue
Block a user