Files
OCRmyPDF/tests/test_page_boxes.py
James R. Barlow b386d39b3b tests: fix test_page_boxes when verapdf unavailable
The test expected MediaBox preservation for pdfa output, but this only
works when verapdf is available for speculative PDF/A conversion.
Without verapdf (Linux/Windows CI), Ghostscript normalizes the MediaBox.

Also convert pikepdf.Array to list in assertions for clearer error
messages, avoiding pytest repr issues with pikepdf objects.
2026-01-21 00:22:26 -08:00

130 lines
3.4 KiB
Python

# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pikepdf
import pytest
from ocrmypdf._exec import verapdf
from .conftest import check_ocrmypdf
# Rectangles assigned to, or expected from, page boxes in these tests.
page_rect = [0, 0, 612, 792]
inset_rect = [200, 200, 612, 792]
# Same width and height as inset_rect (412 x 592), starting at the origin.
wh_rect = [0, 0, 412, 592]
# Same width and height as page_rect (612 x 792), with a negative lower-left.
neg_rect = [-100, -100, 512, 692]

# Speculative PDF/A conversion needs verapdf. When it succeeds, Ghostscript is
# bypassed and the MediaBox is preserved; otherwise Ghostscript normalizes the
# MediaBox so that it starts at the origin.
if verapdf.available():
    _pdfa_inset_expected = inset_rect
else:
    _pdfa_inset_expected = wh_rect

# Columns: renderer, output_type, in_pdf, mode, crop_to, crop_expected
mediabox_testdata = [
    # No extra mode: PDF/A expectation depends on verapdf availability.
    ('fpdf2', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, _pdfa_inset_expected),
    ('fpdf2', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr: the expected box keeps its size but starts at the origin.
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('fpdf2', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('fpdf2', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the MediaBox of the first output page after processing.

    The first page of ``in_pdf`` gets its MediaBox set to ``crop_to`` and the
    result is saved as ``cropped.pdf``. That file is run through
    ``check_ocrmypdf`` with the given renderer, output type and optional
    ``mode`` flag, and the first page's MediaBox in ``processed.pdf`` must
    equal ``crop_expected``.
    """
    source_pdf = outdir / 'cropped.pdf'
    result_pdf = outdir / 'processed.pdf'

    # Prepare the input: override the MediaBox of page one.
    with pikepdf.open(resources / in_pdf) as doc:
        first_page = doc.pages[0]
        first_page.MediaBox = crop_to
        doc.save(source_pdf)

    # ``mode`` is either None or a single extra command line flag.
    extra_args = [mode] if mode else []
    check_ocrmypdf(
        source_pdf,
        result_pdf,
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        *extra_args,
    )

    with pikepdf.open(result_pdf) as doc:
        first_page = doc.pages[0]
        # Compare as a plain list of floats rather than a pikepdf object.
        assert [float(coord) for coord in first_page.mediabox] == crop_expected
# Columns: renderer, output_type, in_pdf, mode, crop_to, crop_expected
# Every case sets the CropBox to inset_rect and expects it back unchanged;
# only the renderer, output type and mode vary.
cropbox_testdata = [
    (renderer, output_type, 'ccitt.pdf', mode, inset_rect, inset_rect)
    for renderer, output_type, mode in (
        ('fpdf2', 'pdfa', None),
        ('sandwich', 'pdfa', None),
        ('fpdf2', 'pdf', None),
        ('sandwich', 'pdf', None),
        ('fpdf2', 'pdfa', '--force-ocr'),
        ('fpdf2', 'pdf', '--force-ocr'),
    )
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the CropBox of the first output page after processing.

    The first page of ``in_pdf`` gets its CropBox set to ``crop_to`` and the
    result is saved as ``cropped.pdf``. That file is run through
    ``check_ocrmypdf`` with the given renderer, output type, ``--optimize 0``
    and optional ``mode`` flag, and the first page's CropBox in
    ``processed.pdf`` must equal ``crop_expected``.
    """
    source_pdf = outdir / 'cropped.pdf'
    result_pdf = outdir / 'processed.pdf'

    # Prepare the input: override the CropBox of page one.
    with pikepdf.open(resources / in_pdf) as doc:
        first_page = doc.pages[0]
        first_page.CropBox = crop_to
        doc.save(source_pdf)

    # ``mode`` is either None or a single extra command line flag.
    extra_args = [mode] if mode else []
    check_ocrmypdf(
        source_pdf,
        result_pdf,
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--optimize',
        '0',
        *extra_args,
    )

    with pikepdf.open(result_pdf) as doc:
        first_page = doc.pages[0]
        # Compare as a plain list of floats rather than a pikepdf object.
        assert [float(coord) for coord in first_page.cropbox] == crop_expected