Test and fix page box issues

This commit is contained in:
James R. Barlow
2025-02-11 00:40:01 -08:00
parent 8d715c4157
commit eace567f7b
2 changed files with 132 additions and 4 deletions

View File

@@ -549,7 +549,9 @@ def rasterize(
device = colorspaces[device_idx]
log.debug(f"Rasterize with {device}, rotation {correction}")
log.debug(
f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
)
canvas_dpi, page_dpi = calculate_raster_dpi(page_context)
@@ -840,7 +842,7 @@ def fix_pagepdf_boxes(
The single page PDF is created with a normal MediaBox with its lower left corner
at (0, 0). infile is the single page PDF. page_context.mediabox has the original
file's mediabox, which may have a different origin. We needto adjust the other
file's mediabox, which may have a different origin. We need to adjust the other
boxes in the single page PDF to match the effect they had on the original page.
When correcting page rotation, we create a single page PDF that is correctly
@@ -856,16 +858,20 @@ def fix_pagepdf_boxes(
for page in pdf.pages:
# page.BleedBox = page_context.pageinfo.bleedbox
# page.ArtBox = page_context.pageinfo.artbox
log.debug(
f"initial mediabox={page.MediaBox} and pageinfo mediabox={page_context.pageinfo.mediabox}"
)
mediabox = page_context.pageinfo.mediabox
offset = mediabox[0], mediabox[1]
offset = -mediabox[0], -mediabox[1]
cropbox = _offset_rect(page_context.pageinfo.cropbox, offset)
trimbox = _offset_rect(page_context.pageinfo.trimbox, offset)
if swap_axis:
cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2]
trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2]
mediabox = mediabox[1], mediabox[0], mediabox[3], mediabox[2]
page.CropBox = cropbox
page.TrimBox = trimbox
log.debug(f"cropbox={cropbox}, trimbox={trimbox}, mediabox={mediabox}")
pdf.save(out_file)
return out_file

122
tests/test_page_boxes.py Normal file
View File

@@ -0,0 +1,122 @@
# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import pikepdf
import pytest
from .conftest import check_ocrmypdf
# Rectangles are [llx, lly, urx, ury] lists in PDF units.
page_rect = [0, 0, 612, 792]  # full page, origin at (0, 0)
inset_rect = [200, 200, 612, 792]  # lower-left corner moved to (200, 200)
wh_rect = [0, 0, 412, 592]  # same width/height as inset_rect, origin at (0, 0)
neg_rect = [-100, -100, 512, 692]  # same width/height as page_rect, negative origin

# Each case: (renderer, output_type, in_pdf, mode, crop_to, crop_expected)
mediabox_testdata = [
    # No extra mode, PDF/A output: expected box is re-based to the origin.
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    # No extra mode, plain PDF output: expected box equals the input box.
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr: expected box is re-based to the origin for both output types.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    # --force-ocr with a negative-origin input box.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the MediaBox of the output after processing a re-boxed input.

    The first page of ``in_pdf`` gets ``crop_to`` as its MediaBox, the file is
    passed through ``check_ocrmypdf`` with the given renderer, output type and
    optional extra ``mode`` flag, and the first page of the result must have a
    MediaBox equal to ``crop_expected``.
    """
    source_pdf = outdir / 'cropped.pdf'
    result_pdf = outdir / 'processed.pdf'

    # Prepare the input: same file, different MediaBox on page 1.
    with pikepdf.open(resources / in_pdf) as original:
        original.pages[0].MediaBox = crop_to
        original.save(source_pdf)

    cli_options = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        # mode is None for the cases that need no extra flag.
        cli_options += [mode]

    check_ocrmypdf(source_pdf, result_pdf, *cli_options)

    with pikepdf.open(result_pdf) as processed:
        assert processed.pages[0].MediaBox == crop_expected
# Each case: (renderer, output_type, in_pdf, mode, crop_to, crop_expected).
# Every case expects the CropBox to come out equal to the one that went in.
cropbox_testdata = [
    # No extra mode, both renderers, both output types.
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr with the hocr renderer, both output types.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
]
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the CropBox of the output after processing a cropped input.

    The first page of ``in_pdf`` gets ``crop_to`` as its CropBox, the file is
    passed through ``check_ocrmypdf`` with the given renderer, output type and
    optional extra ``mode`` flag, and the first page of the result must have a
    CropBox equal to ``crop_expected``.

    All files are written under ``outdir`` only; nothing is saved relative to
    the current working directory, so the test leaves no stray PDFs behind.
    """
    with pikepdf.open(resources / in_pdf) as pdf:
        page = pdf.pages[0]
        page.CropBox = crop_to
        pdf.save(outdir / 'cropped.pdf')

    args = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        # mode is None for the cases that need no extra flag.
        args.append(mode)

    check_ocrmypdf(outdir / 'cropped.pdf', outdir / 'processed.pdf', *args)

    with pikepdf.open(outdir / 'processed.pdf') as pdf:
        page = pdf.pages[0]
        assert page.CropBox == crop_expected