mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-23 22:28:05 -05:00
Test and fix page box issues
This commit is contained in:
@@ -549,7 +549,9 @@ def rasterize(
|
||||
|
||||
device = colorspaces[device_idx]
|
||||
|
||||
log.debug(f"Rasterize with {device}, rotation {correction}")
|
||||
log.debug(
|
||||
f"Rasterize with {device}, rotation {correction}, mediabox {pageinfo.mediabox}"
|
||||
)
|
||||
|
||||
canvas_dpi, page_dpi = calculate_raster_dpi(page_context)
|
||||
|
||||
@@ -840,7 +842,7 @@ def fix_pagepdf_boxes(
|
||||
|
||||
The single page PDF is created with a normal MediaBox with its lower left corner
|
||||
at (0, 0). infile is the single page PDF. page_context.mediabox has the original
|
||||
file's mediabox, which may have a different origin. We needto adjust the other
|
||||
file's mediabox, which may have a different origin. We need to adjust the other
|
||||
boxes in the single page PDF to match the effect they had on the original page.
|
||||
|
||||
When correcting page rotation, we create a single page PDF that is correctly
|
||||
@@ -856,16 +858,20 @@ def fix_pagepdf_boxes(
|
||||
for page in pdf.pages:
|
||||
# page.BleedBox = page_context.pageinfo.bleedbox
|
||||
# page.ArtBox = page_context.pageinfo.artbox
|
||||
log.debug(
|
||||
f"initial mediabox={page.MediaBox} and pageinfo mediabox={page_context.pageinfo.mediabox}"
|
||||
)
|
||||
mediabox = page_context.pageinfo.mediabox
|
||||
offset = mediabox[0], mediabox[1]
|
||||
offset = -mediabox[0], -mediabox[1]
|
||||
cropbox = _offset_rect(page_context.pageinfo.cropbox, offset)
|
||||
trimbox = _offset_rect(page_context.pageinfo.trimbox, offset)
|
||||
|
||||
if swap_axis:
|
||||
cropbox = cropbox[1], cropbox[0], cropbox[3], cropbox[2]
|
||||
trimbox = trimbox[1], trimbox[0], trimbox[3], trimbox[2]
|
||||
mediabox = mediabox[1], mediabox[0], mediabox[3], mediabox[2]
|
||||
page.CropBox = cropbox
|
||||
page.TrimBox = trimbox
|
||||
log.debug(f"cropbox={cropbox}, trimbox={trimbox}, mediabox={mediabox}")
|
||||
pdf.save(out_file)
|
||||
return out_file
|
||||
|
||||
|
||||
122
tests/test_page_boxes.py
Normal file
122
tests/test_page_boxes.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
||||
# SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pikepdf
|
||||
import pytest
|
||||
|
||||
from .conftest import check_ocrmypdf
|
||||
|
||||
# Page box rectangles used by the tests in this module, as four-number lists.
# NOTE(review): presumably [left, bottom, right, top] in PDF units -- confirm.

# Box anchored at the origin.
page_rect = [0, 0, 612, 792]
# Same upper-right corner as page_rect, lower-left corner moved to (200, 200).
inset_rect = [200, 200, 612, 792]
# Same extent as inset_rect (412 x 592), but anchored at the origin.
wh_rect = [0, 0, 412, 592]

# Same extent as page_rect (612 x 792), lower-left corner at (-100, -100).
neg_rect = [-100, -100, 512, 692]
|
||||
|
||||
# Parametrization cases for test_media_box. Each tuple is
# (renderer, output_type, in_pdf, mode, crop_to, crop_expected), where crop_to
# is the MediaBox written to the input page and crop_expected is the MediaBox
# the processed page must have.
mediabox_testdata = [
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the MediaBox of the output after processing a re-boxed page.

    The first page of ``in_pdf`` is given ``crop_to`` as its MediaBox and saved
    as ``cropped.pdf`` in ``outdir``. That file is run through
    ``check_ocrmypdf`` with the requested renderer and output type (plus the
    extra ``mode`` flag when one is given), and the first page of the result
    must have a MediaBox equal to ``crop_expected``.
    """
    with pikepdf.open(resources / in_pdf) as pdf:
        page = pdf.pages[0]
        page.MediaBox = crop_to
        pdf.save(outdir / 'cropped.pdf')

    args = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        # mode is None for the default run, or a single extra CLI flag.
        args.append(mode)

    check_ocrmypdf(outdir / 'cropped.pdf', outdir / 'processed.pdf', *args)

    with pikepdf.open(outdir / 'processed.pdf') as pdf:
        page = pdf.pages[0]
        assert page.MediaBox == crop_expected
|
||||
|
||||
|
||||
# Parametrization cases for test_crop_box. Each tuple is
# (renderer, output_type, in_pdf, mode, crop_to, crop_expected), where crop_to
# is the CropBox written to the input page and crop_expected is the CropBox
# the processed page must have. Every case expects the CropBox unchanged.
cropbox_testdata = [
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the CropBox of the output after processing a page with a CropBox.

    The first page of ``in_pdf`` is given ``crop_to`` as its CropBox and saved
    as ``cropped.pdf`` in ``outdir``. That file is run through
    ``check_ocrmypdf`` with the requested renderer and output type (plus the
    extra ``mode`` flag when one is given), and the first page of the result
    must have a CropBox equal to ``crop_expected``.

    All files are written under ``outdir`` only; nothing is saved to the
    current working directory.
    """
    with pikepdf.open(resources / in_pdf) as pdf:
        page = pdf.pages[0]
        page.CropBox = crop_to
        pdf.save(outdir / 'cropped.pdf')

    args = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        # mode is None for the default run, or a single extra CLI flag.
        args.append(mode)

    check_ocrmypdf(outdir / 'cropped.pdf', outdir / 'processed.pdf', *args)

    with pikepdf.open(outdir / 'processed.pdf') as pdf:
        page = pdf.pages[0]
        assert page.CropBox == crop_expected
|
||||
Reference in New Issue
Block a user