mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2025-12-23 22:28:05 -05:00
123 lines
3.0 KiB
Python
123 lines
3.0 KiB
Python
# SPDX-FileCopyrightText: 2025 James R. Barlow
|
|
# SPDX-License-Identifier: MPL-2.0
|
|
|
|
from __future__ import annotations
|
|
|
|
import pikepdf
|
|
import pytest
|
|
|
|
from .conftest import check_ocrmypdf
|
|
|
|
# Rectangles assigned to a page's MediaBox/CropBox by the tests in this module.
# NOTE(review): presumably in PDF [llx, lly, urx, ury] order -- confirm.

# Extent 612 x 792 with its lower-left corner at the origin.
page_rect = [0, 0, 612, 792]
# Same upper-right corner as page_rect, lower-left corner moved to (200, 200).
inset_rect = [200, 200, 612, 792]
# Same extent as inset_rect (412 x 592), but starting at the origin.
wh_rect = [0, 0, 412, 592]

# Same extent as page_rect (612 x 792), shifted by (-100, -100).
neg_rect = [-100, -100, 512, 692]
|
|
|
|
# Parameter rows for test_media_box, in the order:
#   (renderer, output_type, in_pdf, mode, crop_to, crop_expected)
# ``mode`` is either None or one extra command line flag.
mediabox_testdata = [
    # No extra flag: PDF/A rows expect wh_rect, plain PDF rows expect the
    # MediaBox to come back unchanged.
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, wh_rect),
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # --force-ocr: both output types expect the zero-origin rectangle.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, wh_rect),
    # --force-ocr with a negative-origin MediaBox expects page_rect.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', neg_rect, page_rect),
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', mediabox_testdata
)
def test_media_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the first page's MediaBox after processing a re-boxed input.

    The first page of ``resources / in_pdf`` gets its MediaBox set to
    ``crop_to`` and is saved as ``cropped.pdf`` in ``outdir``. That file is
    passed through ``check_ocrmypdf`` with the given renderer and output type
    (plus ``mode`` as an extra flag when it is truthy), and the first page of
    the resulting ``processed.pdf`` must have a MediaBox equal to
    ``crop_expected``.
    """
    cropped_path = outdir / 'cropped.pdf'
    processed_path = outdir / 'processed.pdf'

    # Prepare the input: same document, different MediaBox on page one.
    with pikepdf.open(resources / in_pdf) as source:
        source.pages[0].MediaBox = crop_to
        source.save(cropped_path)

    cli_options = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
    ]
    if mode:
        cli_options += [mode]

    check_ocrmypdf(cropped_path, processed_path, *cli_options)

    with pikepdf.open(processed_path) as result:
        assert result.pages[0].MediaBox == crop_expected
|
|
|
|
|
|
# Parameter rows for test_crop_box, in the order:
#   (renderer, output_type, in_pdf, mode, crop_to, crop_expected)
# Every row expects the CropBox to come back equal to the one that was set.
cropbox_testdata = [
    # No extra flag.
    ('hocr', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdfa', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    ('sandwich', 'pdf', 'ccitt.pdf', None, inset_rect, inset_rect),
    # With --force-ocr.
    ('hocr', 'pdfa', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
    ('hocr', 'pdf', 'ccitt.pdf', '--force-ocr', inset_rect, inset_rect),
]
|
|
|
|
|
|
@pytest.mark.parametrize(
    'renderer, output_type, in_pdf, mode, crop_to, crop_expected', cropbox_testdata
)
def test_crop_box(
    resources, outdir, renderer, output_type, in_pdf, mode, crop_to, crop_expected
):
    """Check the first page's CropBox after processing a cropped input.

    The first page of ``resources / in_pdf`` gets its CropBox set to
    ``crop_to`` and is saved as ``cropped.pdf`` in ``outdir``. That file is
    passed through ``check_ocrmypdf`` with the given renderer and output type
    and ``--optimize 0`` (plus ``mode`` as an extra flag when it is truthy),
    and the first page of the resulting ``processed.pdf`` must have a CropBox
    equal to ``crop_expected``.
    """
    cropped_path = outdir / 'cropped.pdf'
    processed_path = outdir / 'processed.pdf'

    # Prepare the input: same document, with a CropBox on page one.
    with pikepdf.open(resources / in_pdf) as source:
        source.pages[0].CropBox = crop_to
        source.save(cropped_path)

    cli_options = [
        '--jobs',
        '1',
        '--pdf-renderer',
        renderer,
        '--output-type',
        output_type,
        '--optimize',
        '0',
    ]
    if mode:
        cli_options += [mode]

    check_ocrmypdf(cropped_path, processed_path, *cli_options)

    with pikepdf.open(processed_path) as result:
        assert result.pages[0].CropBox == crop_expected
|