Files
OCRmyPDF/tests/test_hocrtransform.py
James R. Barlow bbd263ff48 Add tests for fpdf2 renderer and font infrastructure
- Add hOCR test fixtures for Latin, Arabic, CJK, Devanagari scripts
- Add tests for fpdf2 renderer, multi-font manager, system font provider
- Add multilingual rendering tests
- Update existing tests to use fpdf2 renderer
2026-01-06 13:46:11 -08:00

125 lines
3.5 KiB
Python

# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
from __future__ import annotations
import re
from io import StringIO
from pathlib import Path
import pytest
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfparser import PDFParser
from PIL import Image
from ocrmypdf._exec.tesseract import generate_hocr
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import Fpdf2PdfRenderer
from ocrmypdf.helpers import check_pdf
from ocrmypdf.hocrtransform import HocrParser
from .conftest import check_ocrmypdf
def text_from_pdf(filename):
    """Extract the text of every page of a PDF using pdfminer.

    Args:
        filename: Path of the PDF file to open (read in binary mode).

    Returns:
        Everything pdfminer's ``TextConverter`` wrote while processing the
        document's pages, as a single string.
    """
    text_buffer = StringIO()

    # The converter writes into text_buffer; the interpreter drives it
    # page by page through the shared resource manager.
    resource_manager = PDFResourceManager()
    converter = TextConverter(resource_manager, text_buffer, laparams=LAParams())
    page_interpreter = PDFPageInterpreter(resource_manager, converter)

    with open(filename, 'rb') as pdf_stream:
        document = PDFDocument(PDFParser(pdf_stream))
        for pdf_page in PDFPage.create_pages(document):
            page_interpreter.process_page(pdf_page)

    return text_buffer.getvalue()
# pylint: disable=redefined-outer-name
@pytest.fixture
def font_dir():
    """Return the ``src/ocrmypdf/data`` directory, located relative to this file."""
    tests_dir = Path(__file__).parent
    project_root = tests_dir.parent
    return project_root / "src" / "ocrmypdf" / "data"
@pytest.fixture
def multi_font_manager(font_dir):
    """Provide a ``MultiFontManager`` constructed from the ``font_dir`` fixture."""
    manager = MultiFontManager(font_dir)
    return manager
@pytest.fixture
def blank_hocr(tmp_path):
    """Generate an hOCR file from a uniform 8x8 one-bit image.

    An 8x8 mode ``'1'`` image with every pixel set to 0 is saved as
    ``blank.tif`` in ``tmp_path`` and passed to ``generate_hocr``, which is
    asked to write ``blank.hocr`` and ``blank.txt`` alongside it.

    Returns:
        Path of ``blank.hocr`` inside ``tmp_path``.
    """
    tif_path = tmp_path / 'blank.tif'
    hocr_path = tmp_path / 'blank.hocr'
    text_path = tmp_path / 'blank.txt'

    Image.new('1', (8, 8), 0).save(tif_path, format='TIFF')

    generate_hocr(
        input_file=tif_path,
        output_hocr=hocr_path,
        output_text=text_path,
        languages=['eng'],
        engine_mode=1,
        tessconfig=[],
        pagesegmode=3,
        thresholding=0,
        user_words=None,
        user_patterns=None,
        timeout=None,
    )
    return hocr_path
def test_mono_image(blank_hocr, outdir, multi_font_manager):
    """Render the blank hOCR page to a PDF with fpdf2 and validate the PDF."""
    tif_path = outdir / 'mono.tif'
    pdf_path = outdir / 'mono.pdf'

    # 8x8 one-bit image with the main diagonal set to 1. It is written to
    # outdir; nothing further down in this test reads it back.
    diagonal = Image.new('1', (8, 8), 0)
    for i in range(8):
        diagonal.putpixel((i, i), 1)
    diagonal.save(tif_path, format='TIFF')

    # Parse the hOCR produced by the blank_hocr fixture.
    page = HocrParser(str(blank_hocr)).parse()

    # Use the DPI from the parsed page, falling back to 8 when it is falsy.
    resolution = page.dpi or 8

    # Render to PDF using fpdf2, then check the output file.
    pdf_renderer = Fpdf2PdfRenderer(
        page=page,
        dpi=resolution,
        multi_font_manager=multi_font_manager,
        invisible_text=True,
    )
    pdf_renderer.render(pdf_path)
    check_pdf(pdf_path)
@pytest.mark.slow
def test_fpdf2_matches_sandwich(resources, outdir):
    """Check that the fpdf2 and sandwich renderers yield nearly the same words."""
    source_pdf = resources / 'ccitt.pdf'
    fpdf2_pdf = outdir / 'fpdf2.pdf'
    sandwich_pdf = outdir / 'tess.pdf'

    # Note: hocr renderer now redirects to fpdf2
    check_ocrmypdf(source_pdf, fpdf2_pdf, '--pdf-renderer=fpdf2')
    check_ocrmypdf(source_pdf, sandwich_pdf, '--pdf-renderer=sandwich')

    def word_set(pdf_path):
        # Splitting on whitespace runs gives the set of distinct tokens;
        # leading/trailing whitespace contributes an empty-string token.
        return set(re.split(r'\s+', text_from_pdf(pdf_path)))

    fpdf2_words = word_set(fpdf2_pdf)
    tess_words = word_set(sandwich_pdf)

    # Spacing and word order may differ slightly between renderers, so
    # compare the word sets: shared words over all words seen in either PDF.
    shared = fpdf2_words & tess_words
    combined = fpdf2_words | tess_words
    similarity = len(shared) / len(combined)
    assert similarity > 0.99