Files
OCRmyPDF/tests/test_fpdf_renderer.py
2026-01-27 14:01:30 -08:00

540 lines
17 KiB
Python

# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Tests for fpdf2-based PDF renderer."""
from __future__ import annotations
from pathlib import Path
import pytest
from ocrmypdf.font import MultiFontManager
from ocrmypdf.fpdf_renderer import (
DebugRenderOptions,
Fpdf2MultiPageRenderer,
Fpdf2PdfRenderer,
)
from ocrmypdf.hocrtransform.hocr_parser import HocrParser
from ocrmypdf.models.ocr_element import OcrClass
@pytest.fixture
def font_dir() -> Path:
    """Provide the path to the font directory.

    The path is resolved relative to this module: two directory levels up
    from the module file, then ``src/ocrmypdf/data``.
    """
    module_dir = Path(__file__).parent
    parent_dir = module_dir.parent
    return parent_dir.joinpath("src", "ocrmypdf", "data")
@pytest.fixture
def multi_font_manager(font_dir):
    """Build a MultiFontManager for the tests in this module.

    Args:
        font_dir: Font directory, supplied by the ``font_dir`` fixture.

    Returns:
        A MultiFontManager constructed from ``font_dir``.
    """
    manager = MultiFontManager(font_dir)
    return manager
@pytest.fixture
def resources() -> Path:
    """Provide the path to the test resources directory.

    This is the ``resources`` directory located next to this module.
    """
    module_dir = Path(__file__).parent
    return module_dir.joinpath("resources")
class TestFpdf2RendererImports:
    """Smoke test: the fpdf2 renderer package exposes its public names."""

    def test_imports(self):
        """Import the renderer names locally and check each one is set."""
        from ocrmypdf.fpdf_renderer import (
            DebugRenderOptions,
            Fpdf2MultiPageRenderer,
            Fpdf2PdfRenderer,
        )

        imported = (
            DebugRenderOptions,
            Fpdf2PdfRenderer,
            Fpdf2MultiPageRenderer,
        )
        for obj in imported:
            assert obj is not None
class TestDebugRenderOptions:
    """Check the flags carried by DebugRenderOptions."""

    # The three debug flags exercised by both tests below.
    _FLAG_NAMES = ("render_baseline", "render_line_bbox", "render_word_bbox")

    def test_defaults(self):
        """Every flag is exactly ``False`` when no arguments are given."""
        opts = DebugRenderOptions()
        for flag_name in self._FLAG_NAMES:
            assert getattr(opts, flag_name) is False

    def test_custom_values(self):
        """Flags passed as ``True`` are stored and read back as ``True``."""
        enabled = dict.fromkeys(self._FLAG_NAMES, True)
        opts = DebugRenderOptions(**enabled)
        for flag_name in self._FLAG_NAMES:
            assert getattr(opts, flag_name) is True
class TestFpdf2PdfRenderer:
    """Exercise Fpdf2PdfRenderer argument validation and page rendering."""

    @staticmethod
    def _one_word_page(text, right):
        """Build a 612x792 page containing one line with a single word.

        Args:
            text: Text of the word.
            right: Right edge used for both the word and line bounding boxes.

        Returns:
            The page-level OcrElement.
        """
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        def text_box():
            # Word and line get separate box instances with equal coordinates.
            return BoundingBox(left=100, top=100, right=right, bottom=130)

        word = OcrElement(ocr_class=OcrClass.WORD, text=text, bbox=text_box())
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=text_box(),
            children=[word],
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
            children=[line],
        )

    @staticmethod
    def _assert_pdf_written(pdf_path):
        """Assert that ``pdf_path`` exists and is not empty."""
        assert pdf_path.exists()
        assert pdf_path.stat().st_size > 0

    def test_requires_page_element(self, multi_font_manager):
        """A root element that is not an ocr_page raises ValueError."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        # A word-level element stands in for the (invalid) root.
        not_a_page = OcrElement(
            ocr_class=OcrClass.WORD,
            text="test",
            bbox=BoundingBox(left=0, top=0, right=100, bottom=20),
        )
        renderer_kwargs = {
            "page": not_a_page,
            "dpi": 300,
            "multi_font_manager": multi_font_manager,
        }
        with pytest.raises(ValueError, match="Root element must be ocr_page"):
            Fpdf2PdfRenderer(**renderer_kwargs)

    def test_requires_bbox(self, multi_font_manager):
        """A page element without a bounding box raises ValueError."""
        from ocrmypdf.models.ocr_element import OcrElement

        boxless_page = OcrElement(ocr_class=OcrClass.PAGE)
        renderer_kwargs = {
            "page": boxless_page,
            "dpi": 300,
            "multi_font_manager": multi_font_manager,
        }
        with pytest.raises(ValueError, match="Page must have bounding box"):
            Fpdf2PdfRenderer(**renderer_kwargs)

    def test_render_simple_page(self, multi_font_manager, tmp_path):
        """Rendering a one-word page with visible text writes a PDF."""
        output_path = tmp_path / "test_simple.pdf"
        page = self._one_word_page("Hello", right=200)

        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72,  # 1:1 mapping to PDF points
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)

        self._assert_pdf_written(output_path)

    def test_render_invisible_text(self, multi_font_manager, tmp_path):
        """Rendering with invisible text (the OCR layer mode) writes a PDF."""
        output_path = tmp_path / "test_invisible.pdf"
        page = self._one_word_page("Invisible", right=250)

        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=72,
            multi_font_manager=multi_font_manager,
            # Passed explicitly; reportedly the renderer's default --
            # verify against Fpdf2PdfRenderer.
            invisible_text=True,
        )
        renderer.render(output_path)

        self._assert_pdf_written(output_path)
class TestFpdf2MultiPageRenderer:
    """Test Fpdf2MultiPageRenderer."""

    def test_requires_pages(self, multi_font_manager, tmp_path):
        """An empty ``pages_data`` list must be rejected with ValueError.

        The output path lives under pytest's ``tmp_path`` instead of a
        hard-coded ``/tmp`` location, so the test is portable across
        platforms and cannot leave a stray file outside its sandbox if the
        expected error ever stops being raised.
        """
        # Construction and render() both stay inside the context manager:
        # this file does not show which of the two performs the check.
        with pytest.raises(ValueError, match="No pages to render"):
            renderer = Fpdf2MultiPageRenderer(
                pages_data=[],
                multi_font_manager=multi_font_manager,
            )
            renderer.render(tmp_path / "test.pdf")

    def test_render_multiple_pages(self, multi_font_manager, tmp_path):
        """Rendering three one-word pages produces a non-empty PDF file."""
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        pages_data = []
        for page_number in range(1, 4):
            word = OcrElement(
                ocr_class=OcrClass.WORD,
                text=f"Page{page_number}",
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
            )
            line = OcrElement(
                ocr_class=OcrClass.LINE,
                bbox=BoundingBox(left=100, top=100, right=200, bottom=130),
                children=[word],
            )
            page = OcrElement(
                ocr_class=OcrClass.PAGE,
                bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
                children=[line],
            )
            # Each entry is (page_number, page_element, dpi).
            pages_data.append((page_number, page, 72))

        renderer = Fpdf2MultiPageRenderer(
            pages_data=pages_data,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        output_path = tmp_path / "test_multipage.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0
class TestFpdf2RendererWithHocr:
    """Render real hOCR resource files through the fpdf2 renderers."""

    @staticmethod
    def _parse_or_skip(resources, filename):
        """Parse ``resources / filename``, skipping the test if it is absent.

        Args:
            resources: Directory containing the hOCR test resources.
            filename: Name of the hOCR file to load.

        Returns:
            The element returned by ``HocrParser.parse()``.
        """
        hocr_path = resources / filename
        if not hocr_path.exists():
            pytest.skip(f"{filename} not found")
        return HocrParser(hocr_path).parse()

    @staticmethod
    def _render_visible(page, multi_font_manager, output_path):
        """Render ``page`` at 300 DPI with visible text and check the file."""
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=300,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)
        assert output_path.exists()
        assert output_path.stat().st_size > 0

    def test_render_latin_hocr(self, resources, multi_font_manager, tmp_path):
        """Render Latin text parsed from ``latin.hocr``."""
        page = self._parse_or_skip(resources, "latin.hocr")

        # The parser must hand back a page element with a bounding box.
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None

        self._render_visible(
            page, multi_font_manager, tmp_path / "latin_fpdf2.pdf"
        )

    def test_render_cjk_hocr(self, resources, multi_font_manager, tmp_path):
        """Render CJK text parsed from ``cjk.hocr``."""
        page = self._parse_or_skip(resources, "cjk.hocr")
        self._render_visible(
            page, multi_font_manager, tmp_path / "cjk_fpdf2.pdf"
        )

    def test_render_arabic_hocr(self, resources, multi_font_manager, tmp_path):
        """Render Arabic text parsed from ``arabic.hocr``."""
        page = self._parse_or_skip(resources, "arabic.hocr")
        self._render_visible(
            page, multi_font_manager, tmp_path / "arabic_fpdf2.pdf"
        )

    def test_render_hello_world_scripts_hocr(
        self, resources, multi_font_manager, tmp_path
    ):
        """Render the comprehensive multilingual 'Hello!' hOCR file.

        The file covers all major scripts, including:

        - Latin (English, Spanish, French, German, Italian, Polish,
          Portuguese, Turkish)
        - Cyrillic (Russian)
        - Greek
        - CJK (Chinese Simplified, Chinese Traditional, Japanese, Korean)
        - Devanagari (Hindi)
        - Arabic (RTL)
        - Hebrew (RTL)

        It also includes rotated baselines to exercise skew handling.
        """
        page = self._parse_or_skip(resources, "hello_world_scripts.hocr")

        # Confirm the page parsed correctly before rendering.
        assert page.ocr_class == OcrClass.PAGE
        assert page.bbox is not None
        # Expected size: 2550x3300 at 300 DPI.
        assert page.bbox.right == 2550
        assert page.bbox.bottom == 3300

        # Visible text so the output can be inspected by eye.
        self._render_visible(
            page, multi_font_manager, tmp_path / "hello_world_scripts_fpdf2.pdf"
        )

    def test_render_hello_world_scripts_multipage(
        self, resources, multi_font_manager, tmp_path
    ):
        """Render ``hello_world_scripts.hocr`` via Fpdf2MultiPageRenderer.

        Feeds the multilingual test file to the multi-page renderer as a
        single-page document, demonstrating font handling across all major
        writing systems.
        """
        page = self._parse_or_skip(resources, "hello_world_scripts.hocr")

        # Entries are (page_number, page_element, dpi).
        single_page = (1, page, 300)
        renderer = Fpdf2MultiPageRenderer(
            pages_data=[single_page],
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        output_path = tmp_path / "hello_world_scripts_multipage.pdf"
        renderer.render(output_path)

        assert output_path.exists()
        assert output_path.stat().st_size > 0
class TestWordSegmentation:
    """Check word segmentation of rendered PDFs as seen by pdfminer.six."""

    @staticmethod
    def _single_line_page(word_specs):
        """Build a 612x792 page with one line made of the given words.

        Args:
            word_specs: Sequence of ``(text, left, right)`` tuples in reading
                order. Every word spans top=100 to bottom=130.

        Returns:
            The page-level OcrElement. The line's bounding box runs from the
            first word's left edge to the last word's right edge.
        """
        from ocrmypdf.models.ocr_element import BoundingBox, OcrElement

        words = [
            OcrElement(
                ocr_class=OcrClass.WORD,
                text=text,
                bbox=BoundingBox(left=left, top=100, right=right, bottom=130),
            )
            for text, left, right in word_specs
        ]
        line_left = word_specs[0][1]
        line_right = word_specs[-1][2]
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            bbox=BoundingBox(
                left=line_left, top=100, right=line_right, bottom=130
            ),
            children=words,
        )
        return OcrElement(
            ocr_class=OcrClass.PAGE,
            bbox=BoundingBox(left=0, top=0, right=612, bottom=792),
            children=[line],
        )

    @staticmethod
    def _render_visible(page, dpi, multi_font_manager, output_path):
        """Render ``page`` with visible text to ``output_path``."""
        renderer = Fpdf2PdfRenderer(
            page=page,
            dpi=dpi,
            multi_font_manager=multi_font_manager,
            invisible_text=False,
        )
        renderer.render(output_path)

    def test_word_segmentation_with_pdfminer(self, multi_font_manager, tmp_path):
        """pdfminer.six must extract space-separated words.

        Verifies that explicit space characters are inserted between words
        so that pdfminer.six (and similar PDF readers) can segment words
        properly during text extraction.
        """
        from pdfminer.high_level import extract_text

        expected_words = ("Hello", "World", "Test")
        # Three words on one line.
        page = self._single_line_page(
            [("Hello", 100, 200), ("World", 220, 320), ("Test", 340, 420)]
        )
        output_path = tmp_path / "test_word_segmentation.pdf"
        # dpi=72 gives a 1:1 mapping to PDF points.
        self._render_visible(page, 72, multi_font_manager, output_path)

        extracted_text = extract_text(str(output_path))

        # Every word appears somewhere in the extracted text...
        for expected in expected_words:
            assert expected in extracted_text
        # ...and adjacent words are NOT run together ("HelloWorldTest").
        for fused in ("HelloWorld", "WorldTest"):
            assert fused not in extracted_text

        # Each word is also a standalone whitespace-delimited token
        # (tolerating variations in the whitespace itself).
        tokens = extracted_text.split()
        for expected in expected_words:
            assert expected in tokens

    def test_cjk_no_spurious_spaces(self, multi_font_manager, tmp_path):
        """CJK text must not get spurious spaces inserted.

        CJK scripts don't use spaces between characters/words, so no spaces
        should be inserted between adjacent CJK words.
        """
        from pdfminer.high_level import extract_text

        # 你好 = "Hello" in Chinese, 世界 = "World" in Chinese.
        page = self._single_line_page([("你好", 100, 160), ("世界", 170, 230)])
        output_path = tmp_path / "test_cjk_segmentation.pdf"
        self._render_visible(page, 72, multi_font_manager, output_path)

        extracted_text = extract_text(str(output_path))

        # Both CJK words must be present.
        assert "你好" in extracted_text
        assert "世界" in extracted_text

        # pdfminer may add whitespace of its own, so compare with spaces and
        # newlines removed.
        extracted_chars = extracted_text.replace(" ", "").replace("\n", "")
        contiguous = "你好世界" in extracted_chars
        both_present = "你好" in extracted_chars and "世界" in extracted_chars
        # NOTE(review): `both_present` is already implied by the two
        # assertions above, so this check cannot fail on its own. Consider
        # requiring `contiguous` once the extraction output is confirmed.
        assert contiguous or both_present

    def test_latin_hocr_word_segmentation(
        self, resources, multi_font_manager, tmp_path
    ):
        """Word segmentation check against the real Latin hOCR file."""
        from pdfminer.high_level import extract_text

        hocr_path = resources / "latin.hocr"
        if not hocr_path.exists():
            pytest.skip("latin.hocr not found")
        page = HocrParser(hocr_path).parse()

        output_path = tmp_path / "latin_segmentation.pdf"
        self._render_visible(page, 300, multi_font_manager, output_path)

        extracted_text = extract_text(str(output_path))

        # At least one whitespace-delimited word must come back.
        assert extracted_text.split()

        # With newlines folded to spaces, the text must contain a space:
        # words are separated rather than stuck together.
        flattened = extracted_text.replace("\n", " ")
        assert " " in flattened