Files
OCRmyPDF/tests/test_ocr_element.py
James R. Barlow b4f9673364 Add unit tests for HocrParser, PdfTextRenderer, and OcrElement
Comprehensive test coverage for the new hocrtransform components:

- test_ocr_element.py: Tests for BoundingBox, Baseline, FontInfo,
  OcrElement dataclass methods (iter_by_class, find_by_class,
  get_text_recursive, words/lines/paragraphs properties)

- test_hocr_parser.py: Tests for parsing hOCR files including
  page/paragraph/line/word extraction, RTL text, rotated text,
  different line types (header, caption), font info, and edge cases

- test_pdf_renderer.py: Tests for PDF rendering including text
  extraction verification, page sizing, multi-line content,
  text direction, baseline handling, textangle rotation, word breaks,
  debug options, and image overlay

Also fixes x_font regex pattern to not capture trailing semicolons.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-21 17:05:49 -08:00

245 lines
8.3 KiB
Python

# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for OcrElement dataclass and related classes."""
from __future__ import annotations
import pytest
from ocrmypdf.hocrtransform import (
Baseline,
BoundingBox,
FontInfo,
OcrClass,
OcrElement,
)
class TestBoundingBox:
    """Checks BoundingBox construction, derived size, and edge validation."""

    def test_basic_creation(self):
        """The four edges given to the constructor are readable afterwards."""
        box = BoundingBox(left=10, top=20, right=100, bottom=50)
        edges = (box.left, box.top, box.right, box.bottom)
        assert edges == (10, 20, 100, 50)

    def test_width_height(self):
        """width and height are the differences between opposite edges."""
        box = BoundingBox(left=10, top=20, right=110, bottom=70)
        assert (box.width, box.height) == (100, 50)

    def test_zero_size_box(self):
        """A box whose opposite edges coincide is accepted with zero size."""
        box = BoundingBox(left=10, top=20, right=10, bottom=20)
        assert (box.width, box.height) == (0, 0)

    def test_invalid_left_right(self):
        """right smaller than left must be rejected with a ValueError."""
        swapped_horizontal = dict(left=100, top=20, right=10, bottom=50)
        with pytest.raises(ValueError, match="right.*left"):
            BoundingBox(**swapped_horizontal)

    def test_invalid_top_bottom(self):
        """bottom smaller than top must be rejected with a ValueError."""
        swapped_vertical = dict(left=10, top=50, right=100, bottom=20)
        with pytest.raises(ValueError, match="bottom.*top"):
            BoundingBox(**swapped_vertical)
class TestBaseline:
    """Checks the default and explicit field values of Baseline."""

    def test_defaults(self):
        """A Baseline built without arguments has zero slope and intercept."""
        default_baseline = Baseline()
        assert default_baseline.slope == 0.0
        assert default_baseline.intercept == 0.0

    def test_with_values(self):
        """Explicit slope and intercept are stored as given."""
        custom_baseline = Baseline(slope=0.01, intercept=-5)
        assert custom_baseline.slope == 0.01
        assert custom_baseline.intercept == -5
class TestFontInfo:
    """Checks the default and explicit field values of FontInfo."""

    def test_defaults(self):
        """Without arguments, name/size are None and the style flags are False."""
        info = FontInfo()
        assert info.name is None
        assert info.size is None
        assert info.bold is False
        assert info.italic is False

    def test_with_values(self):
        """Supplied fields are stored; the omitted italic flag stays False."""
        info = FontInfo(name="Arial", size=12.0, bold=True)
        assert info.name == "Arial"
        assert info.size == 12.0
        assert info.bold is True
        assert info.italic is False
class TestOcrElement:
    """Checks OcrElement fields, tree traversal helpers, and text extraction."""

    # --- construction -----------------------------------------------------

    def test_minimal_element(self):
        """Only class and text supplied: bbox is None and children is empty."""
        element = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert element.ocr_class == "ocrx_word"
        assert element.text == "hello"
        assert element.bbox is None
        assert element.children == []

    def test_element_with_bbox(self):
        """A bounding box passed at construction is exposed via .bbox."""
        box = BoundingBox(left=0, top=0, right=100, bottom=50)
        element = OcrElement(ocr_class=OcrClass.LINE, bbox=box)
        assert element.bbox == box
        assert element.bbox.width == 100

    def test_element_hierarchy(self):
        """Nested children are reachable level by level from the page."""
        first_word = OcrElement(ocr_class=OcrClass.WORD, text="Hello")
        second_word = OcrElement(ocr_class=OcrClass.WORD, text="World")
        line = OcrElement(
            ocr_class=OcrClass.LINE, children=[first_word, second_word]
        )
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, children=[line])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        # Walk down one level at a time: page -> paragraph -> line -> words.
        page_children = page.children
        assert len(page_children) == 1
        paragraph_children = page_children[0].children
        assert len(paragraph_children) == 1
        assert len(paragraph_children[0].children) == 2

    # --- iter_by_class / find_by_class -------------------------------------

    def test_iter_by_class_single(self):
        """A single nested word is found when searching from the page."""
        only_word = OcrElement(ocr_class=OcrClass.WORD, text="test")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[only_word])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        matches = page.iter_by_class(OcrClass.WORD)
        assert len(matches) == 1
        assert matches[0].text == "test"

    def test_iter_by_class_multiple(self):
        """All sibling words are returned, in the order they were added."""
        word_elements = [
            OcrElement(ocr_class=OcrClass.WORD, text=label)
            for label in ("one", "two", "three")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=word_elements)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        matches = page.iter_by_class(OcrClass.WORD)
        assert len(matches) == 3
        assert [match.text for match in matches] == ["one", "two", "three"]

    def test_iter_by_class_multiple_types(self):
        """Passing two classes selects elements of either class."""
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            children=[
                OcrElement(ocr_class=OcrClass.LINE),
                OcrElement(ocr_class=OcrClass.HEADER),
                OcrElement(ocr_class=OcrClass.CAPTION),
            ],
        )

        # The caption is the only child that matches neither requested class.
        matches = page.iter_by_class(OcrClass.LINE, OcrClass.HEADER)
        assert len(matches) == 2

    def test_find_by_class(self):
        """find_by_class returns the nested word element."""
        target = OcrElement(ocr_class=OcrClass.WORD, text="found")
        line = OcrElement(ocr_class=OcrClass.LINE, children=[target])
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        match = page.find_by_class(OcrClass.WORD)
        assert match is not None
        assert match.text == "found"

    def test_find_by_class_not_found(self):
        """find_by_class returns None when no element has the class."""
        empty_line = OcrElement(ocr_class=OcrClass.LINE)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[empty_line])

        assert page.find_by_class(OcrClass.WORD) is None

    # --- text extraction ----------------------------------------------------

    def test_get_text_recursive_leaf(self):
        """A childless word yields its own text."""
        leaf = OcrElement(ocr_class=OcrClass.WORD, text="hello")
        assert leaf.get_text_recursive() == "hello"

    def test_get_text_recursive_nested(self):
        """A line of two words yields both texts separated by one space."""
        children = [
            OcrElement(ocr_class=OcrClass.WORD, text="Hello"),
            OcrElement(ocr_class=OcrClass.WORD, text="World"),
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=children)
        assert line.get_text_recursive() == "Hello World"

    # --- convenience properties --------------------------------------------

    def test_words_property(self):
        """page.words exposes the words nested under the page's line."""
        word_elements = [
            OcrElement(ocr_class=OcrClass.WORD, text=label)
            for label in ("a", "b")
        ]
        line = OcrElement(ocr_class=OcrClass.LINE, children=word_elements)
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[line])

        assert len(page.words) == 2
        assert page.words[0].text == "a"

    def test_lines_property(self):
        """page.lines counts a header alongside an ordinary line."""
        plain_line = OcrElement(ocr_class=OcrClass.LINE)
        header_line = OcrElement(ocr_class=OcrClass.HEADER)  # Also a line type
        paragraph = OcrElement(
            ocr_class=OcrClass.PARAGRAPH, children=[plain_line, header_line]
        )
        page = OcrElement(ocr_class=OcrClass.PAGE, children=[paragraph])

        assert len(page.lines) == 2

    def test_paragraphs_property(self):
        """page.paragraphs counts both paragraph children."""
        paragraph_pair = [
            OcrElement(ocr_class=OcrClass.PARAGRAPH),
            OcrElement(ocr_class=OcrClass.PARAGRAPH),
        ]
        page = OcrElement(ocr_class=OcrClass.PAGE, children=paragraph_pair)

        assert len(page.paragraphs) == 2

    # --- simple attribute round-trips ---------------------------------------

    def test_direction_ltr(self):
        """A left-to-right direction is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="ltr")
        assert paragraph.direction == "ltr"

    def test_direction_rtl(self):
        """A right-to-left direction is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, direction="rtl")
        assert paragraph.direction == "rtl"

    def test_language(self):
        """The language code is stored as given."""
        paragraph = OcrElement(ocr_class=OcrClass.PARAGRAPH, language="eng")
        assert paragraph.language == "eng"

    def test_baseline(self):
        """A Baseline attached to a line keeps its slope and intercept."""
        line = OcrElement(
            ocr_class=OcrClass.LINE,
            baseline=Baseline(slope=0.01, intercept=-3),
        )
        assert line.baseline.slope == 0.01
        assert line.baseline.intercept == -3

    def test_textangle(self):
        """The text angle is stored as given."""
        line = OcrElement(ocr_class=OcrClass.LINE, textangle=5.0)
        assert line.textangle == 5.0

    def test_confidence(self):
        """The confidence value is stored as given."""
        word = OcrElement(ocr_class=OcrClass.WORD, confidence=0.95)
        assert word.confidence == 0.95

    def test_page_properties(self):
        """Page-level dpi and page numbers are stored as given."""
        page = OcrElement(
            ocr_class=OcrClass.PAGE,
            dpi=300.0,
            page_number=0,
            logical_page_number=1,
        )
        assert page.dpi == 300.0
        assert page.page_number == 0
        assert page.logical_page_number == 1
class TestOcrClass:
    """Checks the string constants and LINE_TYPES membership on OcrClass."""

    def test_class_values(self):
        """Each constant equals its expected hOCR class-name string."""
        expected_pairs = [
            (OcrClass.PAGE, "ocr_page"),
            (OcrClass.PARAGRAPH, "ocr_par"),
            (OcrClass.LINE, "ocr_line"),
            (OcrClass.WORD, "ocrx_word"),
            (OcrClass.HEADER, "ocr_header"),
            (OcrClass.CAPTION, "ocr_caption"),
        ]
        for constant, expected in expected_pairs:
            assert constant == expected

    def test_line_types_frozenset(self):
        """LINE_TYPES contains line, header, and caption, but not word."""
        for line_like in (OcrClass.LINE, OcrClass.HEADER, OcrClass.CAPTION):
            assert line_like in OcrClass.LINE_TYPES
        assert OcrClass.WORD not in OcrClass.LINE_TYPES