Files
OCRmyPDF/tests/test_null_ocr_engine.py
James R. Barlow 0c3745a1a4 Add OCR engine selection framework and null OCR engine
Introduce --ocr-engine option to select between OCR engines:
- 'auto' (default): Uses Tesseract
- 'tesseract': Explicit Tesseract selection
- 'none': Skip OCR entirely (for PDF processing only)

Key changes:
- Extend OcrEngine ABC with generate_ocr() and supports_generate_ocr()
  for direct OcrElement tree output (bypasses hOCR)
- Add get_ocr_engine(options) hook parameter for engine selection
- Implement NullOcrEngine for --ocr-engine none
- Export OcrElement, OcrClass, BoundingBox from ocrmypdf package
- Add ocr_tree support to grafting pipeline

This prepares the foundation for pluggable OCR engines while maintaining
full backward compatibility with existing Tesseract-based workflows.
2026-01-12 10:11:14 -08:00

170 lines
5.7 KiB
Python

# SPDX-FileCopyrightText: 2025 James R. Barlow
# SPDX-License-Identifier: MPL-2.0
"""Unit tests for NullOcrEngine (--ocr-engine none).
Tests verify that the Null OCR engine exists and functions correctly
for scenarios where users want PDF processing without OCR.
"""
from __future__ import annotations
from pathlib import Path
from unittest.mock import MagicMock
import pytest
class TestNullOcrEngineExists:
    """Smoke tests: the null OCR plugin and its engine class can be imported."""

    def test_null_ocr_module_importable(self):
        """Importing ``null_ocr`` from the builtin plugins yields an object."""
        from ocrmypdf.builtin_plugins import null_ocr as plugin_module

        # The import itself is the real check; a missing module raises
        # ImportError before this assertion is reached.
        assert plugin_module is not None

    def test_null_ocr_engine_class_exists(self):
        """The ``null_ocr`` module exposes a ``NullOcrEngine`` name."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as engine_cls

        assert engine_cls is not None
class TestNullOcrEngineInterface:
    """Exercise the OcrEngine interface methods as provided by NullOcrEngine.

    Every method is called directly on the class (no instance is created),
    with a ``MagicMock`` standing in for the options argument where one is
    required.
    """

    def test_version_returns_none(self):
        """version() reports the string 'none' (not the ``None`` object)."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        reported = Engine.version()
        assert reported == "none"

    def test_creator_tag(self):
        """creator_tag() mentions, case-insensitively, that no OCR was done."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        options = MagicMock()
        lowered = Engine.creator_tag(options).lower()
        # Any one of these markers is accepted as signalling "no OCR".
        markers = ("no ocr", "null", "none")
        assert any(marker in lowered for marker in markers)

    def test_languages_returns_empty_set(self):
        """languages() reports no supported languages at all."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        options = MagicMock()
        found = Engine.languages(options)
        expected = set()
        assert found == expected

    def test_supports_generate_ocr_returns_true(self):
        """supports_generate_ocr() returns exactly ``True``."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        supported = Engine.supports_generate_ocr()
        assert supported is True

    def test_get_orientation_returns_zero(self):
        """get_orientation() yields a result whose ``angle`` is 0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        # The path is never created on disk by this test.
        image = Path("test.png")
        orientation = Engine.get_orientation(image, MagicMock())
        assert orientation.angle == 0

    def test_get_deskew_returns_zero(self):
        """get_deskew() yields 0.0."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        # The path is never created on disk by this test.
        image = Path("test.png")
        skew = Engine.get_deskew(image, MagicMock())
        assert skew == 0.0
class TestNullOcrEngineGenerateOcr:
    """Check what NullOcrEngine.generate_ocr() returns for a blank image.

    Each test calls ``generate_ocr`` on the class with the fixture image, a
    ``MagicMock`` in place of options, and ``0`` as the third argument.
    """

    @pytest.fixture
    def sample_image(self, tmp_path):
        """Write a white 100x100 RGB PNG into ``tmp_path`` and return its path."""
        from PIL import Image

        destination = tmp_path / "test.png"
        Image.new('RGB', (100, 100), color='white').save(
            destination, dpi=(300, 300)
        )
        return destination

    def test_generate_ocr_returns_tuple(self, sample_image):
        """The result is a 2-tuple of (OcrElement, str)."""
        from ocrmypdf import OcrElement
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)

        assert isinstance(outcome, tuple)
        assert len(outcome) == 2
        # Safe to unpack: the length was asserted just above.
        tree, text = outcome
        assert isinstance(tree, OcrElement)
        assert isinstance(text, str)

    def test_generate_ocr_returns_empty_text(self, sample_image):
        """The text half of the result is the empty string."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        _, recognized = outcome
        assert recognized == ""

    def test_generate_ocr_returns_page_element(self, sample_image):
        """The element half of the result has ``ocr_class`` PAGE."""
        from ocrmypdf import OcrClass
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        page, _ = outcome
        assert page.ocr_class == OcrClass.PAGE

    def test_generate_ocr_page_has_correct_dimensions(self, sample_image):
        """The page bounding box extends to the image's width and height."""
        from ocrmypdf.builtin_plugins.null_ocr import NullOcrEngine as Engine

        outcome = Engine.generate_ocr(sample_image, MagicMock(), 0)
        page, _ = outcome

        # The fixture image is 100x100.
        box = page.bbox
        assert box.right == 100
        assert box.bottom == 100
class TestOcrEngineOption:
    """Test the --ocr-engine CLI option as exposed by the argument parser."""

    @staticmethod
    def _find_ocr_engine_action(parser):
        """Return the parser action registered for ``--ocr-engine``.

        Fails the calling test when no such action exists, so that a missing
        option can never be mistaken for a passing check.
        """
        # Scan the parser's action list for the first action carrying the
        # option string; _actions is a private attribute of the parser.
        for action in parser._actions:
            if '--ocr-engine' in action.option_strings:
                return action
        pytest.fail("--ocr-engine option not found")

    def test_ocr_engine_option_accepted(self):
        """CLI should accept --ocr-engine option."""
        from ocrmypdf.cli import get_parser

        parser = get_parser()
        # Parsing must complete without raising for the value 'none'.
        args = parser.parse_args(['--ocr-engine', 'none', 'in.pdf', 'out.pdf'])
        assert args.ocr_engine == 'none'

    def test_ocr_engine_choices_include_none(self):
        """--ocr-engine should include 'none' as a choice."""
        from ocrmypdf.cli import get_parser

        action = self._find_ocr_engine_action(get_parser())
        assert 'none' in action.choices

    def test_ocr_engine_choices_include_auto(self):
        """--ocr-engine should include 'auto' as a choice and as its default."""
        from ocrmypdf.cli import get_parser

        # Previously this test looped without a for/else, so it passed
        # vacuously when the option was absent; the helper now fails instead.
        action = self._find_ocr_engine_action(get_parser())
        assert 'auto' in action.choices
        assert action.default == 'auto'