OCRmyPDF/tests/test_json_serialization.py

"""Test JSON serialization of OCROptions for multiprocessing compatibility."""

import multiprocessing
from io import BytesIO
from pathlib import Path

from ocrmypdf._options import OCROptions


def worker_function(options_json: str) -> str:
    """Worker function that deserializes OCROptions from JSON and returns a result."""
    # Reconstruct OCROptions from JSON in worker process
    options = OCROptions.model_validate_json_safe(options_json)

    # Verify we can access various option types
    result = {
        'input_file': str(options.input_file),
        'output_file': str(options.output_file),
        'languages': options.languages,
        'optimize': options.optimize,
        'tesseract_timeout': options.tesseract_timeout,
        'fast_web_view': options.fast_web_view,
        'extra_attrs_count': len(options.extra_attrs),
    }

    # Return as JSON string
    import json

    return json.dumps(result)


def test_json_serialization_multiprocessing():
    """Test that OCROptions can be JSON serialized and used in multiprocessing."""
    # Create OCROptions with various field types
    options = OCROptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng', 'deu'],
        optimize=2,
        tesseract_timeout=120.0,
        fast_web_view=2.5,
        deskew=True,
        clean=False,
    )

    # Add some extra attributes
    options.extra_attrs['custom_field'] = 'test_value'
    options.extra_attrs['numeric_field'] = 42

    # Serialize to JSON
    options_json = options.model_dump_json_safe()

    # Test that we can deserialize in the main process
    reconstructed = OCROptions.model_validate_json_safe(options_json)
    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages
    assert reconstructed.optimize == options.optimize
    assert reconstructed.tesseract_timeout == options.tesseract_timeout
    assert reconstructed.fast_web_view == options.fast_web_view
    assert reconstructed.deskew == options.deskew
    assert reconstructed.clean == options.clean
    assert reconstructed.extra_attrs == options.extra_attrs

    # Test multiprocessing with JSON serialization
    with multiprocessing.Pool(processes=2) as pool:
        # Send the JSON string to worker processes
        results = pool.map(worker_function, [options_json, options_json])

    # Verify results from worker processes
    import json

    for result_json in results:
        result = json.loads(result_json)
        assert result['input_file'] == '/test/input.pdf'
        assert result['output_file'] == '/test/output.pdf'
        assert result['languages'] == ['eng', 'deu']
        assert result['optimize'] == 2
        assert result['tesseract_timeout'] == 120.0
        assert result['fast_web_view'] == 2.5
        assert result['extra_attrs_count'] == 2  # Includes lossless_reconstruction


def test_json_serialization_with_streams():
    """Test JSON serialization with stream objects."""
    input_stream = BytesIO(b'fake pdf data')
    output_stream = BytesIO()

    options = OCROptions(
        input_file=input_stream,
        output_file=output_stream,
        languages=['eng'],
        optimize=1,
    )

    # Serialize to JSON (streams should be converted to placeholders)
    options_json = options.model_dump_json_safe()

    # Deserialize (streams will be placeholder strings)
    reconstructed = OCROptions.model_validate_json_safe(options_json)

    # Streams should be converted to placeholder strings
    assert reconstructed.input_file == 'stream'
    assert reconstructed.output_file == 'stream'
    assert reconstructed.languages == ['eng']
    assert reconstructed.optimize == 1


def test_json_serialization_with_none_values():
    """Test JSON serialization handles None values correctly."""
    options = OCROptions(
        input_file=Path('/test/input.pdf'),
        output_file=Path('/test/output.pdf'),
        languages=['eng'],
        # Many fields will be None by default
    )

    # Serialize to JSON
    options_json = options.model_dump_json_safe()

    # Deserialize
    reconstructed = OCROptions.model_validate_json_safe(options_json)

    # Verify None values are preserved (check actual defaults from model)
    assert reconstructed.tesseract_timeout == 0.0  # Default value, not None
    assert reconstructed.fast_web_view == 1.0  # Default value, not None
    assert (
        reconstructed.color_conversion_strategy == "LeaveColorUnchanged"
    )  # Default value
    assert reconstructed.pdfa_image_compression is None  # This one is actually None

    # Verify non-None values are preserved
    assert reconstructed.input_file == options.input_file
    assert reconstructed.output_file == options.output_file
    assert reconstructed.languages == options.languages