Files
OCRmyPDF/tests/test_json_serialization.py
2025-12-21 12:21:48 -08:00

136 lines
4.8 KiB
Python

"""Test JSON serialization of OCROptions for multiprocessing compatibility."""
import multiprocessing
from io import BytesIO
from pathlib import Path
from ocrmypdf._options import OCROptions
def worker_function(options_json: str) -> str:
"""Worker function that deserializes OCROptions from JSON and returns a result."""
# Reconstruct OCROptions from JSON in worker process
options = OCROptions.model_validate_json_safe(options_json)
# Verify we can access various option types
result = {
'input_file': str(options.input_file),
'output_file': str(options.output_file),
'languages': options.languages,
'optimize': options.optimize,
'tesseract_timeout': options.tesseract_timeout,
'fast_web_view': options.fast_web_view,
'extra_attrs_count': len(options.extra_attrs),
}
# Return as JSON string
import json
return json.dumps(result)
def test_json_serialization_multiprocessing():
"""Test that OCROptions can be JSON serialized and used in multiprocessing."""
# Create OCROptions with various field types
options = OCROptions(
input_file=Path('/test/input.pdf'),
output_file=Path('/test/output.pdf'),
languages=['eng', 'deu'],
optimize=2,
tesseract_timeout=120.0,
fast_web_view=2.5,
deskew=True,
clean=False,
)
# Add some extra attributes
options.extra_attrs['custom_field'] = 'test_value'
options.extra_attrs['numeric_field'] = 42
# Serialize to JSON
options_json = options.model_dump_json_safe()
# Test that we can deserialize in the main process
reconstructed = OCROptions.model_validate_json_safe(options_json)
assert reconstructed.input_file == options.input_file
assert reconstructed.output_file == options.output_file
assert reconstructed.languages == options.languages
assert reconstructed.optimize == options.optimize
assert reconstructed.tesseract_timeout == options.tesseract_timeout
assert reconstructed.fast_web_view == options.fast_web_view
assert reconstructed.deskew == options.deskew
assert reconstructed.clean == options.clean
assert reconstructed.extra_attrs == options.extra_attrs
# Test multiprocessing with JSON serialization
with multiprocessing.Pool(processes=2) as pool:
# Send the JSON string to worker processes
results = pool.map(worker_function, [options_json, options_json])
# Verify results from worker processes
import json
for result_json in results:
result = json.loads(result_json)
assert result['input_file'] == '/test/input.pdf'
assert result['output_file'] == '/test/output.pdf'
assert result['languages'] == ['eng', 'deu']
assert result['optimize'] == 2
assert result['tesseract_timeout'] == 120.0
assert result['fast_web_view'] == 2.5
assert result['extra_attrs_count'] == 2 # Includes lossless_reconstruction
def test_json_serialization_with_streams():
"""Test JSON serialization with stream objects."""
input_stream = BytesIO(b'fake pdf data')
output_stream = BytesIO()
options = OCROptions(
input_file=input_stream,
output_file=output_stream,
languages=['eng'],
optimize=1,
)
# Serialize to JSON (streams should be converted to placeholders)
options_json = options.model_dump_json_safe()
# Deserialize (streams will be placeholder strings)
reconstructed = OCROptions.model_validate_json_safe(options_json)
# Streams should be converted to placeholder strings
assert reconstructed.input_file == 'stream'
assert reconstructed.output_file == 'stream'
assert reconstructed.languages == ['eng']
assert reconstructed.optimize == 1
def test_json_serialization_with_none_values():
"""Test JSON serialization handles None values correctly."""
options = OCROptions(
input_file=Path('/test/input.pdf'),
output_file=Path('/test/output.pdf'),
languages=['eng'],
# Many fields will be None by default
)
# Serialize to JSON
options_json = options.model_dump_json_safe()
# Deserialize
reconstructed = OCROptions.model_validate_json_safe(options_json)
# Verify None values are preserved (check actual defaults from model)
assert reconstructed.tesseract_timeout == 0.0 # Default value, not None
assert reconstructed.fast_web_view == 1.0 # Default value, not None
assert (
reconstructed.color_conversion_strategy == "LeaveColorUnchanged"
) # Default value
assert reconstructed.pdfa_image_compression is None # This one is actually None
# Verify non-None values are preserved
assert reconstructed.input_file == options.input_file
assert reconstructed.output_file == options.output_file
assert reconstructed.languages == options.languages