Normalize unpaper_args to list at construction time

Use a Pydantic field validator to convert string input to list[str]
during OcrOptions construction, simplifying the type from
`str | list[str] | None` to `list[str] | None`. Security validation
(path injection check) now happens at construction rather than in
check_options_preprocessing().
This commit is contained in:
James R. Barlow
2026-01-31 12:05:37 -08:00
parent 9dcd882c83
commit 92beb474a5
4 changed files with 20 additions and 21 deletions

View File

@@ -7,7 +7,6 @@ from __future__ import annotations
import logging
import os
import shlex
from collections.abc import Iterator
from contextlib import contextmanager
from decimal import Decimal
@@ -101,13 +100,6 @@ def run_unpaper(
) from e
def validate_custom_args(args: str) -> list[str]:
unpaper_args = shlex.split(args)
if any(('/' in arg or arg == '.' or arg == '..') for arg in unpaper_args):
raise ValueError('No filenames allowed in --unpaper-args')
return unpaper_args
def clean(
input_file: Path,
output_file: Path,

View File

@@ -8,6 +8,7 @@ from __future__ import annotations
import json
import logging
import os
import shlex
import unicodedata
from collections.abc import Sequence
from enum import StrEnum
@@ -156,9 +157,7 @@ class OcrOptions(BaseModel):
remove_background: bool = False
remove_vectors: bool = False
oversample: int = 0
unpaper_args: str | list[str] | None = (
None # Can be string or list after validation
)
unpaper_args: list[str] | None = None
# OCR behavior
skip_big: float | None = None
@@ -339,6 +338,20 @@ class OcrOptions(BaseModel):
# Convert string ranges to set of page numbers
return _pages_from_ranges(v)
@field_validator('unpaper_args', mode='before')
@classmethod
def validate_unpaper_args(cls, v):
"""Normalize unpaper_args from string to list and validate security."""
if v is None:
return v
if isinstance(v, str):
v = shlex.split(v)
if isinstance(v, list):
if any(('/' in arg or arg == '.' or arg == '..') for arg in v):
raise ValueError('No filenames allowed in --unpaper-args')
return v
raise ValueError(f'unpaper_args must be a string or list, got {type(v)}')
@model_validator(mode='before')
@classmethod
def handle_special_cases(cls, data):

View File

@@ -114,15 +114,8 @@ def check_options_preprocessing(options: OcrOptions) -> None:
package='unpaper',
version_checker=unpaper.version,
need_version='6.1',
required_for="--clean, --clean-final", # Problem arguments
required_for="--clean, --clean-final",
)
try:
if options.unpaper_args:
options.unpaper_args = unpaper.validate_custom_args(
options.unpaper_args
)
except Exception as e:
raise BadArgsError("--unpaper-args: " + str(e)) from e
def _check_plugin_invariant_options(options: OcrOptions) -> None:

View File

@@ -9,11 +9,12 @@ from unittest.mock import Mock, patch
import pytest
from packaging.version import Version
from pydantic import ValidationError
from ocrmypdf._exec import unpaper
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_options_and_plugins
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api
@@ -87,7 +88,7 @@ def test_unpaper_args_valid(resources, outpdf):
@needs_unpaper
def test_unpaper_args_invalid_filename(resources, outpdf, caplog):
with pytest.raises(BadArgsError):
with pytest.raises(ValidationError, match="No filenames allowed"):
run_ocrmypdf_api(
resources / "skew.pdf",
outpdf,