From 92beb474a51f095a206676aeb45cd616fcbeceb2 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 31 Jan 2026 12:05:37 -0800 Subject: [PATCH] Normalize unpaper_args to list at construction time Use a Pydantic field validator to convert string input to list[str] during OcrOptions construction, simplifying the type from `str | list[str] | None` to `list[str] | None`. Security validation (path injection check) now happens at construction rather than in check_options_preprocessing(). --- src/ocrmypdf/_exec/unpaper.py | 8 -------- src/ocrmypdf/_options.py | 19 ++++++++++++++++--- src/ocrmypdf/_validation.py | 9 +-------- tests/test_unpaper.py | 5 +++-- 4 files changed, 20 insertions(+), 21 deletions(-) diff --git a/src/ocrmypdf/_exec/unpaper.py b/src/ocrmypdf/_exec/unpaper.py index a5a92f4c..065fc9ef 100644 --- a/src/ocrmypdf/_exec/unpaper.py +++ b/src/ocrmypdf/_exec/unpaper.py @@ -7,7 +7,6 @@ from __future__ import annotations import logging import os -import shlex from collections.abc import Iterator from contextlib import contextmanager from decimal import Decimal @@ -101,13 +100,6 @@ def run_unpaper( ) from e -def validate_custom_args(args: str) -> list[str]: - unpaper_args = shlex.split(args) - if any(('/' in arg or arg == '.' or arg == '..') for arg in unpaper_args): - raise ValueError('No filenames allowed in --unpaper-args') - return unpaper_args - - def clean( input_file: Path, output_file: Path, diff --git a/src/ocrmypdf/_options.py b/src/ocrmypdf/_options.py index 9363b46a..e481363c 100644 --- a/src/ocrmypdf/_options.py +++ b/src/ocrmypdf/_options.py @@ -8,6 +8,7 @@ from __future__ import annotations import json import logging import os +import shlex import unicodedata from collections.abc import Sequence from enum import StrEnum @@ -156,9 +157,7 @@ class OcrOptions(BaseModel): remove_background: bool = False remove_vectors: bool = False oversample: int = 0 - unpaper_args: str | list[str] | None = ( - None # Can be string or list after validation - ) + unpaper_args: list[str] | None = None # OCR behavior skip_big: float | None = None @@ -339,6 +338,20 @@ class OcrOptions(BaseModel): # Convert string ranges to set of page numbers return _pages_from_ranges(v) + @field_validator('unpaper_args', mode='before') + @classmethod + def validate_unpaper_args(cls, v): + """Normalize unpaper_args from string to list and validate security.""" + if v is None: + return v + if isinstance(v, str): + v = shlex.split(v) + if isinstance(v, list): + if any(('/' in arg or arg == '.' or arg == '..') for arg in v): + raise ValueError('No filenames allowed in --unpaper-args') + return v + raise ValueError(f'unpaper_args must be a string or list, got {type(v)}') + @model_validator(mode='before') @classmethod def handle_special_cases(cls, data): diff --git a/src/ocrmypdf/_validation.py b/src/ocrmypdf/_validation.py index e28b1a2c..06927daa 100644 --- a/src/ocrmypdf/_validation.py +++ b/src/ocrmypdf/_validation.py @@ -114,15 +114,8 @@ def check_options_preprocessing(options: OcrOptions) -> None: package='unpaper', version_checker=unpaper.version, need_version='6.1', - required_for="--clean, --clean-final", # Problem arguments + required_for="--clean, --clean-final", ) - try: - if options.unpaper_args: - options.unpaper_args = unpaper.validate_custom_args( - options.unpaper_args - ) - except Exception as e: - raise BadArgsError("--unpaper-args: " + str(e)) from e def _check_plugin_invariant_options(options: OcrOptions) -> None: diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index 319748da..cb477747 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -9,11 +9,12 @@ from unittest.mock import Mock, patch import pytest from packaging.version import Version +from pydantic import ValidationError from ocrmypdf._exec import unpaper from ocrmypdf._validation import check_options from ocrmypdf.cli import get_options_and_plugins -from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError +from ocrmypdf.exceptions import ExitCode, MissingDependencyError from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api @@ -87,7 +88,7 @@ def test_unpaper_args_valid(resources, outpdf): @needs_unpaper def test_unpaper_args_invalid_filename(resources, outpdf, caplog): - with pytest.raises(BadArgsError): + with pytest.raises(ValidationError, match="No filenames allowed"): run_ocrmypdf_api( resources / "skew.pdf", outpdf,