mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 20:14:53 -04:00
Normalize unpaper_args to list at construction time
Use a Pydantic field validator to convert string input to list[str] during OcrOptions construction, simplifying the type from `str | list[str] | None` to `list[str] | None`. Security validation (path injection check) now happens at construction rather than in check_options_preprocessing().
This commit is contained in:
@@ -7,7 +7,6 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
from collections.abc import Iterator
|
||||
from contextlib import contextmanager
|
||||
from decimal import Decimal
|
||||
@@ -101,13 +100,6 @@ def run_unpaper(
|
||||
) from e
|
||||
|
||||
|
||||
def validate_custom_args(args: str) -> list[str]:
|
||||
unpaper_args = shlex.split(args)
|
||||
if any(('/' in arg or arg == '.' or arg == '..') for arg in unpaper_args):
|
||||
raise ValueError('No filenames allowed in --unpaper-args')
|
||||
return unpaper_args
|
||||
|
||||
|
||||
def clean(
|
||||
input_file: Path,
|
||||
output_file: Path,
|
||||
|
||||
@@ -8,6 +8,7 @@ from __future__ import annotations
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import shlex
|
||||
import unicodedata
|
||||
from collections.abc import Sequence
|
||||
from enum import StrEnum
|
||||
@@ -156,9 +157,7 @@ class OcrOptions(BaseModel):
|
||||
remove_background: bool = False
|
||||
remove_vectors: bool = False
|
||||
oversample: int = 0
|
||||
unpaper_args: str | list[str] | None = (
|
||||
None # Can be string or list after validation
|
||||
)
|
||||
unpaper_args: list[str] | None = None
|
||||
|
||||
# OCR behavior
|
||||
skip_big: float | None = None
|
||||
@@ -339,6 +338,20 @@ class OcrOptions(BaseModel):
|
||||
# Convert string ranges to set of page numbers
|
||||
return _pages_from_ranges(v)
|
||||
|
||||
@field_validator('unpaper_args', mode='before')
|
||||
@classmethod
|
||||
def validate_unpaper_args(cls, v):
|
||||
"""Normalize unpaper_args from string to list and validate security."""
|
||||
if v is None:
|
||||
return v
|
||||
if isinstance(v, str):
|
||||
v = shlex.split(v)
|
||||
if isinstance(v, list):
|
||||
if any(('/' in arg or arg == '.' or arg == '..') for arg in v):
|
||||
raise ValueError('No filenames allowed in --unpaper-args')
|
||||
return v
|
||||
raise ValueError(f'unpaper_args must be a string or list, got {type(v)}')
|
||||
|
||||
@model_validator(mode='before')
|
||||
@classmethod
|
||||
def handle_special_cases(cls, data):
|
||||
|
||||
@@ -114,15 +114,8 @@ def check_options_preprocessing(options: OcrOptions) -> None:
|
||||
package='unpaper',
|
||||
version_checker=unpaper.version,
|
||||
need_version='6.1',
|
||||
required_for="--clean, --clean-final", # Problem arguments
|
||||
required_for="--clean, --clean-final",
|
||||
)
|
||||
try:
|
||||
if options.unpaper_args:
|
||||
options.unpaper_args = unpaper.validate_custom_args(
|
||||
options.unpaper_args
|
||||
)
|
||||
except Exception as e:
|
||||
raise BadArgsError("--unpaper-args: " + str(e)) from e
|
||||
|
||||
|
||||
def _check_plugin_invariant_options(options: OcrOptions) -> None:
|
||||
|
||||
@@ -9,11 +9,12 @@ from unittest.mock import Mock, patch
|
||||
|
||||
import pytest
|
||||
from packaging.version import Version
|
||||
from pydantic import ValidationError
|
||||
|
||||
from ocrmypdf._exec import unpaper
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import get_options_and_plugins
|
||||
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError
|
||||
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
|
||||
|
||||
from .conftest import check_ocrmypdf, have_unpaper, run_ocrmypdf_api
|
||||
|
||||
@@ -87,7 +88,7 @@ def test_unpaper_args_valid(resources, outpdf):
|
||||
|
||||
@needs_unpaper
|
||||
def test_unpaper_args_invalid_filename(resources, outpdf, caplog):
|
||||
with pytest.raises(BadArgsError):
|
||||
with pytest.raises(ValidationError, match="No filenames allowed"):
|
||||
run_ocrmypdf_api(
|
||||
resources / "skew.pdf",
|
||||
outpdf,
|
||||
|
||||
Reference in New Issue
Block a user