From f095e91cb4aa7659176641ab87c72be3fca82bb5 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 7 Feb 2019 16:21:02 -0800 Subject: [PATCH] unpaper-args: add test case and harden feature --- src/ocrmypdf/exec/unpaper.py | 46 ++++++++++++++------- tests/conftest.py | 2 +- tests/test_unpaper.py | 78 +++++++++++++++++++++++++++++++----- 3 files changed, 101 insertions(+), 25 deletions(-) diff --git a/src/ocrmypdf/exec/unpaper.py b/src/ocrmypdf/exec/unpaper.py index cf53fd8a..40c79594 100644 --- a/src/ocrmypdf/exec/unpaper.py +++ b/src/ocrmypdf/exec/unpaper.py @@ -20,13 +20,14 @@ import os import shlex +import subprocess import sys from functools import lru_cache -from subprocess import STDOUT, CalledProcessError, check_output -from tempfile import NamedTemporaryFile +from subprocess import PIPE, STDOUT, CalledProcessError +from tempfile import TemporaryDirectory from . import get_version -from ..exceptions import MissingDependencyError +from ..exceptions import MissingDependencyError, SubprocessOutputError try: from PIL import Image @@ -65,26 +66,43 @@ def run(input_file, output_file, dpi, log, mode_args): im.close() raise MissingDependencyError() from e - with NamedTemporaryFile(suffix=suffix) as input_pnm, NamedTemporaryFile( - suffix=suffix, mode="r+b" - ) as output_pnm: + with TemporaryDirectory() as tmpdir: + input_pnm = os.path.join(tmpdir, f'input{suffix}') + output_pnm = os.path.join(tmpdir, f'output{suffix}') im.save(input_pnm, format='PPM') im.close() - os.unlink(output_pnm.name) - - args_unpaper.extend([input_pnm.name, output_pnm.name]) + # To prevent any shenanigans from accepting arbitrary parameters in + # --unpaper-args, we: + # 1) run with cwd set to a tmpdir with only unpaper's files + # 2) forbid the use of '/' in arguments, to prevent changing paths + # 3) append absolute paths for the input and output file + # This should ensure that a user cannot clobber some other file with + # their unpaper arguments (whether intentionally or otherwise) + args_unpaper.extend([input_pnm, output_pnm]) try: - stdout = check_output( - args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT + proc = subprocess.run( + args_unpaper, + check=True, + close_fds=True, + universal_newlines=True, + stderr=STDOUT, + cwd=tmpdir, + stdout=PIPE, ) except CalledProcessError as e: log.debug(e.output) raise e from e else: - log.debug(stdout) - # unpaper sets dpi to 72 - Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi)) + log.debug(proc.stdout) + # unpaper sets dpi to 72; fix this + try: + Image.open(output_pnm).save(output_file, dpi=(dpi, dpi)) + except (FileNotFoundError, OSError): + raise SubprocessOutputError( + "unpaper: failed to produce the expected output file. Called with: " + + str(args_unpaper) + ) from None def validate_custom_args(args: str): diff --git a/tests/conftest.py b/tests/conftest.py index 61a0f922..d68a6b23 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -147,7 +147,7 @@ def no_outpdf(tmpdir): @pytest.helpers.register def check_ocrmypdf(input_file, output_file, *args, env=None): - "Run ocrmypdf and confirmed that a valid file was created" + """Run ocrmypdf and confirmed that a valid file was created""" p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env) # ensure py.test collects the output, use -s to view diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index 62f06656..23f6d698 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -15,11 +15,16 @@ # You should have received a copy of the GNU General Public License # along with OCRmyPDF. If not, see . +import argparse +from os import fspath from pathlib import Path +from unittest.mock import MagicMock, patch import pytest +from ocrmypdf import __main__ as main from ocrmypdf.exceptions import ExitCode +from ocrmypdf.exec import unpaper # pytest.helpers is dynamic # pylint: disable=no-member @@ -30,26 +35,79 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf spoof = pytest.helpers.spoof -@pytest.fixture(scope='session') +def have_unpaper(): + try: + unpaper.version() + except Exception: + return False + else: + return True + + +@pytest.fixture(scope="session") def spoof_unpaper_oldversion(tmpdir_factory): - return spoof(tmpdir_factory, unpaper='unpaper_oldversion.py') + return spoof(tmpdir_factory, unpaper="unpaper_oldversion.py") -@pytest.mark.skipif(True, reason="needs new fixture implementation") def test_no_unpaper(resources, no_outpdf): - # - p, out, err = run_ocrmypdf( - resources / 'c02-22.pdf', no_outpdf, '--clean', env=os.environ - ) - assert p.returncode == ExitCode.missing_dependency + input_ = fspath(resources / "c02-22.pdf") + output = fspath(no_outpdf) + options = main.parser.parse_args(args=["--clean", input_, output]) + + with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version: + mock_unpaper_version.side_effect = FileNotFoundError("unpaper") + with pytest.raises(SystemExit): + main.check_options(options, log=MagicMock()) def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf): p, out, err = run_ocrmypdf( - resources / 'c02-22.pdf', no_outpdf, '--clean', env=spoof_unpaper_oldversion + resources / "c02-22.pdf", no_outpdf, "--clean", env=spoof_unpaper_oldversion ) assert p.returncode == ExitCode.missing_dependency +@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper") def test_clean(spoof_tesseract_noop, resources, outpdf): - check_ocrmypdf(resources / 'skew.pdf', outpdf, '-c', env=spoof_tesseract_noop) + check_ocrmypdf(resources / "skew.pdf", outpdf, "-c", env=spoof_tesseract_noop) + + +@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper") +def test_unpaper_args_valid(spoof_tesseract_noop, resources, outpdf): + check_ocrmypdf( + resources / "skew.pdf", + outpdf, + "-c", + "--unpaper-args", + "--layout double", # Spaces required here + env=spoof_tesseract_noop, + ) + + +@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper") +def test_unpaper_args_invalid_filename(spoof_tesseract_noop, resources, outpdf): + p, out, err = run_ocrmypdf( + resources / "skew.pdf", + outpdf, + "-c", + "--unpaper-args", + "/etc/passwd", + env=spoof_tesseract_noop, + ) + assert "No filenames allowed" in err + assert p.returncode == ExitCode.bad_args + + +@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper") +def test_unpaper_args_invalid(spoof_tesseract_noop, resources, outpdf): + p, out, err = run_ocrmypdf( + resources / "skew.pdf", + outpdf, + "-c", + "--unpaper-args", + "unpaper is not going to like these arguments", + env=spoof_tesseract_noop, + ) + # Can't tell difference between unpaper choking on bad arguments or some + # other unpaper failure + assert p.returncode == ExitCode.child_process_error