diff --git a/src/ocrmypdf/exec/unpaper.py b/src/ocrmypdf/exec/unpaper.py
index cf53fd8a..40c79594 100644
--- a/src/ocrmypdf/exec/unpaper.py
+++ b/src/ocrmypdf/exec/unpaper.py
@@ -20,13 +20,14 @@
import os
import shlex
+import subprocess
import sys
from functools import lru_cache
-from subprocess import STDOUT, CalledProcessError, check_output
-from tempfile import NamedTemporaryFile
+from subprocess import PIPE, STDOUT, CalledProcessError
+from tempfile import TemporaryDirectory
from . import get_version
-from ..exceptions import MissingDependencyError
+from ..exceptions import MissingDependencyError, SubprocessOutputError
try:
from PIL import Image
@@ -65,26 +66,43 @@ def run(input_file, output_file, dpi, log, mode_args):
im.close()
raise MissingDependencyError() from e
- with NamedTemporaryFile(suffix=suffix) as input_pnm, NamedTemporaryFile(
- suffix=suffix, mode="r+b"
- ) as output_pnm:
+ with TemporaryDirectory() as tmpdir:
+ input_pnm = os.path.join(tmpdir, f'input{suffix}')
+ output_pnm = os.path.join(tmpdir, f'output{suffix}')
im.save(input_pnm, format='PPM')
im.close()
- os.unlink(output_pnm.name)
-
- args_unpaper.extend([input_pnm.name, output_pnm.name])
+ # To prevent any shenanigans from accepting arbitrary parameters in
+ # --unpaper-args, we:
+ # 1) run with cwd set to a tmpdir with only unpaper's files
+ # 2) forbid the use of '/' in arguments, to prevent changing paths
+ # 3) append absolute paths for the input and output file
+ # This should ensure that a user cannot clobber some other file with
+ # their unpaper arguments (whether intentionally or otherwise)
+ args_unpaper.extend([input_pnm, output_pnm])
try:
- stdout = check_output(
- args_unpaper, close_fds=True, universal_newlines=True, stderr=STDOUT
+ proc = subprocess.run(
+ args_unpaper,
+ check=True,
+ close_fds=True,
+ universal_newlines=True,
+ stderr=STDOUT,
+ cwd=tmpdir,
+ stdout=PIPE,
)
except CalledProcessError as e:
log.debug(e.output)
raise e from e
else:
- log.debug(stdout)
- # unpaper sets dpi to 72
- Image.open(output_pnm.name).save(output_file, dpi=(dpi, dpi))
+ log.debug(proc.stdout)
+ # unpaper sets dpi to 72; fix this
+ try:
+ Image.open(output_pnm).save(output_file, dpi=(dpi, dpi))
+ except (FileNotFoundError, OSError):
+ raise SubprocessOutputError(
+ "unpaper: failed to produce the expected output file. Called with: "
+ + str(args_unpaper)
+ ) from None
def validate_custom_args(args: str):
diff --git a/tests/conftest.py b/tests/conftest.py
index 61a0f922..d68a6b23 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -147,7 +147,7 @@ def no_outpdf(tmpdir):
@pytest.helpers.register
def check_ocrmypdf(input_file, output_file, *args, env=None):
- "Run ocrmypdf and confirmed that a valid file was created"
+ """Run ocrmypdf and confirmed that a valid file was created"""
p, out, err = run_ocrmypdf(input_file, output_file, *args, env=env)
# ensure py.test collects the output, use -s to view
diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py
index 62f06656..23f6d698 100644
--- a/tests/test_unpaper.py
+++ b/tests/test_unpaper.py
@@ -15,11 +15,16 @@
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see .
+import argparse
+from os import fspath
from pathlib import Path
+from unittest.mock import MagicMock, patch
import pytest
+from ocrmypdf import __main__ as main
from ocrmypdf.exceptions import ExitCode
+from ocrmypdf.exec import unpaper
# pytest.helpers is dynamic
# pylint: disable=no-member
@@ -30,26 +35,79 @@ run_ocrmypdf = pytest.helpers.run_ocrmypdf
spoof = pytest.helpers.spoof
-@pytest.fixture(scope='session')
+def have_unpaper():
+ try:
+ unpaper.version()
+ except Exception:
+ return False
+ else:
+ return True
+
+
+@pytest.fixture(scope="session")
def spoof_unpaper_oldversion(tmpdir_factory):
- return spoof(tmpdir_factory, unpaper='unpaper_oldversion.py')
+ return spoof(tmpdir_factory, unpaper="unpaper_oldversion.py")
-@pytest.mark.skipif(True, reason="needs new fixture implementation")
def test_no_unpaper(resources, no_outpdf):
- #
- p, out, err = run_ocrmypdf(
- resources / 'c02-22.pdf', no_outpdf, '--clean', env=os.environ
- )
- assert p.returncode == ExitCode.missing_dependency
+ input_ = fspath(resources / "c02-22.pdf")
+ output = fspath(no_outpdf)
+ options = main.parser.parse_args(args=["--clean", input_, output])
+
+ with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
+ mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
+ with pytest.raises(SystemExit):
+ main.check_options(options, log=MagicMock())
def test_old_unpaper(spoof_unpaper_oldversion, resources, no_outpdf):
p, out, err = run_ocrmypdf(
- resources / 'c02-22.pdf', no_outpdf, '--clean', env=spoof_unpaper_oldversion
+ resources / "c02-22.pdf", no_outpdf, "--clean", env=spoof_unpaper_oldversion
)
assert p.returncode == ExitCode.missing_dependency
+@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
def test_clean(spoof_tesseract_noop, resources, outpdf):
- check_ocrmypdf(resources / 'skew.pdf', outpdf, '-c', env=spoof_tesseract_noop)
+ check_ocrmypdf(resources / "skew.pdf", outpdf, "-c", env=spoof_tesseract_noop)
+
+
+@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
+def test_unpaper_args_valid(spoof_tesseract_noop, resources, outpdf):
+ check_ocrmypdf(
+ resources / "skew.pdf",
+ outpdf,
+ "-c",
+ "--unpaper-args",
+ "--layout double", # Spaces required here
+ env=spoof_tesseract_noop,
+ )
+
+
+@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
+def test_unpaper_args_invalid_filename(spoof_tesseract_noop, resources, outpdf):
+ p, out, err = run_ocrmypdf(
+ resources / "skew.pdf",
+ outpdf,
+ "-c",
+ "--unpaper-args",
+ "/etc/passwd",
+ env=spoof_tesseract_noop,
+ )
+ assert "No filenames allowed" in err
+ assert p.returncode == ExitCode.bad_args
+
+
+@pytest.mark.skipif(not have_unpaper(), reason="requires unpaper")
+def test_unpaper_args_invalid(spoof_tesseract_noop, resources, outpdf):
+ p, out, err = run_ocrmypdf(
+ resources / "skew.pdf",
+ outpdf,
+ "-c",
+ "--unpaper-args",
+ "unpaper is not going to like these arguments",
+ env=spoof_tesseract_noop,
+ )
+ # Can't tell difference between unpaper choking on bad arguments or some
+ # other unpaper failure
+ assert p.returncode == ExitCode.child_process_error