# SPDX-FileCopyrightText: 2022 James R. Barlow
# SPDX-License-Identifier: MPL-2.0

"""Tests for the option and dependency checks in ``ocrmypdf._validation``."""

from __future__ import annotations

import logging
import os
from unittest.mock import patch

import pikepdf
import pytest

from ocrmypdf import _validation as vd
from ocrmypdf._concurrent import NullProgressBar, SerialExecutor
from ocrmypdf._exec.tesseract import TesseractVersion
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf.api import create_options
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.pdfinfo import PdfInfo

from .conftest import run_ocrmypdf_api


def make_opts_pm(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
    """Build an options object and the plugin manager used to create it.

    Args:
        input_file: Input path forwarded to ``create_options``.
        output_file: Output path forwarded to ``create_options``.
        language: Language forwarded as the ``language`` keyword; pass
            ``None`` to omit the keyword entirely.
        **kwargs: Additional keywords forwarded to ``create_options``. A
            ``plugins`` entry, if present, is also used to build the plugin
            manager.

    Returns:
        A ``(options, plugin_manager)`` tuple.
    """
    if language is not None:
        kwargs['language'] = language
    parser = get_parser()
    pm = get_plugin_manager(kwargs.get('plugins', []))
    # Let plugins register their own arguments before options are created.
    pm.hook.add_options(parser=parser)  # pylint: disable=no-member
    return (
        create_options(
            input_file=input_file, output_file=output_file, parser=parser, **kwargs
        ),
        pm,
    )


def make_opts(*args, **kwargs):
    """Return only the options object produced by :func:`make_opts_pm`."""
    opts, _pm = make_opts_pm(*args, **kwargs)
    return opts


def test_hocr_notlatin_warning(caplog):
    """The hocr renderer combined with language chi_sim logs a warning."""
    opts = make_opts(language='chi_sim', pdf_renderer='hocr', output_type='pdfa')
    vd.check_options_output(opts)
    assert 'PDF renderer is known to cause' in caplog.text


def test_old_tesseract_error():
    """A reported tesseract version of 4.00.00alpha fails check_options."""
    with patch('ocrmypdf._exec.tesseract.version', return_value='4.00.00alpha'):
        with pytest.raises(MissingDependencyError):
            vd.check_options(*make_opts_pm(pdf_renderer='sandwich', language='eng'))


def test_tesseract_not_installed(caplog):
    """A FileNotFoundError from subprocess.run becomes MissingDependencyError."""
    with patch('ocrmypdf.subprocess.run') as not_found:
        not_found.side_effect = FileNotFoundError('tesseract')
        with pytest.raises(MissingDependencyError, match="Could not find program"):
            vd.check_options(*make_opts_pm())
            # NOTE(review): placed inside the raises block, these assertions
            # are skipped as soon as check_options raises. Confirm that the
            # log messages are still emitted before moving them out.
            assert (
                "'tesseract' could not be executed" in caplog.text
            ), "Error message not printed"
            assert 'install' in caplog.text, "Install advice not printed"
        not_found.assert_called()


def test_lossless_redo():
    """redo_ocr together with deskew is rejected with BadArgsError."""
    with pytest.raises(BadArgsError):
        vd.check_options_output(make_opts(redo_ocr=True, deskew=True))


def test_mutex_options():
    """force_ocr, skip_text and redo_ocr are rejected in every pairing."""
    with pytest.raises(BadArgsError):
        vd.check_options_ocr_behavior(make_opts(force_ocr=True, skip_text=True))
    with pytest.raises(BadArgsError):
        vd.check_options_ocr_behavior(make_opts(redo_ocr=True, skip_text=True))
    with pytest.raises(BadArgsError):
        vd.check_options_ocr_behavior(make_opts(redo_ocr=True, force_ocr=True))


def test_optimizing(caplog):
    """Optimization tuning options given with optimize=0 are reported as ignored."""
    vd.check_options(
        *make_opts_pm(optimize=0, jbig2_lossy=True, png_quality=18, jpeg_quality=10)
    )
    assert 'will be ignored because' in caplog.text


def test_pillow_options():
    """check_options_pillow accepts max_image_mpixels=0 without raising."""
    vd.check_options_pillow(make_opts(max_image_mpixels=0))


def test_output_tty():
    """Requesting output to '-' while stdout is a TTY raises BadArgsError."""
    with patch('sys.stdout.isatty', return_value=True):
        with pytest.raises(BadArgsError):
            vd.check_requested_output_file(make_opts(output_file='-'))


def test_report_file_size(tmp_path, caplog):
    """report_output_file_size logs only when the output outgrows the input."""
    in_ = tmp_path / 'a.pdf'
    out = tmp_path / 'b.pdf'
    pdf = pikepdf.new()
    pdf.save(in_)
    pdf.save(out)
    opts = make_opts(output_type='pdf')

    # Input and output are the same document: nothing should be logged.
    vd.report_output_file_size(opts, in_, out)
    assert caplog.text == ''
    caplog.clear()

    # Pad the output with twice as much filler as the input.
    waste_of_space = b'Dummy' * 5000
    pdf.Root.Dummy = waste_of_space
    pdf.save(in_)
    pdf.Root.Dummy2 = waste_of_space + waste_of_space
    pdf.save(out)

    vd.report_output_file_size(opts, in_, out, ['The optional dependency...'])
    assert 'optional dependency' in caplog.text
    caplog.clear()

    vd.report_output_file_size(opts, in_, out, [])
    assert 'No reason' in caplog.text
    caplog.clear()

    opts = make_opts(in_, out, optimize=0, output_type='pdf')
    vd.report_output_file_size(opts, in_, out, ["Optimization was disabled."])
    assert 'disabled' in caplog.text
    caplog.clear()


def test_false_action_store_true():
    """keep_temporary_files on the options mirrors the boolean passed in."""
    opts = make_opts(keep_temporary_files=True)
    assert opts.keep_temporary_files

    opts = make_opts(keep_temporary_files=False)
    assert not opts.keep_temporary_files


@pytest.mark.parametrize('progress_bar', [True, False])
def test_no_progress_bar(progress_bar, resources):
    """The progress bar is built with disable set opposite to progress_bar."""
    opts, pm = make_opts_pm(
        progress_bar=progress_bar, input_file=(resources / 'trivial.pdf')
    )
    vd.check_options(opts, pm)

    pbar_disabled = None

    class CheckProgressBar(NullProgressBar):
        """Progress bar that records the ``disable`` value it was built with."""

        def __init__(self, disable, **kwargs):
            nonlocal pbar_disabled
            pbar_disabled = disable
            super().__init__(disable=disable, **kwargs)

    executor = SerialExecutor(pbar_class=CheckProgressBar)
    pdfinfo = PdfInfo(opts.input_file, progbar=opts.progress_bar, executor=executor)

    assert pdfinfo is not None
    assert pbar_disabled is not None and pbar_disabled != progress_bar


def test_language_warning(caplog):
    """The assumed-language notice appears only for the fr_FR locale."""
    opts = make_opts(language=None)
    _plugin_manager = get_plugin_manager(opts.plugins)
    caplog.set_level(logging.DEBUG)

    with patch(
        'ocrmypdf._validation.locale.getlocale', return_value=('en_US', 'UTF-8')
    ) as mock:
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        # The previous check, `'' in caplog.text`, was always true. Assert
        # instead that the notice expected for fr_FR below is absent here.
        assert 'assuming --language' not in caplog.text
        mock.assert_called_once()

    opts = make_opts(language=None)
    with patch(
        'ocrmypdf._validation.locale.getlocale', return_value=('fr_FR', 'UTF-8')
    ) as mock:
        vd.check_options_languages(opts, {'eng'})
        assert opts.languages == {'eng'}
        assert 'assuming --language' in caplog.text
        mock.assert_called_once()


def test_version_comparison():
    """check_external_program accepts or rejects a range of version strings."""
    vd.check_external_program(
        program="dummy_basic",
        package="dummy",
        version_checker=lambda: '9.0',
        need_version='8.0.2',
    )
    # A lexical string comparison would wrongly rank '10.0' below '8.0.2'.
    vd.check_external_program(
        program="dummy_doubledigit",
        package="dummy",
        version_checker=lambda: '10.0',
        need_version='8.0.2',
    )
    with pytest.raises(MissingDependencyError):
        vd.check_external_program(
            program="tesseract",
            package="tesseract",
            version_checker=lambda: '4.0.0-beta.1',
            need_version='4.1.1',
            version_parser=TesseractVersion,
        )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
        version_checker=lambda: 'v5.0.0-alpha.20200201',
        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
        version_checker=lambda: '5.0.0-rc1.20211030',
        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
    vd.check_external_program(
        program="tesseract",
        package="tesseract",
        version_checker=lambda: 'v4.1.1.20181030',  # Some Windows builds use this format
        need_version='4.1.1',
        version_parser=TesseractVersion,
    )
    with pytest.raises(MissingDependencyError):
        vd.check_external_program(
            program="tesseract",
            package="tesseract",
            version_checker=lambda: '4.1.1-rc2-25-g9707',
            need_version='4.1.1',
            version_parser=TesseractVersion,
        )
    with pytest.raises(MissingDependencyError):
        vd.check_external_program(
            program="dummy_fails",
            package="dummy",
            version_checker=lambda: '1.0',
            need_version='2.0',
        )


def test_optional_program_recommended(caplog):
    """A missing recommended program logs a warning rather than raising."""
    caplog.clear()

    def raiser():
        raise FileNotFoundError('jbig2')

    with caplog.at_level(logging.WARNING):
        vd.check_external_program(
            program="jbig2",
            package="jbig2enc",
            version_checker=raiser,
            need_version='42',
            required_for='this test case',
            recommended=True,
        )
        assert any(
            (loglevel == logging.WARNING and "recommended" in msg)
            for _logger_name, loglevel, msg in caplog.record_tuples
        )


def test_pagesegmode_warning(caplog):
    """tesseract_pagesegmode='0' logs a message containing 'disable OCR'."""
    opts = make_opts(tesseract_pagesegmode='0')
    plugin_manager = get_plugin_manager(opts.plugins)
    vd.check_options(opts, plugin_manager)
    assert 'disable OCR' in caplog.text


def test_two_languages():
    """A '+'-joined language string validates when both languages are offered."""
    vd.check_options_languages(
        create_options(
            input_file='a.pdf',
            output_file='b.pdf',
            parser=get_parser(),
            language='fakelang1+fakelang2',
        ),
        {'fakelang1', 'fakelang2'},
    )


def test_sidecar_equals_output(resources, no_outpdf):
    """Using the output path as the --sidecar path raises BadArgsError."""
    op = no_outpdf
    with pytest.raises(BadArgsError, match=r'--sidecar'):
        run_ocrmypdf_api(resources / 'trivial.pdf', op, '--sidecar', op)


def test_devnull_sidecar(resources):
    """A bare --sidecar with os.devnull as the output raises BadArgsError."""
    with pytest.raises(BadArgsError, match=r'--sidecar.*NUL'):
        run_ocrmypdf_api(resources / 'trivial.pdf', os.devnull, '--sidecar')