From 7566d4b76834428ddeaecc6ce0c1601ac4fcccda Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Mon, 27 May 2019 16:55:04 -0700
Subject: [PATCH] Introduce plugins/filters

---
 src/ocrmypdf/_filters.py  | 82 +++++++++++++++++++++++++++++++++++
 src/ocrmypdf/_pipeline.py |  5 ++-
 src/ocrmypdf/api.py       |  4 +-
 src/ocrmypdf/cli.py       |  6 ++-
 src/ocrmypdf/filters.py   | 10 +++++
 tests/test_filters.py     | 91 +++++++++++++++++++++++++++++++++++++++
 6 files changed, 193 insertions(+), 5 deletions(-)
 create mode 100644 src/ocrmypdf/_filters.py
 create mode 100644 src/ocrmypdf/filters.py
 create mode 100644 tests/test_filters.py

diff --git a/src/ocrmypdf/_filters.py b/src/ocrmypdf/_filters.py
new file mode 100644
index 00000000..44ea1867
--- /dev/null
+++ b/src/ocrmypdf/_filters.py
@@ -0,0 +1,82 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+import importlib
+import os
+import sys
+
+
+log = logging.getLogger(__name__)
+
+
+def _load_function_from_module(location):
+    """Load a function given a module location
+
+    For location=a.b.c, will effectively run "from a.b import c"
+
+    Example:
+        _load_function_from_module("a.b.c")
+
+    """
+    module_parts = location.split('.')
+    module_name = '.'.join(module_parts[:-1])
+    object_name = module_parts[-1]
+    module = importlib.import_module(module_name)
+    fn = getattr(module, object_name)
+    log.debug(f"Loaded function: from {module_name} import {object_name}")
+    return fn
+
+
+def _load_function_from_pyfile(location):
+    """Load a function from a file
+
+    Example:
+        _load_function_from_pyfile("test.py::blur_filter")
+    """
+    filename, object_name = location.split('::', maxsplit=1)
+    log.debug(f"Loading function {object_name} from {filename}")
+
+    module_name = os.path.basename(filename)
+    if module_name.endswith('.py'):
+        module_name = module_name[:-3]
+
+    spec = importlib.util.spec_from_file_location(module_name, filename)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    fn = getattr(module, object_name)
+    return fn
+
+
+def load_filter(filt):
+    if callable(filt):
+        return filt
+
+    if not isinstance(filt, str):
+        raise TypeError()
+
+    if '::' not in filt:
+        filt = _load_function_from_module(filt)
+    else:
+        filt = _load_function_from_pyfile(filt)
+
+    return filt
+
+
+def check_filter_loadable(filt):
+    load_filter(filt)
+    return filt
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 70657a3a..8ab62309 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -28,6 +28,8 @@ import pikepdf
 from pikepdf.models.metadata import encode_pdf_date
 
 from . import PROGRAM_NAME, VERSION, leptonica
+
+from ._filters import load_filter
 from .exceptions import (
     DpiError,
     EncryptedPdfError,
@@ -522,7 +524,8 @@ def create_ocr_image(image, page_context):
             im = pix.topil()
 
         if options.filter_ocr_image:
-            im = options.filter_ocr_image(im)
+            filt = load_filter(options.filter_ocr_image)
+            im = filt(im)
 
         del draw
         # Pillow requires integer DPI
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index 701e3b73..6270a3de 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -118,7 +118,7 @@ def create_options(*, input_file, output_file, **kwargs):
     for arg, val in kwargs.items():
         if val is None:
             continue
-        if arg.startswith('filter') and callable(val):
+        if arg.startswith('filter') and (callable(val) or isinstance(val, str)):
             filters.append((arg, val))
             continue
         cmd_style_arg = arg.replace('_', '-')
@@ -188,7 +188,7 @@ def ocrmypdf(  # pylint: disable=unused-argument
     user_patterns=None,
     keep_temporary_files=None,
     progress_bar=None,
-    process_ocr_image=None,
+    filter_ocr_image=None,
 ):
     """Run OCRmyPDF on one PDF or image.
 
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index 1480d677..d7a9cf69 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -18,6 +18,7 @@
 import argparse
 
 from . import PROGRAM_NAME, VERSION
+from ._filters import check_filter_loadable
 
 
 def numeric(basetype, min_=None, max_=None):
@@ -466,9 +467,10 @@ advanced.add_argument(
     help="Specify the location of the Tesseract user patterns file.",
 )
 
-
 filters = parser.add_argument_group("Filters", argparse.SUPPRESS)
-filters.add_argument('--filter-ocr-image', help=argparse.SUPPRESS)
+filters.add_argument(
+    '--filter-ocr-image', help=argparse.SUPPRESS, type=check_filter_loadable
+)
 
 debugging = parser.add_argument_group(
     "Debugging", "Arguments to help with troubleshooting and debugging"
diff --git a/src/ocrmypdf/filters.py b/src/ocrmypdf/filters.py
new file mode 100644
index 00000000..e51381bf
--- /dev/null
+++ b/src/ocrmypdf/filters.py
@@ -0,0 +1,10 @@
+from PIL import Image
+import PIL.ImageOps
+
+
+def invert(im):
+    return PIL.ImageOps.invert(im.convert('L'))
+
+
+def whiteout(im):
+    return Image.new(im.mode, im.size)
diff --git a/tests/test_filters.py b/tests/test_filters.py
new file mode 100644
index 00000000..80128dd3
--- /dev/null
+++ b/tests/test_filters.py
@@ -0,0 +1,91 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+
+from PIL import Image
+import pytest
+
+
+from ocrmypdf import ocrmypdf
+from ocrmypdf.filters import invert, whiteout
+from ocrmypdf._filters import load_filter
+
+
+os_environ = pytest.helpers.os_environ
+check_ocrmypdf = pytest.helpers.check_ocrmypdf
+
+
+def filter_42():
+    return 42
+
+
+def test_pyfile():
+    obj = load_filter(f'{__file__}::filter_42')
+    assert obj() == 42
+
+
+def test_pyfile_notexist():
+    with pytest.raises(FileNotFoundError):
+        load_filter('thisfile.doesnot.exist.py::filter_42')
+
+
+def test_pyfile_noobject():
+    with pytest.raises(AttributeError):
+        load_filter(f'{__file__}::no_function_with_this_name')
+
+
+def test_module():
+    obj = load_filter(f'os.getuid')
+    assert obj() == os.getuid()
+
+
+def test_module_notexist():
+    with pytest.raises(ModuleNotFoundError):
+        load_filter('thismodule.doesnot.exist')
+
+
+def test_filter_from_cmdline(resources, outdir):
+    (outdir / 'temp.py').write_text(
+        "from PIL import Image\n"
+        "def whiteout(im):\n"
+        "    return Image.new(im.mode, im.size)\n"
+    )
+
+    check_ocrmypdf(
+        resources / 'crom.png',
+        outdir / 'out.pdf',
+        '--image-dpi',
+        '100',
+        '--sidecar',
+        outdir / 'sidecar.txt',
+        '--filter-ocr-image',
+        f"{outdir / 'temp.py'}::whiteout",
+    )
+
+    assert (outdir / 'sidecar.txt').read_text().strip() == ''
+
+
+def test_filter_from_api(resources, outdir):
+    ocrmypdf(
+        resources / 'crom.png',
+        outdir / 'out.pdf',
+        image_dpi=100,
+        sidecar=outdir / 'sidecar.txt',
+        filter_ocr_image=whiteout,
+    )
+    assert (outdir / 'sidecar.txt').read_text().strip() == ''