diff --git a/src/ocrmypdf/_filters.py b/src/ocrmypdf/_filters.py
new file mode 100644
index 00000000..44ea1867
--- /dev/null
+++ b/src/ocrmypdf/_filters.py
@@ -0,0 +1,82 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+import importlib
+import os
+import sys
+
+
+# Module-level logger, named after this module via __name__.
+log = logging.getLogger(__name__)
+
+
+def _load_function_from_module(location):
+    """Load an object given its dotted module location.
+
+    For location=a.b.c, will effectively run "from a.b import c".
+
+    Args:
+        location: Dotted path. The last component is the attribute name; the
+            components before it name the module to import.
+
+    Returns:
+        The attribute looked up on the imported module.
+
+    Raises:
+        ValueError: If ``location`` has no module part (e.g. "name").
+        ModuleNotFoundError: If the module cannot be found.
+        AttributeError: If the module has no attribute with that name.
+
+    Example:
+        _load_function_from_module("a.b.c")
+    """
+    module_name, _, object_name = location.rpartition('.')
+    if not module_name:
+        # An empty module name cannot be imported; fail with a message that
+        # shows the caller what form was expected.
+        raise ValueError(
+            "Expected a location of the form 'package.module.function', "
+            f"got {location!r}"
+        )
+    module = importlib.import_module(module_name)
+    fn = getattr(module, object_name)
+    log.debug("Loaded function: from %s import %s", module_name, object_name)
+    return fn
+
+
+def _load_function_from_pyfile(location):
+    """Load an object from a Python source file.
+
+    Args:
+        location: A string of the form "path/to/file.py::name". The part
+            before the first "::" is the file to execute as a module; the
+            part after it is the attribute to fetch from that module.
+
+    Returns:
+        The attribute looked up on the freshly executed module.
+
+    Raises:
+        ImportError: If no module spec/loader can be created for the file.
+        FileNotFoundError: If the file does not exist.
+        AttributeError: If the module has no attribute with that name.
+
+    Example:
+        _load_function_from_pyfile("test.py::blur_filter")
+    """
+    # "import importlib" alone does not guarantee that the importlib.util
+    # submodule has been loaded, so import it explicitly where it is used.
+    import importlib.util
+
+    filename, object_name = location.split('::', maxsplit=1)
+    log.debug(f"Loading function {object_name} from {filename}")
+
+    # The module is named after the file, without directory or ".py" suffix.
+    module_name = os.path.basename(filename)
+    if module_name.endswith('.py'):
+        module_name = module_name[:-3]
+
+    spec = importlib.util.spec_from_file_location(module_name, filename)
+    if spec is None or spec.loader is None:
+        # Without this check the failure would surface as an unrelated
+        # AttributeError on None further down.
+        raise ImportError(f"Cannot load a Python module from {filename!r}")
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return getattr(module, object_name)
+
+
+def load_filter(filt):
+    """Resolve a filter specification to the object it names.
+
+    Args:
+        filt: Either a callable, which is returned unchanged, or a string.
+            A string containing "::" is treated as "path/to/file.py::name"
+            and loaded from that file; any other string is treated as a
+            dotted "package.module.name" location and imported.
+
+    Returns:
+        The callable itself, or the object loaded from the given location.
+
+    Raises:
+        TypeError: If ``filt`` is neither callable nor a string.
+    """
+    if callable(filt):
+        return filt
+
+    if not isinstance(filt, str):
+        raise TypeError(
+            "Filter must be a callable or a string location, "
+            f"not {type(filt).__name__}"
+        )
+
+    if '::' in filt:
+        return _load_function_from_pyfile(filt)
+    return _load_function_from_module(filt)
+
+
+def check_filter_loadable(filt):
+    """Check that a filter specification can be loaded; return it unchanged.
+
+    This is used as an argparse ``type=`` callable. Load failures are
+    re-raised as ArgumentTypeError so that the failure reaches argparse as
+    an argument error rather than as an unrelated exception type.
+
+    Args:
+        filt: A callable or a string location accepted by load_filter.
+
+    Returns:
+        ``filt`` itself (not the loaded object).
+
+    Raises:
+        argparse.ArgumentTypeError: If the module, file or attribute named
+            by ``filt`` cannot be loaded.
+        TypeError: If ``filt`` is neither callable nor a string.
+    """
+    from argparse import ArgumentTypeError
+
+    try:
+        load_filter(filt)
+    except (ImportError, AttributeError, OSError, SyntaxError) as err:
+        raise ArgumentTypeError(f"cannot load filter {filt!r}: {err}") from err
+    return filt
diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py
index 70657a3a..8ab62309 100644
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -28,6 +28,8 @@ import pikepdf
from pikepdf.models.metadata import encode_pdf_date
from . import PROGRAM_NAME, VERSION, leptonica
+
+from ._filters import load_filter
from .exceptions import (
DpiError,
EncryptedPdfError,
@@ -522,7 +524,8 @@ def create_ocr_image(image, page_context):
im = pix.topil()
if options.filter_ocr_image:
- im = options.filter_ocr_image(im)
+ filt = load_filter(options.filter_ocr_image)
+ im = filt(im)
del draw
# Pillow requires integer DPI
diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py
index 701e3b73..6270a3de 100644
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -118,7 +118,7 @@ def create_options(*, input_file, output_file, **kwargs):
for arg, val in kwargs.items():
if val is None:
continue
- if arg.startswith('filter') and callable(val):
+ if arg.startswith('filter') and (callable(val) or isinstance(val, str)):
filters.append((arg, val))
continue
cmd_style_arg = arg.replace('_', '-')
@@ -188,7 +188,7 @@ def ocrmypdf( # pylint: disable=unused-argument
user_patterns=None,
keep_temporary_files=None,
progress_bar=None,
- process_ocr_image=None,
+ filter_ocr_image=None,
):
"""Run OCRmyPDF on one PDF or image.
diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py
index 1480d677..d7a9cf69 100644
--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -18,6 +18,7 @@
import argparse
from . import PROGRAM_NAME, VERSION
+from ._filters import check_filter_loadable
def numeric(basetype, min_=None, max_=None):
@@ -466,9 +467,10 @@ advanced.add_argument(
help="Specify the location of the Tesseract user patterns file.",
)
-
filters = parser.add_argument_group("Filters", argparse.SUPPRESS)
-filters.add_argument('--filter-ocr-image', help=argparse.SUPPRESS)
+filters.add_argument(
+ '--filter-ocr-image', help=argparse.SUPPRESS, type=check_filter_loadable
+)
debugging = parser.add_argument_group(
"Debugging", "Arguments to help with troubleshooting and debugging"
diff --git a/src/ocrmypdf/filters.py b/src/ocrmypdf/filters.py
new file mode 100644
index 00000000..e51381bf
--- /dev/null
+++ b/src/ocrmypdf/filters.py
@@ -0,0 +1,10 @@
+from PIL import Image
+import PIL.ImageOps
+
+
+def invert(im):
+    """Convert *im* to mode 'L' and return its inverse.
+
+    The inversion is delegated to PIL.ImageOps.invert.
+    """
+    as_l_mode = im.convert('L')
+    return PIL.ImageOps.invert(as_l_mode)
+
+
+def whiteout(im):
+    """Return a new blank image with the same mode and size as *im*.
+
+    NOTE(review): no fill color is passed to Image.new, so Pillow's default
+    fill applies (documented as black, not white) -- confirm this is the
+    intended result for a function named "whiteout".
+    """
+    mode, size = im.mode, im.size
+    return Image.new(mode, size)
diff --git a/tests/test_filters.py b/tests/test_filters.py
new file mode 100644
index 00000000..80128dd3
--- /dev/null
+++ b/tests/test_filters.py
@@ -0,0 +1,91 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
+
+import os
+
+from PIL import Image
+import pytest
+
+
+from ocrmypdf import ocrmypdf
+from ocrmypdf.filters import invert, whiteout
+from ocrmypdf._filters import load_filter
+
+
+# Local aliases for helpers exposed through pytest.helpers.
+os_environ = pytest.helpers.os_environ
+check_ocrmypdf = pytest.helpers.check_ocrmypdf
+
+
+def filter_42():
+    """Return 42; a trivial target for the file-loading tests to look up."""
+    answer = 42
+    return answer
+
+
+def test_pyfile():
+    """A "file.py::name" location loads the named function from that file."""
+    location = f'{__file__}::filter_42'
+    loaded = load_filter(location)
+    assert loaded() == 42
+
+
+def test_pyfile_notexist():
+    """Loading from a file that does not exist raises FileNotFoundError."""
+    missing_file_location = 'thisfile.doesnot.exist.py::filter_42'
+    with pytest.raises(FileNotFoundError):
+        load_filter(missing_file_location)
+
+
+def test_pyfile_noobject():
+    """Asking for a name the file does not define raises AttributeError."""
+    unknown_name_location = f'{__file__}::no_function_with_this_name'
+    with pytest.raises(AttributeError):
+        load_filter(unknown_name_location)
+
+
+def test_module():
+    """A dotted "module.name" location resolves to that module attribute.
+
+    os.getpid is used as the target because it exists on every platform;
+    os.getuid is only available on Unix.
+    """
+    obj = load_filter('os.getpid')
+    assert obj is os.getpid
+    assert obj() == os.getpid()
+
+
+def test_module_notexist():
+    """A dotted location in a missing module raises ModuleNotFoundError."""
+    missing_module_location = 'thismodule.doesnot.exist'
+    with pytest.raises(ModuleNotFoundError):
+        load_filter(missing_module_location)
+
+
+def test_filter_from_cmdline(resources, outdir):
+    """A filter passed on the command line as "file.py::name" is loaded.
+
+    The test writes a small filter module into outdir, runs OCRmyPDF with
+    --filter-ocr-image pointing at it, and expects the sidecar text to be
+    empty.
+    """
+    filter_script = outdir / 'temp.py'
+    sidecar = outdir / 'sidecar.txt'
+
+    filter_script.write_text(
+        "from PIL import Image\n"
+        "def whiteout(im):\n"
+        " return Image.new(im.mode, im.size)\n"
+    )
+
+    check_ocrmypdf(
+        resources / 'crom.png',
+        outdir / 'out.pdf',
+        '--image-dpi',
+        '100',
+        '--sidecar',
+        sidecar,
+        '--filter-ocr-image',
+        f"{filter_script}::whiteout",
+    )
+
+    recognized_text = sidecar.read_text().strip()
+    assert recognized_text == ''
+
+
+def test_filter_from_api(resources, outdir):
+    """A callable passed as filter_ocr_image through the API is accepted.
+
+    The whiteout filter is passed directly as a function, and the sidecar
+    text is expected to be empty.
+    """
+    sidecar = outdir / 'sidecar.txt'
+
+    ocrmypdf(
+        resources / 'crom.png',
+        outdir / 'out.pdf',
+        image_dpi=100,
+        sidecar=sidecar,
+        filter_ocr_image=whiteout,
+    )
+
+    recognized_text = sidecar.read_text().strip()
+    assert recognized_text == ''