Introduce plugins/filters

2026-05-06 05:36:29 -04:00 · 2019-05-27 16:55:04 -07:00
parent 5c4c32ab3c
commit 7566d4b768
6 changed files with 193 additions and 5 deletions
--- a/src/ocrmypdf/_filters.py
+++ b/src/ocrmypdf/_filters.py
@@ -0,0 +1,82 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import logging
+import importlib
+import os
+import sys
+
+
+log = logging.getLogger(__name__)  # module-level logger named after this module
+
+
+def _load_function_from_module(location):
+    """Load an object given its dotted location.
+
+    For location="a.b.c" this effectively runs "from a.b import c".
+
+    Raises ValueError if location has no module part (no '.'), ImportError
+    if the module cannot be imported, AttributeError if the name is missing.
+    """
+    module_name, _, object_name = location.rpartition('.')
+    if not module_name:
+        # import_module('') would fail with an opaque "Empty module name".
+        raise ValueError(f"expected 'module.function', got {location!r}")
+    module = importlib.import_module(module_name)
+    fn = getattr(module, object_name)
+    log.debug(f"Loaded function: from {module_name} import {object_name}")
+    return fn
+
+
+def _load_function_from_pyfile(location):
+    """Load a function from a Python source file.
+
+    location is "path/to/file.py::name", e.g. "test.py::blur_filter". The file
+    is executed as a new module on every call and attribute *name* is returned.
+    Raises ValueError without '::', AttributeError if the name is missing.
+    """
+    # "import importlib" alone does not guarantee importlib.util is loaded.
+    import importlib.util
+    filename, object_name = location.split('::', maxsplit=1)
+    log.debug(f"Loading function {object_name} from {filename}")
+    module_name = os.path.basename(filename)
+    if module_name.endswith('.py'):
+        module_name = module_name[:-3]
+    spec = importlib.util.spec_from_file_location(module_name, filename)
+    module = importlib.util.module_from_spec(spec)
+    spec.loader.exec_module(module)
+    return getattr(module, object_name)
+
+
+def load_filter(filt):
+    """Resolve a filter specification to a callable.
+
+    Callables are returned unchanged; a str with '::' is "file.py::name",
+    any other str is dotted "module.name". Raises TypeError otherwise.
+    """
+    if callable(filt):
+        return filt
+    if not isinstance(filt, str):
+        raise TypeError(f"filter must be callable or str, not {type(filt).__name__}")
+    if '::' in filt:
+        return _load_function_from_pyfile(filt)
+    return _load_function_from_module(filt)
+
+
+def check_filter_loadable(filt):
+    load_filter(filt)  # called for its exceptions only; result is discarded
+    return filt  # the original spec is returned, not the loaded function
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -28,6 +28,8 @@ import pikepdf
 from pikepdf.models.metadata import encode_pdf_date

 from . import PROGRAM_NAME, VERSION, leptonica
+
+from ._filters import load_filter
 from .exceptions import (
    DpiError,
    EncryptedPdfError,
@@ -522,7 +524,8 @@ def create_ocr_image(image, page_context):
            im = pix.topil()

        if options.filter_ocr_image:
-            im = options.filter_ocr_image(im)
+            filt = load_filter(options.filter_ocr_image)
+            im = filt(im)

        del draw
        # Pillow requires integer DPI
--- a/src/ocrmypdf/api.py
+++ b/src/ocrmypdf/api.py
@@ -118,7 +118,7 @@ def create_options(*, input_file, output_file, **kwargs):
    for arg, val in kwargs.items():
        if val is None:
            continue
-        if arg.startswith('filter') and callable(val):
+        if arg.startswith('filter') and (callable(val) or isinstance(val, str)):
            filters.append((arg, val))
            continue
        cmd_style_arg = arg.replace('_', '-')
@@ -188,7 +188,7 @@ def ocrmypdf(  # pylint: disable=unused-argument
    user_patterns=None,
    keep_temporary_files=None,
    progress_bar=None,
-    process_ocr_image=None,
+    filter_ocr_image=None,
 ):
    """Run OCRmyPDF on one PDF or image.

--- a/src/ocrmypdf/cli.py
+++ b/src/ocrmypdf/cli.py
@@ -18,6 +18,7 @@
 import argparse

 from . import PROGRAM_NAME, VERSION
+from ._filters import check_filter_loadable


 def numeric(basetype, min_=None, max_=None):
@@ -466,9 +467,10 @@ advanced.add_argument(
    help="Specify the location of the Tesseract user patterns file.",
 )

-
 filters = parser.add_argument_group("Filters", argparse.SUPPRESS)
-filters.add_argument('--filter-ocr-image', help=argparse.SUPPRESS)
+filters.add_argument(
+    '--filter-ocr-image', help=argparse.SUPPRESS, type=check_filter_loadable
+)

 debugging = parser.add_argument_group(
    "Debugging", "Arguments to help with troubleshooting and debugging"
--- a/src/ocrmypdf/filters.py
+++ b/src/ocrmypdf/filters.py
@@ -0,0 +1,10 @@
+from PIL import Image
+import PIL.ImageOps
+
+
+def invert(im):
+    return PIL.ImageOps.invert(im.convert('L'))  # convert to mode 'L', then invert
+
+
+def whiteout(im):  # NOTE(review): no fill color given - Pillow default is black? confirm
+    return Image.new(im.mode, im.size)  # new image, same mode and size as input
--- a/tests/test_filters.py
+++ b/tests/test_filters.py
@@ -0,0 +1,91 @@
+# © 2019 James R. Barlow: github.com/jbarlow83
+#
+# This file is part of OCRmyPDF.
+#
+# OCRmyPDF is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# OCRmyPDF is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with OCRmyPDF.  If not, see <http://www.gnu.org/licenses/>.
+
+import os
+
+from PIL import Image
+import pytest
+
+
+from ocrmypdf import ocrmypdf
+from ocrmypdf.filters import invert, whiteout
+from ocrmypdf._filters import load_filter
+
+
+os_environ = pytest.helpers.os_environ  # NOTE(review): unused in the visible tests
+check_ocrmypdf = pytest.helpers.check_ocrmypdf  # used by test_filter_from_cmdline
+
+
+def filter_42():  # loaded by name in test_pyfile via f'{__file__}::filter_42'
+    return 42
+
+
+def test_pyfile():
+    """A 'file.py::name' spec resolves to the function defined in that file."""
+    assert load_filter(f'{__file__}::filter_42')() == 42
+
+
+def test_pyfile_notexist():
+    with pytest.raises(FileNotFoundError):  # missing file: error is not wrapped
+        load_filter('thisfile.doesnot.exist.py::filter_42')
+
+
+def test_pyfile_noobject():
+    with pytest.raises(AttributeError):  # file loads, but the name is absent
+        load_filter(f'{__file__}::no_function_with_this_name')
+
+
+def test_module():
+    """A dotted 'module.name' spec resolves to that attribute of the module."""
+    assert load_filter('os.getuid')() == os.getuid()
+
+
+def test_module_notexist():
+    with pytest.raises(ModuleNotFoundError):  # import failure is not wrapped
+        load_filter('thismodule.doesnot.exist')
+
+
+def test_filter_from_cmdline(resources, outdir):
+    """--filter-ocr-image accepts a 'file.py::name' spec on the command line."""
+    plugin = outdir / 'temp.py'
+    sidecar = outdir / 'sidecar.txt'
+    plugin.write_text(
+        "from PIL import Image\n"
+        "def whiteout(im):\n"
+        "    return Image.new(im.mode, im.size)\n"
+    )
+
+    source = resources / 'crom.png'
+    target = outdir / 'out.pdf'
+    dpi_args = ['--image-dpi', '100']
+    sidecar_args = ['--sidecar', sidecar]
+    filter_args = ['--filter-ocr-image', f"{plugin}::whiteout"]
+    check_ocrmypdf(source, target, *dpi_args, *sidecar_args, *filter_args)
+
+    # The plugin replaces the page with a new empty image: expect no text.
+    assert sidecar.read_text().strip() == ''
+
+
+def test_filter_from_api(resources, outdir):
+    """A callable passed as filter_ocr_image must leave the sidecar empty."""
+    source = resources / 'crom.png'
+    target = outdir / 'out.pdf'
+    sidecar = outdir / 'sidecar.txt'
+
+    ocrmypdf(source, target, image_dpi=100, sidecar=sidecar, filter_ocr_image=whiteout)
+
+    assert sidecar.read_text().strip() == ''