Introduce plugins/filters

This commit is contained in:
James R. Barlow
2019-05-27 16:55:04 -07:00
parent 5c4c32ab3c
commit 7566d4b768
6 changed files with 193 additions and 5 deletions

82
src/ocrmypdf/_filters.py Normal file
View File

@@ -0,0 +1,82 @@
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import logging
import importlib
import os
import sys
# Module-level logger, named after this module; used by the loader helpers
# below to emit debug messages about what is being loaded.
log = logging.getLogger(__name__)
def _load_function_from_module(location):
"""Load a function given a module location
For location=a.b.c, will effectively run "from a.b import c"
Example:
_load_function_from_module("a.b.c")
"""
module_parts = location.split('.')
module_name = '.'.join(module_parts[:-1])
object_name = module_parts[-1]
module = importlib.import_module(module_name)
fn = getattr(module, object_name)
log.debug(f"Loaded function: from {module_name} import {object_name}")
return fn
def _load_function_from_pyfile(location):
"""Load a function from a file
Example:
_load_function_from_pyfile("test.py::blur_filter")
"""
filename, object_name = location.split('::', maxsplit=1)
log.debug(f"Loading function {object_name} from {filename}")
module_name = os.path.basename(filename)
if module_name.endswith('.py'):
module_name = module_name[:-3]
spec = importlib.util.spec_from_file_location(module_name, filename)
module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(module)
fn = getattr(module, object_name)
return fn
def load_filter(filt):
    """Resolve a filter specification to the object it refers to.

    Args:
        filt: Either a callable, which is returned as is, or a string
            naming the filter. A string containing ``::`` is treated as
            ``"<path to file>::<object name>"`` and loaded from that file;
            any other string is treated as a dotted import path such as
            ``"package.module.function"``.

    Returns:
        The callable that was passed in, or the object loaded from the
        string location. A loaded object is not checked for callability.

    Raises:
        TypeError: if ``filt`` is neither callable nor a string.

    Errors raised while importing or executing the target propagate
    unchanged.
    """
    if callable(filt):
        return filt
    if not isinstance(filt, str):
        raise TypeError(
            f"filter must be a callable or a str, not {type(filt).__name__}"
        )
    if '::' in filt:
        return _load_function_from_pyfile(filt)
    return _load_function_from_module(filt)
def check_filter_loadable(filt):
    """Verify that *filt* can be loaded, then hand it back untouched.

    The filter is loaded once purely for validation; the loaded object is
    thrown away and the original argument is returned. Whatever
    ``load_filter`` raises propagates to the caller.
    """
    _ = load_filter(filt)  # only the success or failure of loading matters
    return filt

View File

@@ -28,6 +28,8 @@ import pikepdf
from pikepdf.models.metadata import encode_pdf_date
from . import PROGRAM_NAME, VERSION, leptonica
from ._filters import load_filter
from .exceptions import (
DpiError,
EncryptedPdfError,
@@ -522,7 +524,8 @@ def create_ocr_image(image, page_context):
im = pix.topil()
if options.filter_ocr_image:
im = options.filter_ocr_image(im)
filt = load_filter(options.filter_ocr_image)
im = filt(im)
del draw
# Pillow requires integer DPI

View File

@@ -118,7 +118,7 @@ def create_options(*, input_file, output_file, **kwargs):
for arg, val in kwargs.items():
if val is None:
continue
if arg.startswith('filter') and callable(val):
if arg.startswith('filter') and (callable(val) or isinstance(val, str)):
filters.append((arg, val))
continue
cmd_style_arg = arg.replace('_', '-')
@@ -188,7 +188,7 @@ def ocrmypdf( # pylint: disable=unused-argument
user_patterns=None,
keep_temporary_files=None,
progress_bar=None,
process_ocr_image=None,
filter_ocr_image=None,
):
"""Run OCRmyPDF on one PDF or image.

View File

@@ -18,6 +18,7 @@
import argparse
from . import PROGRAM_NAME, VERSION
from ._filters import check_filter_loadable
def numeric(basetype, min_=None, max_=None):
@@ -466,9 +467,10 @@ advanced.add_argument(
help="Specify the location of the Tesseract user patterns file.",
)
filters = parser.add_argument_group("Filters", argparse.SUPPRESS)
filters.add_argument('--filter-ocr-image', help=argparse.SUPPRESS)
filters.add_argument(
'--filter-ocr-image', help=argparse.SUPPRESS, type=check_filter_loadable
)
debugging = parser.add_argument_group(
"Debugging", "Arguments to help with troubleshooting and debugging"

10
src/ocrmypdf/filters.py Normal file
View File

@@ -0,0 +1,10 @@
from PIL import Image
import PIL.ImageOps
def invert(im):
    """Return an inverted copy of *im* after converting it to mode ``'L'``."""
    as_mode_l = im.convert('L')
    return PIL.ImageOps.invert(as_mode_l)
def whiteout(im):
    """Return a blank white image with the same mode and size as *im*.

    The fill colour is given explicitly because ``Image.new`` fills with
    black when no colour is passed, which contradicts this filter's name.
    """
    return Image.new(im.mode, im.size, 'white')

91
tests/test_filters.py Normal file
View File

@@ -0,0 +1,91 @@
# © 2019 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import os
from PIL import Image
import pytest
from ocrmypdf import ocrmypdf
from ocrmypdf.filters import invert, whiteout
from ocrmypdf._filters import load_filter
# Short aliases for helpers exposed on ``pytest.helpers``. ``check_ocrmypdf``
# is used by the command-line test below; ``os_environ`` is not referenced
# in the tests visible in this file.
os_environ = pytest.helpers.os_environ
check_ocrmypdf = pytest.helpers.check_ocrmypdf
def filter_42():
    """Return the constant 42; a trivial target for the loader tests."""
    answer = 42
    return answer
def test_pyfile():
    """Load ``filter_42`` from this very file via the ``path::name`` form."""
    location = f'{__file__}::filter_42'
    loaded = load_filter(location)
    assert loaded() == 42
def test_pyfile_notexist():
    """A ``path::name`` location whose file is missing raises FileNotFoundError."""
    missing_location = 'thisfile.doesnot.exist.py::filter_42'
    with pytest.raises(FileNotFoundError):
        load_filter(missing_location)
def test_pyfile_noobject():
    """Requesting a name this file does not define raises AttributeError."""
    location = f'{__file__}::no_function_with_this_name'
    with pytest.raises(AttributeError):
        load_filter(location)
def test_module():
    """A dotted path is resolved by importing the module and fetching the name.

    ``os.getuid`` serves as the target: the loaded object must return the
    same value as calling ``os.getuid`` directly.
    """
    # Plain string literal: the original used an f-string with no
    # placeholders, which added nothing.
    obj = load_filter('os.getuid')
    assert obj() == os.getuid()
def test_module_notexist():
    """A dotted path naming a module that cannot be imported raises ModuleNotFoundError."""
    missing_location = 'thismodule.doesnot.exist'
    with pytest.raises(ModuleNotFoundError):
        load_filter(missing_location)
def test_filter_from_cmdline(resources, outdir):
    """Run the command line with ``--filter-ocr-image`` naming a generated file.

    A small filter module is written into ``outdir``; its ``whiteout``
    function replaces the image with a new one of the same mode and size.
    The test then asserts that the sidecar text file comes out empty.
    """
    filter_source = (
        "from PIL import Image\n"
        "def whiteout(im):\n"
        " return Image.new(im.mode, im.size)\n"
    )
    (outdir / 'temp.py').write_text(filter_source)

    sidecar = outdir / 'sidecar.txt'
    check_ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--sidecar',
        sidecar,
        '--filter-ocr-image',
        f"{outdir / 'temp.py'}::whiteout",
    )

    assert sidecar.read_text().strip() == ''
def test_filter_from_api(resources, outdir):
    """Pass the ``whiteout`` callable straight to the Python API.

    The filter is supplied as ``filter_ocr_image``; the test asserts that
    the sidecar text file comes out empty.
    """
    sidecar = outdir / 'sidecar.txt'
    ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        image_dpi=100,
        sidecar=sidecar,
        filter_ocr_image=whiteout,
    )
    assert sidecar.read_text().strip() == ''