mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-06 05:36:29 -04:00
Introduce plugins/filters
This commit is contained in:
82
src/ocrmypdf/_filters.py
Normal file
82
src/ocrmypdf/_filters.py
Normal file
@@ -0,0 +1,82 @@
|
||||
# © 2019 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import logging
|
||||
import importlib
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
log = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _load_function_from_module(location):
    """Load an object given its dotted module location.

    For ``location="a.b.c"`` this effectively runs ``from a.b import c`` and
    returns ``c``.

    Example:
        _load_function_from_module("a.b.c")

    Args:
        location: Dotted path. The last component is the attribute to fetch;
            everything before it names the module to import.

    Returns:
        The attribute named by the last component of ``location``.

    Raises:
        ValueError: If ``location`` has no module part (for example ``"c"``).
        ImportError: If the module part cannot be imported (typically
            ``ModuleNotFoundError``).
        AttributeError: If the module has no attribute with that name.
    """
    module_name, _, object_name = location.rpartition('.')
    if not module_name:
        # importlib would fail here with a bare "Empty module name"; report
        # what the caller actually got wrong instead.
        raise ValueError(
            f"Filter location {location!r} must have the form "
            "'package.module.function'"
        )
    module = importlib.import_module(module_name)
    fn = getattr(module, object_name)
    log.debug("Loaded function: from %s import %s", module_name, object_name)
    return fn
|
||||
|
||||
|
||||
def _load_function_from_pyfile(location):
    """Load an object from a Python source file.

    The file is executed as a fresh module (its top-level code runs) and the
    named attribute is then looked up on that module.

    Example:
        _load_function_from_pyfile("test.py::blur_filter")

    Args:
        location: String of the form ``"<path to file>::<object name>"``.

    Returns:
        The attribute ``<object name>`` of the module loaded from the file.

    Raises:
        ValueError: If ``location`` does not contain ``"::"``.
        ImportError: If no import spec/loader can be created for the file
            (for example, the file name has no recognized Python suffix).
        FileNotFoundError: If the file does not exist.
        AttributeError: If the loaded module has no such attribute.
    """
    # ``import importlib`` alone does not guarantee that the ``util``
    # submodule has been loaded, so import it explicitly before use.
    import importlib.util

    filename, object_name = location.split('::', maxsplit=1)
    log.debug("Loading function %s from %s", object_name, filename)

    # Name the module after the file, minus the ``.py`` suffix if present.
    module_name = os.path.basename(filename)
    if module_name.endswith('.py'):
        module_name = module_name[:-3]

    spec = importlib.util.spec_from_file_location(module_name, filename)
    if spec is None or spec.loader is None:
        # spec_from_file_location returns None when it cannot pick a loader
        # for the file; fail with a clear message instead of an
        # AttributeError on None.
        raise ImportError(f"Cannot load {filename!r} as a Python source file")
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return getattr(module, object_name)
|
||||
|
||||
|
||||
def load_filter(filt):
    """Resolve a filter specification to a callable.

    Args:
        filt: Either a callable, which is returned unchanged, or a string
            locating one. A string containing ``"::"`` is treated as
            ``"<python file>::<object name>"``; any other string is treated
            as a dotted ``"package.module.object"`` location.

    Returns:
        The callable itself, or the object loaded from the given location.

    Raises:
        TypeError: If ``filt`` is neither callable nor a string.
    """
    if callable(filt):
        return filt

    if not isinstance(filt, str):
        raise TypeError(
            "filter must be a callable or a string location, "
            f"not {type(filt).__name__}"
        )

    if '::' in filt:
        return _load_function_from_pyfile(filt)
    return _load_function_from_module(filt)
|
||||
|
||||
|
||||
def check_filter_loadable(filt):
    """Validate that a filter specification can be loaded; return it unchanged.

    Intended for use as an argparse ``type=`` converter. argparse only turns
    ``ArgumentTypeError``, ``TypeError`` and ``ValueError`` into a usage
    error, so import and file errors are converted here rather than escaping
    as a traceback.

    Args:
        filt: A callable or a string location accepted by ``load_filter``.

    Returns:
        ``filt`` itself (not the loaded object); this function only checks
        that loading succeeds.

    Raises:
        argparse.ArgumentTypeError: If the filter cannot be imported or
            found, or the loaded object is not callable.
    """
    import argparse

    try:
        loaded = load_filter(filt)
    except (ImportError, AttributeError, OSError) as err:
        raise argparse.ArgumentTypeError(
            f"could not load filter {filt!r}: {err}"
        ) from err
    if not callable(loaded):
        raise argparse.ArgumentTypeError(f"filter {filt!r} is not callable")
    return filt
|
||||
@@ -28,6 +28,8 @@ import pikepdf
|
||||
from pikepdf.models.metadata import encode_pdf_date
|
||||
|
||||
from . import PROGRAM_NAME, VERSION, leptonica
|
||||
|
||||
from ._filters import load_filter
|
||||
from .exceptions import (
|
||||
DpiError,
|
||||
EncryptedPdfError,
|
||||
@@ -522,7 +524,8 @@ def create_ocr_image(image, page_context):
|
||||
im = pix.topil()
|
||||
|
||||
if options.filter_ocr_image:
|
||||
im = options.filter_ocr_image(im)
|
||||
filt = load_filter(options.filter_ocr_image)
|
||||
im = filt(im)
|
||||
|
||||
del draw
|
||||
# Pillow requires integer DPI
|
||||
|
||||
@@ -118,7 +118,7 @@ def create_options(*, input_file, output_file, **kwargs):
|
||||
for arg, val in kwargs.items():
|
||||
if val is None:
|
||||
continue
|
||||
if arg.startswith('filter') and callable(val):
|
||||
if arg.startswith('filter') and (callable(val) or isinstance(val, str)):
|
||||
filters.append((arg, val))
|
||||
continue
|
||||
cmd_style_arg = arg.replace('_', '-')
|
||||
@@ -188,7 +188,7 @@ def ocrmypdf( # pylint: disable=unused-argument
|
||||
user_patterns=None,
|
||||
keep_temporary_files=None,
|
||||
progress_bar=None,
|
||||
process_ocr_image=None,
|
||||
filter_ocr_image=None,
|
||||
):
|
||||
"""Run OCRmyPDF on one PDF or image.
|
||||
|
||||
|
||||
@@ -18,6 +18,7 @@
|
||||
import argparse
|
||||
|
||||
from . import PROGRAM_NAME, VERSION
|
||||
from ._filters import check_filter_loadable
|
||||
|
||||
|
||||
def numeric(basetype, min_=None, max_=None):
|
||||
@@ -466,9 +467,10 @@ advanced.add_argument(
|
||||
help="Specify the location of the Tesseract user patterns file.",
|
||||
)
|
||||
|
||||
|
||||
filters = parser.add_argument_group("Filters", argparse.SUPPRESS)
|
||||
filters.add_argument('--filter-ocr-image', help=argparse.SUPPRESS)
|
||||
filters.add_argument(
|
||||
'--filter-ocr-image', help=argparse.SUPPRESS, type=check_filter_loadable
|
||||
)
|
||||
|
||||
debugging = parser.add_argument_group(
|
||||
"Debugging", "Arguments to help with troubleshooting and debugging"
|
||||
|
||||
10
src/ocrmypdf/filters.py
Normal file
10
src/ocrmypdf/filters.py
Normal file
@@ -0,0 +1,10 @@
|
||||
from PIL import Image
|
||||
import PIL.ImageOps
|
||||
|
||||
|
||||
def invert(im):
    """Return the inverse of *im* after converting it to mode ``'L'``."""
    grayscale = im.convert('L')
    return PIL.ImageOps.invert(grayscale)
|
||||
|
||||
|
||||
def whiteout(im):
    """Return a new blank image with the same mode and size as *im*.

    None of the pixel content of *im* is carried over.
    """
    # NOTE(review): no fill color is passed, so Pillow's default fill is
    # used; confirm that matches the "whiteout" name.
    mode, size = im.mode, im.size
    return Image.new(mode, size)
|
||||
91
tests/test_filters.py
Normal file
91
tests/test_filters.py
Normal file
@@ -0,0 +1,91 @@
|
||||
# © 2019 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import os
|
||||
|
||||
from PIL import Image
|
||||
import pytest
|
||||
|
||||
|
||||
from ocrmypdf import ocrmypdf
|
||||
from ocrmypdf.filters import invert, whiteout
|
||||
from ocrmypdf._filters import load_filter
|
||||
|
||||
|
||||
os_environ = pytest.helpers.os_environ
|
||||
check_ocrmypdf = pytest.helpers.check_ocrmypdf
|
||||
|
||||
|
||||
def filter_42():
    """Return 42; a trivial target for the loader tests in this file."""
    answer = 42
    return answer
|
||||
|
||||
|
||||
def test_pyfile():
    """A ``path::name`` location loads the named function from that file."""
    location = f'{__file__}::filter_42'
    loaded = load_filter(location)
    assert loaded() == 42
|
||||
|
||||
|
||||
def test_pyfile_notexist():
    """Loading from a file that does not exist raises FileNotFoundError."""
    missing_location = 'thisfile.doesnot.exist.py::filter_42'
    with pytest.raises(FileNotFoundError):
        load_filter(missing_location)
|
||||
|
||||
|
||||
def test_pyfile_noobject():
    """Asking an existing file for an unknown name raises AttributeError."""
    bad_location = f'{__file__}::no_function_with_this_name'
    with pytest.raises(AttributeError):
        load_filter(bad_location)
|
||||
|
||||
|
||||
def test_module():
    """A dotted ``module.name`` location loads that attribute of the module."""
    # os.getpid is available on every platform, unlike os.getuid (POSIX only),
    # so the test does not fail for reasons unrelated to the loader.
    loaded = load_filter('os.getpid')
    assert loaded() == os.getpid()
|
||||
|
||||
|
||||
def test_module_notexist():
    """A dotted location in a nonexistent module raises ModuleNotFoundError."""
    missing_location = 'thismodule.doesnot.exist'
    with pytest.raises(ModuleNotFoundError):
        load_filter(missing_location)
|
||||
|
||||
|
||||
def test_filter_from_cmdline(resources, outdir):
    """A filter given on the command line as ``file::name`` is applied.

    The filter swaps the OCR image for a blank one, so the sidecar text
    produced by the run must be empty.
    """
    plugin = outdir / 'temp.py'
    sidecar = outdir / 'sidecar.txt'

    plugin.write_text(
        "from PIL import Image\n"
        "def whiteout(im):\n"
        " return Image.new(im.mode, im.size)\n"
    )

    check_ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        '--image-dpi',
        '100',
        '--sidecar',
        sidecar,
        '--filter-ocr-image',
        f"{plugin}::whiteout",
    )

    assert sidecar.read_text().strip() == ''
|
||||
|
||||
|
||||
def test_filter_from_api(resources, outdir):
    """A callable passed as ``filter_ocr_image`` through the API is applied.

    With the ``whiteout`` filter the sidecar text produced by the run must
    be empty.
    """
    sidecar = outdir / 'sidecar.txt'
    ocrmypdf(
        resources / 'crom.png',
        outdir / 'out.pdf',
        image_dpi=100,
        sidecar=sidecar,
        filter_ocr_image=whiteout,
    )
    assert sidecar.read_text().strip() == ''
|
||||
Reference in New Issue
Block a user