Support plugin invocation with API

This commit is contained in:
James R. Barlow
2020-05-02 03:34:31 -07:00
parent 8c9a8fc85c
commit e02f6c1e97
12 changed files with 472 additions and 429 deletions

View File

@@ -26,7 +26,7 @@ from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._sync import run_pipeline
from ocrmypdf._validation import check_closed_streams, check_options
from ocrmypdf.api import Verbosity, configure_logging
from ocrmypdf.cli import parser, plugins_only_parser
from ocrmypdf.cli import get_parser, plugins_only_parser
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError
log = logging.getLogger('ocrmypdf')
@@ -34,9 +34,10 @@ log = logging.getLogger('ocrmypdf')
def run(args=None):
pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
if pre_options.plugins:
pm = get_plugin_manager(pre_options)
pm.hook.install_cli(parser=parser)
plugin_manager = get_plugin_manager(pre_options.plugins)
parser = get_parser()
plugin_manager.hook.install_cli(parser=parser)
options = parser.parse_args(args=args)
@@ -68,7 +69,7 @@ def run(args=None):
log.error(e)
return ExitCode.missing_dependency
result = run_pipeline(options=options)
result = run_pipeline(options=options, plugin_manager=plugin_manager)
return result

View File

@@ -69,13 +69,19 @@ class PageContext:
def __getstate__(self):
    """Return a copy of the instance state for pickling.

    When ``plugin_manager`` is None the copied state is returned as is.
    Otherwise the plugin manager is removed from the copy and a
    ``construct_plugin_manager`` callable, bound to
    ``self.options.plugins``, is stored in its place so that the
    receiving side can build a new manager.
    """
    snapshot = dict(self.__dict__)
    if snapshot['plugin_manager'] is None:
        return snapshot
    # Swap the plugin manager object for a recipe that rebuilds it.
    snapshot.pop('plugin_manager')
    snapshot['construct_plugin_manager'] = partial(
        get_plugin_manager, self.options.plugins
    )
    return snapshot
def __setstate__(self, state):
    """Restore instance state from a pickled ``state`` dict.

    If ``state`` carries a ``construct_plugin_manager`` callable, it is
    called to build ``self.plugin_manager`` and the callable is dropped
    from the instance. If the key is absent, ``self.plugin_manager`` is
    set to None.

    Args:
        state: Mapping of attribute names to values, as produced by
            ``__getstate__``.
    """
    self.__dict__.update(state)
    # pop() with a default removes the helper only when it is present;
    # an unconditional ``del`` raised KeyError for contexts that were
    # pickled without a plugin manager.
    make_plugin_manager = self.__dict__.pop('construct_plugin_manager', None)
    if make_plugin_manager is None:
        self.plugin_manager = None
    else:
        self.plugin_manager = make_plugin_manager()

View File

@@ -522,9 +522,11 @@ def create_ocr_image(image, page_context):
del draw
im = page_context.plugin_manager.hook.filter_ocr_image(
filter_im = page_context.plugin_manager.hook.filter_ocr_image(
page=page_context, image=im
)
if filter_im is not None:
im = filter_im
# Pillow requires integer DPI
dpi = tuple(round(coord) for coord in im.info['dpi'])

View File

@@ -16,17 +16,17 @@
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
import importlib
from typing import List
import pluggy
from ocrmypdf import pluginspec
def get_plugin_manager(plugins: List[str]):
    """Create a pluggy plugin manager and register the named plugin modules.

    Args:
        plugins: Module names; each one is imported with
            ``importlib.import_module`` and registered, in the order given.

    Returns:
        The ``pluggy.PluginManager`` created for the ``'ocrmypdf'`` project
        name, with the ``pluginspec`` hook specifications added.
    """
    manager = pluggy.PluginManager('ocrmypdf')
    manager.add_hookspecs(pluginspec)
    for module_name in plugins:
        manager.register(importlib.import_module(module_name))
    return manager

View File

@@ -293,12 +293,14 @@ def configure_debug_logging(log_filename, prefix=''):
return log_file_handler
def run_pipeline(options, api=False):
def run_pipeline(options, *, plugin_manager, api=False):
# Any changes to options will not take effect for options that are already
# bound to function parameters in the pipeline. (For example
# options.input_file, options.pdf_renderer are already bound.)
if not options.jobs:
options.jobs = available_cpu_count()
if not plugin_manager:
plugin_manager = get_plugin_manager([])
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
debug_log_handler = None
@@ -307,7 +309,6 @@ def run_pipeline(options, api=False):
):
debug_log_handler = configure_debug_logging(Path(work_folder) / "debug.log")
pm = get_plugin_manager(options)
try:
check_requested_output_file(options)
start_input_file, original_filename = create_input_file(options, work_folder)
@@ -320,7 +321,7 @@ def run_pipeline(options, api=False):
options,
)
pm.hook.prepare(options=options)
plugin_manager.hook.prepare(options=options)
# Gather pdfinfo and create context
pdfinfo = get_pdfinfo(
@@ -329,7 +330,7 @@ def run_pipeline(options, api=False):
max_workers=options.jobs if not options.use_threads else 1, # To help debug
)
context = PDFContext(options, work_folder, origin_pdf, pdfinfo, pm)
context = PDFContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
# Validate options are okay for this pdf
validate_pdfinfo_options(context)

View File

@@ -18,15 +18,17 @@
import logging
import os
import sys
from argparse import ArgumentParser
from contextlib import suppress
from enum import IntEnum
from pathlib import Path
from typing import Dict, Iterable
from ._logging import PageNumberFilter, TqdmConsole
from ._sync import run_pipeline
from ._validation import check_options
from .cli import parser
from ocrmypdf._logging import PageNumberFilter, TqdmConsole
from ocrmypdf._plugin_manager import get_plugin_manager
from ocrmypdf._sync import run_pipeline
from ocrmypdf._validation import check_options
from ocrmypdf.cli import get_parser, plugins_only_parser
try:
import coloredlogs
@@ -125,7 +127,13 @@ def configure_logging(
return log
def create_options(*, input_file: os.PathLike, output_file: os.PathLike, **kwargs):
def create_options(
*,
input_file: os.PathLike,
output_file: os.PathLike,
parser: ArgumentParser,
**kwargs,
):
cmdline = []
deferred = []
@@ -223,9 +231,11 @@ def ocr( # pylint: disable=unused-argument
user_words: os.PathLike = None,
user_patterns: os.PathLike = None,
fast_web_view: float = None,
plugins: Iterable[str] = None,
keep_temporary_files: bool = None,
progress_bar: bool = None,
tesseract_env: Dict[str, str] = None,
**kwargs,
):
"""Run OCRmyPDF on one PDF or image.
@@ -260,7 +270,15 @@ def ocr( # pylint: disable=unused-argument
Returns:
:class:`ocrmypdf.ExitCode`
"""
if not plugins:
plugins = []
options = create_options(**locals())
parser = get_parser()
_plugin_manager = get_plugin_manager(plugins)
_plugin_manager.hook.install_cli(parser=parser)
options = create_options(
**{k: v for k, v in locals().items() if not k.startswith('_')}
)
check_options(options)
return run_pipeline(options, api=True)
return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)

View File

@@ -17,10 +17,8 @@
import argparse
from ._version import PROGRAM_NAME as _PROGRAM_NAME
from ._version import __version__ as _VERSION
__all__ = ['parser']
from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME
from ocrmypdf._version import __version__ as _VERSION
def numeric(basetype, min_=None, max_=None):
@@ -56,18 +54,19 @@ class ArgumentParser(argparse.ArgumentParser):
raise ValueError(message)
parser = ArgumentParser(
prog=_PROGRAM_NAME,
fromfile_prefix_chars='@',
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""\
def get_parser():
parser = ArgumentParser(
prog=_PROGRAM_NAME,
fromfile_prefix_chars='@',
formatter_class=argparse.RawDescriptionHelpFormatter,
description="""\
Generates a searchable PDF or PDF/A from a regular PDF.
OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
rotation and performs image processing, runs the Tesseract OCR engine on the
image, and then creates a PDF from the OCR information.
""",
epilog="""\
epilog="""\
OCRmyPDF attempts to keep the output file at about the same size. If a file
contains losslessly compressed images, and output file will be losslessly
compressed as well.
@@ -108,395 +107,409 @@ Online documentation is located at:
https://ocrmypdf.readthedocs.io/en/latest/introduction.html
""",
)
)
parser.add_argument(
'input_file',
metavar="input_pdf_or_image",
help="PDF file containing the images to be OCRed (or '-' to read from "
"standard input)",
)
parser.add_argument(
'output_file',
metavar="output_pdf",
help="Output searchable PDF file (or '-' to write to standard output). "
"Existing files will be ovewritten. If same as input file, the "
"input file will be updated only if processing is successful.",
)
parser.add_argument(
'-l',
'--language',
action='append',
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
"all language packs installed in your system). Use -l eng+deu for "
"multiple languages.",
)
parser.add_argument(
'--image-dpi',
metavar='DPI',
type=int,
help="For input image instead of PDF, use this DPI instead of file's.",
)
parser.add_argument(
'--output-type',
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible. 'pdf-a1' creates a "
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
"PDF/A3-b file.",
)
parser.add_argument(
'input_file',
metavar="input_pdf_or_image",
help="PDF file containing the images to be OCRed (or '-' to read from "
"standard input)",
)
parser.add_argument(
'output_file',
metavar="output_pdf",
help="Output searchable PDF file (or '-' to write to standard output). "
"Existing files will be ovewritten. If same as input file, the "
"input file will be updated only if processing is successful.",
)
parser.add_argument(
'-l',
'--language',
action='append',
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
"all language packs installed in your system). Use -l eng+deu for "
"multiple languages.",
)
parser.add_argument(
'--image-dpi',
metavar='DPI',
type=int,
help="For input image instead of PDF, use this DPI instead of file's.",
)
parser.add_argument(
'--output-type',
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
default='pdfa',
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
"long term archiving (default, recommended) but may not suitable "
"for users who want their file altered as little as possible. 'pdfa' "
"also has problems with full Unicode text. 'pdf' attempts to "
"preserve file contents as much as possible. 'pdf-a1' creates a "
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
"PDF/A3-b file.",
)
# Use null string '\0' as sentinel to indicate the user supplied no argument,
# since that is the only invalid character for filepaths on all platforms
# bool('\0') is True in Python
parser.add_argument(
'--sidecar',
nargs='?',
const='\0',
default=None,
metavar='FILE',
help="Generate sidecar text files that contain the same text recognized "
"by Tesseract. This may be useful for building a OCR text database. "
"If FILE is omitted, the sidecar file be named {output_file}.txt "
"If FILE is set to '-', the sidecar is written to stdout (a "
"convenient way to preview OCR quality). The output file and sidecar "
"may not both use stdout at the same time.",
)
# Use the NUL string '\0' as the sentinel meaning "--sidecar was given without
# a FILE argument", since NUL is the only character that is invalid in file
# paths on all platforms. Note that bool('\0') is True in Python.
parser.add_argument(
'--sidecar',
nargs='?',
const='\0',
default=None,
metavar='FILE',
help="Generate sidecar text files that contain the same text recognized "
"by Tesseract. This may be useful for building a OCR text database. "
"If FILE is omitted, the sidecar file be named {output_file}.txt "
"If FILE is set to '-', the sidecar is written to stdout (a "
"convenient way to preview OCR quality). The output file and sidecar "
"may not both use stdout at the same time.",
)
parser.add_argument(
'--version',
action='version',
version=_VERSION,
help="Print program version and exit",
)
parser.add_argument(
'--version',
action='version',
version=_VERSION,
help="Print program version and exit",
)
jobcontrol = parser.add_argument_group("Job control options")
jobcontrol.add_argument(
'-j',
'--jobs',
metavar='N',
type=numeric(int, 0, 256),
help="Use up to N CPU cores simultaneously (default: use all).",
)
jobcontrol.add_argument(
'-q', '--quiet', action='store_true', help="Suppress INFO messages"
)
jobcontrol.add_argument(
'-v',
'--verbose',
type=numeric(int, 0, 2),
default=0,
const=1,
nargs='?',
help="Print more verbose messages for each additional verbose level. Use "
"`-v 1` typically for much more detailed logging. Higher numbers "
"are probably only useful in debugging.",
)
jobcontrol.add_argument(
'--no-progress-bar',
action='store_false',
dest='progress_bar',
help=argparse.SUPPRESS,
)
jobcontrol.add_argument('--use-threads', action='store_true', help=argparse.SUPPRESS)
jobcontrol = parser.add_argument_group("Job control options")
jobcontrol.add_argument(
'-j',
'--jobs',
metavar='N',
type=numeric(int, 0, 256),
help="Use up to N CPU cores simultaneously (default: use all).",
)
jobcontrol.add_argument(
'-q', '--quiet', action='store_true', help="Suppress INFO messages"
)
jobcontrol.add_argument(
'-v',
'--verbose',
type=numeric(int, 0, 2),
default=0,
const=1,
nargs='?',
help="Print more verbose messages for each additional verbose level. Use "
"`-v 1` typically for much more detailed logging. Higher numbers "
"are probably only useful in debugging.",
)
jobcontrol.add_argument(
'--no-progress-bar',
action='store_false',
dest='progress_bar',
help=argparse.SUPPRESS,
)
jobcontrol.add_argument(
'--use-threads', action='store_true', help=argparse.SUPPRESS
)
metadata = parser.add_argument_group(
"Metadata options",
"Set output PDF/A metadata (default: copy input document's metadata)",
)
metadata.add_argument(
'--title', type=str, help="Set document title (place multiple words in quotes)"
)
metadata.add_argument('--author', type=str, help="Set document author")
metadata.add_argument('--subject', type=str, help="Set document subject description")
metadata.add_argument('--keywords', type=str, help="Set document keywords")
metadata = parser.add_argument_group(
"Metadata options",
"Set output PDF/A metadata (default: copy input document's metadata)",
)
metadata.add_argument(
'--title', type=str, help="Set document title (place multiple words in quotes)"
)
metadata.add_argument('--author', type=str, help="Set document author")
metadata.add_argument(
'--subject', type=str, help="Set document subject description"
)
metadata.add_argument('--keywords', type=str, help="Set document keywords")
preprocessing = parser.add_argument_group(
"Image preprocessing options",
"Options to improve the quality of the final PDF and OCR",
)
preprocessing.add_argument(
'-r',
'--rotate-pages',
action='store_true',
help="Automatically rotate pages based on detected text orientation",
)
preprocessing.add_argument(
'--remove-background',
action='store_true',
help="Attempt to remove background from gray or color pages, setting it "
"to white ",
)
preprocessing.add_argument(
'-d', '--deskew', action='store_true', help="Deskew each page before performing OCR"
)
preprocessing.add_argument(
'-c',
'--clean',
action='store_true',
help="Clean pages from scanning artifacts before performing OCR, and send "
"the cleaned page to OCR, but do not include the cleaned page in "
"the output",
)
preprocessing.add_argument(
'-i',
'--clean-final',
action='store_true',
help="Clean page as above, and incorporate the cleaned image in the final "
"PDF. Might remove desired content.",
)
preprocessing.add_argument(
'--unpaper-args',
type=str,
default=None,
help="A quoted string of arguments to pass to unpaper. Requires --clean. "
"Example: --unpaper-args '--layout double'.",
)
preprocessing.add_argument(
'--oversample',
metavar='DPI',
type=numeric(int, 0, 5000),
default=0,
help="Oversample images to at least the specified DPI, to improve OCR "
"results slightly",
)
preprocessing.add_argument(
'--remove-vectors',
action='store_true',
help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
"will not be included in OCR. This can eliminate false characters.",
)
preprocessing.add_argument(
'--threshold',
action='store_true',
help="EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract for OCR. Can "
"improve OCR quality compared to Tesseract's thresholder.",
)
preprocessing = parser.add_argument_group(
"Image preprocessing options",
"Options to improve the quality of the final PDF and OCR",
)
preprocessing.add_argument(
'-r',
'--rotate-pages',
action='store_true',
help="Automatically rotate pages based on detected text orientation",
)
preprocessing.add_argument(
'--remove-background',
action='store_true',
help="Attempt to remove background from gray or color pages, setting it "
"to white ",
)
preprocessing.add_argument(
'-d',
'--deskew',
action='store_true',
help="Deskew each page before performing OCR",
)
preprocessing.add_argument(
'-c',
'--clean',
action='store_true',
help="Clean pages from scanning artifacts before performing OCR, and send "
"the cleaned page to OCR, but do not include the cleaned page in "
"the output",
)
preprocessing.add_argument(
'-i',
'--clean-final',
action='store_true',
help="Clean page as above, and incorporate the cleaned image in the final "
"PDF. Might remove desired content.",
)
preprocessing.add_argument(
'--unpaper-args',
type=str,
default=None,
help="A quoted string of arguments to pass to unpaper. Requires --clean. "
"Example: --unpaper-args '--layout double'.",
)
preprocessing.add_argument(
'--oversample',
metavar='DPI',
type=numeric(int, 0, 5000),
default=0,
help="Oversample images to at least the specified DPI, to improve OCR "
"results slightly",
)
preprocessing.add_argument(
'--remove-vectors',
action='store_true',
help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
"will not be included in OCR. This can eliminate false characters.",
)
preprocessing.add_argument(
'--threshold',
action='store_true',
help=(
"EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract "
"for OCR. Can improve OCR quality compared to Tesseract's thresholder."
),
)
ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
ocrsettings.add_argument(
'-f',
'--force-ocr',
action='store_true',
help="Rasterize any text or vector objects on each page, apply OCR, and "
"save the rastered output (this rewrites the PDF)",
)
ocrsettings.add_argument(
'-s',
'--skip-text',
action='store_true',
help="Skip OCR on any pages that already contain text, but include the "
"page in final output; useful for PDFs that contain a mix of "
"images, text pages, and/or previously OCRed pages",
)
ocrsettings.add_argument(
'--redo-ocr',
action='store_true',
help="Attempt to detect and remove the hidden OCR layer from files that "
"were previously OCRed with OCRmyPDF or another program. Apply OCR "
"to text found in raster images. Existing visible text objects will "
"not be changed. If there is no existing OCR, OCR will be added.",
)
ocrsettings.add_argument(
'--skip-big',
type=numeric(float, 0, 5000),
metavar='MPixels',
help="Skip OCR on pages larger than the specified amount of megapixels, "
"but include skipped pages in final output",
)
ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
ocrsettings.add_argument(
'-f',
'--force-ocr',
action='store_true',
help="Rasterize any text or vector objects on each page, apply OCR, and "
"save the rastered output (this rewrites the PDF)",
)
ocrsettings.add_argument(
'-s',
'--skip-text',
action='store_true',
help="Skip OCR on any pages that already contain text, but include the "
"page in final output; useful for PDFs that contain a mix of "
"images, text pages, and/or previously OCRed pages",
)
ocrsettings.add_argument(
'--redo-ocr',
action='store_true',
help="Attempt to detect and remove the hidden OCR layer from files that "
"were previously OCRed with OCRmyPDF or another program. Apply OCR "
"to text found in raster images. Existing visible text objects will "
"not be changed. If there is no existing OCR, OCR will be added.",
)
ocrsettings.add_argument(
'--skip-big',
type=numeric(float, 0, 5000),
metavar='MPixels',
help="Skip OCR on pages larger than the specified amount of megapixels, "
"but include skipped pages in final output",
)
optimizing = parser.add_argument_group(
"Optimization options", "Control how the PDF is optimized after OCR"
)
optimizing.add_argument(
'-O',
'--optimize',
type=int,
choices=range(0, 4),
default=1,
help=(
"Control how PDF is optimized after processing:"
"0 - do not optimize; "
"1 - do safe, lossless optimizations (default); "
"2 - do some lossy optimizations; "
"3 - do aggressive lossy optimizations (including lossy JBIG2)"
),
)
optimizing.add_argument(
'--jpeg-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
help=(
"Adjust JPEG quality level for JPEG optimization. "
"100 is best quality and largest output size; "
"1 is lowest quality and smallest output; "
"0 uses the default."
),
)
optimizing.add_argument(
'--jpg-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
dest='jpeg_quality',
help=argparse.SUPPRESS, # Alias for --jpeg-quality
)
optimizing.add_argument(
'--png-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
help=(
"Adjust PNG quality level to use when quantizing PNGs. "
"Values have same meaning as with --jpeg-quality"
),
)
optimizing.add_argument(
'--jbig2-lossy',
action='store_true',
help=(
"Enable JBIG2 lossy mode (better compression, not suitable for some "
"use cases - see documentation)."
),
)
optimizing.add_argument(
'--jbig2-page-group-size',
type=numeric(int, 1, 10000),
default=0,
metavar='N',
# Adjust number of pages to consider at once for JBIG2 compression
help=argparse.SUPPRESS,
)
optimizing = parser.add_argument_group(
"Optimization options", "Control how the PDF is optimized after OCR"
)
optimizing.add_argument(
'-O',
'--optimize',
type=int,
choices=range(0, 4),
default=1,
help=(
"Control how PDF is optimized after processing:"
"0 - do not optimize; "
"1 - do safe, lossless optimizations (default); "
"2 - do some lossy optimizations; "
"3 - do aggressive lossy optimizations (including lossy JBIG2)"
),
)
optimizing.add_argument(
'--jpeg-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
help=(
"Adjust JPEG quality level for JPEG optimization. "
"100 is best quality and largest output size; "
"1 is lowest quality and smallest output; "
"0 uses the default."
),
)
optimizing.add_argument(
'--jpg-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
dest='jpeg_quality',
help=argparse.SUPPRESS, # Alias for --jpeg-quality
)
optimizing.add_argument(
'--png-quality',
type=numeric(int, 0, 100),
default=0,
metavar='Q',
help=(
"Adjust PNG quality level to use when quantizing PNGs. "
"Values have same meaning as with --jpeg-quality"
),
)
optimizing.add_argument(
'--jbig2-lossy',
action='store_true',
help=(
"Enable JBIG2 lossy mode (better compression, not suitable for some "
"use cases - see documentation)."
),
)
optimizing.add_argument(
'--jbig2-page-group-size',
type=numeric(int, 1, 10000),
default=0,
metavar='N',
# Adjust number of pages to consider at once for JBIG2 compression
help=argparse.SUPPRESS,
)
advanced = parser.add_argument_group(
"Advanced", "Advanced options to control Tesseract's OCR behavior"
)
advanced.add_argument(
'--pages',
type=str,
help="Limit OCR to the specified pages (ranges or comma separated), skipping others",
)
advanced.add_argument(
'--max-image-mpixels',
action='store',
type=numeric(float, 0),
metavar='MPixels',
help="Set maximum number of pixels to unpack before treating an image as a "
"decompression bomb",
default=128.0,
)
advanced.add_argument(
'--tesseract-config',
action='append',
metavar='CFG',
default=[],
help="Additional Tesseract configuration files -- see documentation",
)
advanced.add_argument(
'--tesseract-pagesegmode',
action='store',
type=int,
metavar='PSM',
choices=range(0, 14),
help="Set Tesseract page segmentation mode (see tesseract --help)",
)
advanced.add_argument(
'--tesseract-oem',
action='store',
type=int,
metavar='MODE',
choices=range(0, 4),
help=(
"Set Tesseract 4.0 OCR engine mode: "
"0 - original Tesseract only; "
"1 - neural nets LSTM only; "
"2 - Tesseract + LSTM; "
"3 - default."
),
)
advanced.add_argument(
'--pdf-renderer',
choices=['auto', 'hocr', 'sandwich'],
default='auto',
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
"choose. See documentation for discussion.",
)
advanced.add_argument(
'--tesseract-timeout',
default=180.0,
type=numeric(float, 0),
metavar='SECONDS',
help='Give up on OCR after the timeout, but copy the preprocessed page '
'into the final output',
)
advanced.add_argument(
'--rotate-pages-threshold',
default=14.0,
type=numeric(float, 0, 1000),
metavar='CONFIDENCE',
help="Only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)",
)
advanced.add_argument(
'--pdfa-image-compression',
choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="Specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.",
)
advanced.add_argument(
'--user-words',
metavar='FILE',
help="Specify the location of the Tesseract user words file. This is a "
"list of words Tesseract should consider while performing OCR in "
"addition to its standard language dictionaries. This can improve "
"OCR quality especially for specialized and technical documents.",
)
advanced.add_argument(
'--user-patterns',
metavar='FILE',
help="Specify the location of the Tesseract user patterns file.",
)
advanced.add_argument(
'--fast-web-view',
type=numeric(float, 0),
default=1.0,
metavar="MEGABYTES",
help="If the size of file is more than this threshold (in MB), then "
"linearize the PDF for fast web viewing. This allows the PDF to be "
"displayed before it is fully downloaded in web browsers, but increases "
"the space required slightly. By default we skip this for small files "
"which do not benefit. If the threshold is 0 it will be apply to all files. "
"Set the threshold very high to disable.",
)
advanced.add_argument(
'--plugins',
action='append',
default=[],
help="Path to a folder than contains plugins.",
)
advanced = parser.add_argument_group(
"Advanced", "Advanced options to control Tesseract's OCR behavior"
)
advanced.add_argument(
    '--pages',
    type=str,
    # The parentheses only group the implicitly concatenated string. There
    # must be no trailing comma inside them: that would make ``help`` a
    # 1-tuple, which argparse cannot %-format when rendering help text.
    help=(
        "Limit OCR to the specified pages (ranges or comma separated), "
        "skipping others"
    ),
)
advanced.add_argument(
'--max-image-mpixels',
action='store',
type=numeric(float, 0),
metavar='MPixels',
help="Set maximum number of pixels to unpack before treating an image as a "
"decompression bomb",
default=128.0,
)
advanced.add_argument(
'--tesseract-config',
action='append',
metavar='CFG',
default=[],
help="Additional Tesseract configuration files -- see documentation",
)
advanced.add_argument(
'--tesseract-pagesegmode',
action='store',
type=int,
metavar='PSM',
choices=range(0, 14),
help="Set Tesseract page segmentation mode (see tesseract --help)",
)
advanced.add_argument(
'--tesseract-oem',
action='store',
type=int,
metavar='MODE',
choices=range(0, 4),
help=(
"Set Tesseract 4.0 OCR engine mode: "
"0 - original Tesseract only; "
"1 - neural nets LSTM only; "
"2 - Tesseract + LSTM; "
"3 - default."
),
)
advanced.add_argument(
'--pdf-renderer',
choices=['auto', 'hocr', 'sandwich'],
default='auto',
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
"choose. See documentation for discussion.",
)
advanced.add_argument(
'--tesseract-timeout',
default=180.0,
type=numeric(float, 0),
metavar='SECONDS',
help='Give up on OCR after the timeout, but copy the preprocessed page '
'into the final output',
)
advanced.add_argument(
'--rotate-pages-threshold',
default=14.0,
type=numeric(float, 0, 1000),
metavar='CONFIDENCE',
help="Only rotate pages when confidence is above this value (arbitrary "
"units reported by tesseract)",
)
advanced.add_argument(
'--pdfa-image-compression',
choices=['auto', 'jpeg', 'lossless'],
default='auto',
help="Specify how to compress images in the output PDF/A. 'auto' lets "
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
"JPEG compression. 'lossless' uses PNG-style lossless compression "
"for all images. Monochrome images are always compressed using a "
"lossless codec. Compression settings "
"are applied to all pages, including those for which OCR was "
"skipped. Not supported for --output-type=pdf ; that setting "
"preserves the original compression of all images.",
)
advanced.add_argument(
'--user-words',
metavar='FILE',
help="Specify the location of the Tesseract user words file. This is a "
"list of words Tesseract should consider while performing OCR in "
"addition to its standard language dictionaries. This can improve "
"OCR quality especially for specialized and technical documents.",
)
advanced.add_argument(
'--user-patterns',
metavar='FILE',
help="Specify the location of the Tesseract user patterns file.",
)
advanced.add_argument(
'--fast-web-view',
type=numeric(float, 0),
default=1.0,
metavar="MEGABYTES",
help="If the size of file is more than this threshold (in MB), then "
"linearize the PDF for fast web viewing. This allows the PDF to be "
"displayed before it is fully downloaded in web browsers, but increases "
"the space required slightly. By default we skip this for small files "
"which do not benefit. If the threshold is 0 it will be apply to all files. "
"Set the threshold very high to disable.",
)
advanced.add_argument(
'--plugins',
action='append',
default=[],
help="Path to a folder than contains plugins.",
)
debugging = parser.add_argument_group(
"Debugging", "Arguments to help with troubleshooting and debugging"
)
debugging.add_argument(
'-k',
'--keep-temporary-files',
action='store_true',
help="Keep temporary files (helpful for debugging)",
)
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
return parser
debugging = parser.add_argument_group(
"Debugging", "Arguments to help with troubleshooting and debugging"
)
debugging.add_argument(
'-k',
'--keep-temporary-files',
action='store_true',
help="Keep temporary files (helpful for debugging)",
)
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
plugins_only_parser = ArgumentParser(
prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False

View File

@@ -582,7 +582,7 @@ def main(infile, outfile, level, jobs=1):
)
with TemporaryDirectory() as td:
context = PDFContext(options, td, infile, None)
context = PDFContext(options, td, infile, None, None)
tmpout = Path(td) / 'out.pdf'
optimize(
infile,

View File

@@ -214,7 +214,7 @@ def no_outpdf(tmp_path):
def check_ocrmypdf(input_file, output_file, *args, env=None):
"""Run ocrmypdf and confirmed that a valid file was created"""
options = cli.parser.parse_args(
options = cli.get_parser().parse_args(
[str(input_file), str(output_file)]
+ [str(arg) for arg in args if arg is not None]
)
@@ -222,7 +222,7 @@ def check_ocrmypdf(input_file, output_file, *args, env=None):
if env:
options.tesseract_env = env
options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
result = api.run_pipeline(options, api=True)
result = api.run_pipeline(options, plugin_manager=None, api=True)
assert result == 0
assert os.path.exists(str(output_file)), "Output file not created"
@@ -238,7 +238,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
Does not currently have a way to manipulate the PATH except for Tesseract.
"""
options = cli.parser.parse_args(
options = cli.get_parser().parse_args(
[str(input_file), str(output_file)]
+ [str(arg) for arg in args if arg is not None]
)
@@ -253,7 +253,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
if options.tesseract_env:
assert all(isinstance(v, (str, bytes)) for v in options.tesseract_env.values())
return api.run_pipeline(options, api=False)
return api.run_pipeline(options, plugin_manager=None, api=False)
@pytest.helpers.register

View File

@@ -32,7 +32,7 @@ from pikepdf.models.metadata import decode_pdf_date
from ocrmypdf._jobcontext import PDFContext
from ocrmypdf._pipeline import convert_to_pdfa
from ocrmypdf.cli import parser
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import ExitCode
from ocrmypdf.pdfa import SRGB_ICC_PROFILE, file_claims_pdfa, generate_pdfa_ps
from ocrmypdf.pdfinfo import PdfInfo
@@ -290,16 +290,15 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
def test_metadata_fixup_warning(resources, outdir, caplog):
from ocrmypdf.__main__ import parser
from ocrmypdf._pipeline import metadata_fixup
options = parser.parse_args(
options = get_parser().parse_args(
args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
)
copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')
context = PDFContext(options, outdir, outdir / 'graph.pdf', None)
context = PDFContext(options, outdir, outdir / 'graph.pdf', None, None)
metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
for record in caplog.records:
assert record.levelname != 'WARNING'
@@ -310,7 +309,7 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
meta['prism2:publicationName'] = 'OCRmyPDF Test'
graph.save(outdir / 'graph_mod.pdf')
context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None)
context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None, None)
metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
assert any(record.levelname == 'WARNING' for record in caplog.records)
@@ -326,11 +325,11 @@ def test_prevent_gs_invalid_xml(resources, outdir):
Title=b'String with trailing nul\x00'
)
options = parser.parse_args(
options = get_parser().parse_args(
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
)
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
convert_to_pdfa(
str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
@@ -357,11 +356,11 @@ def test_malformed_docinfo(caplog, resources, outdir):
pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
options = parser.parse_args(
options = get_parser().parse_args(
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
)
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
convert_to_pdfa(
str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context

View File

@@ -21,7 +21,7 @@ from unittest.mock import patch
import pytest
from ocrmypdf._validation import check_options
from ocrmypdf.cli import parser
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
from ocrmypdf.exec import unpaper
@@ -51,7 +51,7 @@ def spoof_unpaper_oldversion(tmp_path_factory):
def test_no_unpaper(resources, no_outpdf):
input_ = fspath(resources / "c02-22.pdf")
output = fspath(no_outpdf)
options = parser.parse_args(args=["--clean", input_, output])
options = get_parser().parse_args(args=["--clean", input_, output])
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")

View File

@@ -23,6 +23,7 @@ import pytest
import ocrmypdf._validation as vd
from ocrmypdf.api import create_options
from ocrmypdf.cli import get_parser
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
from ocrmypdf.pdfinfo import PdfInfo
@@ -30,7 +31,9 @@ from ocrmypdf.pdfinfo import PdfInfo
def make_opts(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
if language is not None:
kwargs['language'] = language
return create_options(input_file=input_file, output_file=output_file, **kwargs)
return create_options(
input_file=input_file, output_file=output_file, parser=get_parser(), **kwargs
)
def test_hocr_notlatin_warning(caplog):