From e02f6c1e97c4353834f7c982ec2d79c15b60aef7 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 2 May 2020 03:34:31 -0700 Subject: [PATCH] Support plugin invocation with API --- src/ocrmypdf/__main__.py | 11 +- src/ocrmypdf/_jobcontext.py | 12 +- src/ocrmypdf/_pipeline.py | 4 +- src/ocrmypdf/_plugin_manager.py | 6 +- src/ocrmypdf/_sync.py | 9 +- src/ocrmypdf/api.py | 32 +- src/ocrmypdf/cli.py | 791 ++++++++++++++++---------------- src/ocrmypdf/optimize.py | 2 +- tests/conftest.py | 8 +- tests/test_metadata.py | 17 +- tests/test_unpaper.py | 4 +- tests/test_validation.py | 5 +- 12 files changed, 472 insertions(+), 429 deletions(-) diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index c62c9409..b6dc0466 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -26,7 +26,7 @@ from ocrmypdf._plugin_manager import get_plugin_manager from ocrmypdf._sync import run_pipeline from ocrmypdf._validation import check_closed_streams, check_options from ocrmypdf.api import Verbosity, configure_logging -from ocrmypdf.cli import parser, plugins_only_parser +from ocrmypdf.cli import get_parser, plugins_only_parser from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError log = logging.getLogger('ocrmypdf') @@ -34,9 +34,10 @@ log = logging.getLogger('ocrmypdf') def run(args=None): pre_options, _unused = plugins_only_parser.parse_known_args(args=args) - if pre_options.plugins: - pm = get_plugin_manager(pre_options) - pm.hook.install_cli(parser=parser) + plugin_manager = get_plugin_manager(pre_options.plugins) + + parser = get_parser() + plugin_manager.hook.install_cli(parser=parser) options = parser.parse_args(args=args) @@ -68,7 +69,7 @@ def run(args=None): log.error(e) return ExitCode.missing_dependency - result = run_pipeline(options=options) + result = run_pipeline(options=options, plugin_manager=plugin_manager) return result diff --git a/src/ocrmypdf/_jobcontext.py b/src/ocrmypdf/_jobcontext.py index ea48380b..7158abe8 100644 --- a/src/ocrmypdf/_jobcontext.py +++ b/src/ocrmypdf/_jobcontext.py @@ -69,13 +69,19 @@ class PageContext: def __getstate__(self): state = self.__dict__.copy() - del state['plugin_manager'] - state['construct_plugin_manager'] = partial(get_plugin_manager, self.options) + if state['plugin_manager'] is not None: + del state['plugin_manager'] + state['construct_plugin_manager'] = partial( + get_plugin_manager, self.options.plugins + ) return state def __setstate__(self, state): self.__dict__.update(state) - self.plugin_manager = self.__dict__['construct_plugin_manager']() + if 'construct_plugin_manager' in state: + self.plugin_manager = state['construct_plugin_manager']() + else: + self.plugin_manager = None del self.__dict__['construct_plugin_manager'] diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index a8d6365f..cd82fea4 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -522,9 +522,11 @@ def create_ocr_image(image, page_context): del draw - im = page_context.plugin_manager.hook.filter_ocr_image( + filter_im = page_context.plugin_manager.hook.filter_ocr_image( page=page_context, image=im ) + if filter_im is not None: + im = filter_im # Pillow requires integer DPI dpi = tuple(round(coord) for coord in im.info['dpi']) diff --git a/src/ocrmypdf/_plugin_manager.py b/src/ocrmypdf/_plugin_manager.py index 2c3df5a1..295e955c 100644 --- a/src/ocrmypdf/_plugin_manager.py +++ b/src/ocrmypdf/_plugin_manager.py @@ -16,17 +16,17 @@ # along with OCRmyPDF. If not, see . import importlib +from typing import List import pluggy from ocrmypdf import pluginspec -def get_plugin_manager(options): +def get_plugin_manager(plugins: List[str]): pm = pluggy.PluginManager('ocrmypdf') pm.add_hookspecs(pluginspec) - - for name in options.plugins: + for name in plugins: module = importlib.import_module(name) pm.register(module) return pm diff --git a/src/ocrmypdf/_sync.py b/src/ocrmypdf/_sync.py index 1691eb00..0834993d 100644 --- a/src/ocrmypdf/_sync.py +++ b/src/ocrmypdf/_sync.py @@ -293,12 +293,14 @@ def configure_debug_logging(log_filename, prefix=''): return log_file_handler -def run_pipeline(options, api=False): +def run_pipeline(options, *, plugin_manager, api=False): # Any changes to options will not take effect for options that are already # bound to function parameters in the pipeline. (For example # options.input_file, options.pdf_renderer are already bound.) if not options.jobs: options.jobs = available_cpu_count() + if not plugin_manager: + plugin_manager = get_plugin_manager([]) work_folder = mkdtemp(prefix="com.github.ocrmypdf.") debug_log_handler = None @@ -307,7 +309,6 @@ def run_pipeline(options, api=False): ): debug_log_handler = configure_debug_logging(Path(work_folder) / "debug.log") - pm = get_plugin_manager(options) try: check_requested_output_file(options) start_input_file, original_filename = create_input_file(options, work_folder) @@ -320,7 +321,7 @@ def run_pipeline(options, api=False): options, ) - pm.hook.prepare(options=options) + plugin_manager.hook.prepare(options=options) # Gather pdfinfo and create context pdfinfo = get_pdfinfo( @@ -329,7 +330,7 @@ def run_pipeline(options, api=False): max_workers=options.jobs if not options.use_threads else 1, # To help debug ) - context = PDFContext(options, work_folder, origin_pdf, pdfinfo, pm) + context = PDFContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager) # Validate options are okay for this pdf validate_pdfinfo_options(context) diff --git a/src/ocrmypdf/api.py b/src/ocrmypdf/api.py index efa24c7f..cc7e103f 100644 --- a/src/ocrmypdf/api.py +++ b/src/ocrmypdf/api.py @@ -18,15 +18,17 @@ import logging import os import sys +from argparse import ArgumentParser from contextlib import suppress from enum import IntEnum from pathlib import Path from typing import Dict, Iterable -from ._logging import PageNumberFilter, TqdmConsole -from ._sync import run_pipeline -from ._validation import check_options -from .cli import parser +from ocrmypdf._logging import PageNumberFilter, TqdmConsole +from ocrmypdf._plugin_manager import get_plugin_manager +from ocrmypdf._sync import run_pipeline +from ocrmypdf._validation import check_options +from ocrmypdf.cli import get_parser, plugins_only_parser try: import coloredlogs @@ -125,7 +127,13 @@ def configure_logging( return log -def create_options(*, input_file: os.PathLike, output_file: os.PathLike, **kwargs): +def create_options( + *, + input_file: os.PathLike, + output_file: os.PathLike, + parser: ArgumentParser, + **kwargs, +): cmdline = [] deferred = [] @@ -223,9 +231,11 @@ def ocr( # pylint: disable=unused-argument user_words: os.PathLike = None, user_patterns: os.PathLike = None, fast_web_view: float = None, + plugins: Iterable[str] = None, keep_temporary_files: bool = None, progress_bar: bool = None, tesseract_env: Dict[str, str] = None, + **kwargs, ): """Run OCRmyPDF on one PDF or image. @@ -260,7 +270,15 @@ def ocr( # pylint: disable=unused-argument Returns: :class:`ocrmypdf.ExitCode` """ + if not plugins: + plugins = [] - options = create_options(**locals()) + parser = get_parser() + _plugin_manager = get_plugin_manager(plugins) + _plugin_manager.hook.install_cli(parser=parser) + + options = create_options( + **{k: v for k, v in locals().items() if not k.startswith('_')} + ) check_options(options) - return run_pipeline(options, api=True) + return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True) diff --git a/src/ocrmypdf/cli.py b/src/ocrmypdf/cli.py index d243364a..3edf6cd7 100644 --- a/src/ocrmypdf/cli.py +++ b/src/ocrmypdf/cli.py @@ -17,10 +17,8 @@ import argparse -from ._version import PROGRAM_NAME as _PROGRAM_NAME -from ._version import __version__ as _VERSION - -__all__ = ['parser'] +from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME +from ocrmypdf._version import __version__ as _VERSION def numeric(basetype, min_=None, max_=None): @@ -56,18 +54,19 @@ class ArgumentParser(argparse.ArgumentParser): raise ValueError(message) -parser = ArgumentParser( - prog=_PROGRAM_NAME, - fromfile_prefix_chars='@', - formatter_class=argparse.RawDescriptionHelpFormatter, - description="""\ +def get_parser(): + parser = ArgumentParser( + prog=_PROGRAM_NAME, + fromfile_prefix_chars='@', + formatter_class=argparse.RawDescriptionHelpFormatter, + description="""\ Generates a searchable PDF or PDF/A from a regular PDF. OCRmyPDF rasterizes each page of the input PDF, optionally corrects page rotation and performs image processing, runs the Tesseract OCR engine on the image, and then creates a PDF from the OCR information. """, - epilog="""\ + epilog="""\ OCRmyPDF attempts to keep the output file at about the same size. If a file contains losslessly compressed images, and output file will be losslessly compressed as well. @@ -108,395 +107,409 @@ Online documentation is located at: https://ocrmypdf.readthedocs.io/en/latest/introduction.html """, -) + ) -parser.add_argument( - 'input_file', - metavar="input_pdf_or_image", - help="PDF file containing the images to be OCRed (or '-' to read from " - "standard input)", -) -parser.add_argument( - 'output_file', - metavar="output_pdf", - help="Output searchable PDF file (or '-' to write to standard output). " - "Existing files will be ovewritten. If same as input file, the " - "input file will be updated only if processing is successful.", -) -parser.add_argument( - '-l', - '--language', - action='append', - help="Language(s) of the file to be OCRed (see tesseract --list-langs for " - "all language packs installed in your system). Use -l eng+deu for " - "multiple languages.", -) -parser.add_argument( - '--image-dpi', - metavar='DPI', - type=int, - help="For input image instead of PDF, use this DPI instead of file's.", -) -parser.add_argument( - '--output-type', - choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], - default='pdfa', - help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for " - "long term archiving (default, recommended) but may not suitable " - "for users who want their file altered as little as possible. 'pdfa' " - "also has problems with full Unicode text. 'pdf' attempts to " - "preserve file contents as much as possible. 'pdf-a1' creates a " - "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a " - "PDF/A3-b file.", -) + parser.add_argument( + 'input_file', + metavar="input_pdf_or_image", + help="PDF file containing the images to be OCRed (or '-' to read from " + "standard input)", + ) + parser.add_argument( + 'output_file', + metavar="output_pdf", + help="Output searchable PDF file (or '-' to write to standard output). " + "Existing files will be ovewritten. If same as input file, the " + "input file will be updated only if processing is successful.", + ) + parser.add_argument( + '-l', + '--language', + action='append', + help="Language(s) of the file to be OCRed (see tesseract --list-langs for " + "all language packs installed in your system). Use -l eng+deu for " + "multiple languages.", + ) + parser.add_argument( + '--image-dpi', + metavar='DPI', + type=int, + help="For input image instead of PDF, use this DPI instead of file's.", + ) + parser.add_argument( + '--output-type', + choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'], + default='pdfa', + help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for " + "long term archiving (default, recommended) but may not suitable " + "for users who want their file altered as little as possible. 'pdfa' " + "also has problems with full Unicode text. 'pdf' attempts to " + "preserve file contents as much as possible. 'pdf-a1' creates a " + "PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a " + "PDF/A3-b file.", + ) -# Use null string '\0' as sentinel to indicate the user supplied no argument, -# since that is the only invalid character for filepaths on all platforms -# bool('\0') is True in Python -parser.add_argument( - '--sidecar', - nargs='?', - const='\0', - default=None, - metavar='FILE', - help="Generate sidecar text files that contain the same text recognized " - "by Tesseract. This may be useful for building a OCR text database. " - "If FILE is omitted, the sidecar file be named {output_file}.txt " - "If FILE is set to '-', the sidecar is written to stdout (a " - "convenient way to preview OCR quality). The output file and sidecar " - "may not both use stdout at the same time.", -) + # Use null string '\0' as sentinel to indicate the user supplied no argument, + # since that is the only invalid character for filepaths on all platforms + # bool('\0') is True in Python + parser.add_argument( + '--sidecar', + nargs='?', + const='\0', + default=None, + metavar='FILE', + help="Generate sidecar text files that contain the same text recognized " + "by Tesseract. This may be useful for building a OCR text database. " + "If FILE is omitted, the sidecar file be named {output_file}.txt " + "If FILE is set to '-', the sidecar is written to stdout (a " + "convenient way to preview OCR quality). The output file and sidecar " + "may not both use stdout at the same time.", + ) -parser.add_argument( - '--version', - action='version', - version=_VERSION, - help="Print program version and exit", -) + parser.add_argument( + '--version', + action='version', + version=_VERSION, + help="Print program version and exit", + ) -jobcontrol = parser.add_argument_group("Job control options") -jobcontrol.add_argument( - '-j', - '--jobs', - metavar='N', - type=numeric(int, 0, 256), - help="Use up to N CPU cores simultaneously (default: use all).", -) -jobcontrol.add_argument( - '-q', '--quiet', action='store_true', help="Suppress INFO messages" -) -jobcontrol.add_argument( - '-v', - '--verbose', - type=numeric(int, 0, 2), - default=0, - const=1, - nargs='?', - help="Print more verbose messages for each additional verbose level. Use " - "`-v 1` typically for much more detailed logging. Higher numbers " - "are probably only useful in debugging.", -) -jobcontrol.add_argument( - '--no-progress-bar', - action='store_false', - dest='progress_bar', - help=argparse.SUPPRESS, -) -jobcontrol.add_argument('--use-threads', action='store_true', help=argparse.SUPPRESS) + jobcontrol = parser.add_argument_group("Job control options") + jobcontrol.add_argument( + '-j', + '--jobs', + metavar='N', + type=numeric(int, 0, 256), + help="Use up to N CPU cores simultaneously (default: use all).", + ) + jobcontrol.add_argument( + '-q', '--quiet', action='store_true', help="Suppress INFO messages" + ) + jobcontrol.add_argument( + '-v', + '--verbose', + type=numeric(int, 0, 2), + default=0, + const=1, + nargs='?', + help="Print more verbose messages for each additional verbose level. Use " + "`-v 1` typically for much more detailed logging. Higher numbers " + "are probably only useful in debugging.", + ) + jobcontrol.add_argument( + '--no-progress-bar', + action='store_false', + dest='progress_bar', + help=argparse.SUPPRESS, + ) + jobcontrol.add_argument( + '--use-threads', action='store_true', help=argparse.SUPPRESS + ) -metadata = parser.add_argument_group( - "Metadata options", - "Set output PDF/A metadata (default: copy input document's metadata)", -) -metadata.add_argument( - '--title', type=str, help="Set document title (place multiple words in quotes)" -) -metadata.add_argument('--author', type=str, help="Set document author") -metadata.add_argument('--subject', type=str, help="Set document subject description") -metadata.add_argument('--keywords', type=str, help="Set document keywords") + metadata = parser.add_argument_group( + "Metadata options", + "Set output PDF/A metadata (default: copy input document's metadata)", + ) + metadata.add_argument( + '--title', type=str, help="Set document title (place multiple words in quotes)" + ) + metadata.add_argument('--author', type=str, help="Set document author") + metadata.add_argument( + '--subject', type=str, help="Set document subject description" + ) + metadata.add_argument('--keywords', type=str, help="Set document keywords") -preprocessing = parser.add_argument_group( - "Image preprocessing options", - "Options to improve the quality of the final PDF and OCR", -) -preprocessing.add_argument( - '-r', - '--rotate-pages', - action='store_true', - help="Automatically rotate pages based on detected text orientation", -) -preprocessing.add_argument( - '--remove-background', - action='store_true', - help="Attempt to remove background from gray or color pages, setting it " - "to white ", -) -preprocessing.add_argument( - '-d', '--deskew', action='store_true', help="Deskew each page before performing OCR" -) -preprocessing.add_argument( - '-c', - '--clean', - action='store_true', - help="Clean pages from scanning artifacts before performing OCR, and send " - "the cleaned page to OCR, but do not include the cleaned page in " - "the output", -) -preprocessing.add_argument( - '-i', - '--clean-final', - action='store_true', - help="Clean page as above, and incorporate the cleaned image in the final " - "PDF. Might remove desired content.", -) -preprocessing.add_argument( - '--unpaper-args', - type=str, - default=None, - help="A quoted string of arguments to pass to unpaper. Requires --clean. " - "Example: --unpaper-args '--layout double'.", -) -preprocessing.add_argument( - '--oversample', - metavar='DPI', - type=numeric(int, 0, 5000), - default=0, - help="Oversample images to at least the specified DPI, to improve OCR " - "results slightly", -) -preprocessing.add_argument( - '--remove-vectors', - action='store_true', - help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they " - "will not be included in OCR. This can eliminate false characters.", -) -preprocessing.add_argument( - '--threshold', - action='store_true', - help="EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract for OCR. Can " - "improve OCR quality compared to Tesseract's thresholder.", -) + preprocessing = parser.add_argument_group( + "Image preprocessing options", + "Options to improve the quality of the final PDF and OCR", + ) + preprocessing.add_argument( + '-r', + '--rotate-pages', + action='store_true', + help="Automatically rotate pages based on detected text orientation", + ) + preprocessing.add_argument( + '--remove-background', + action='store_true', + help="Attempt to remove background from gray or color pages, setting it " + "to white ", + ) + preprocessing.add_argument( + '-d', + '--deskew', + action='store_true', + help="Deskew each page before performing OCR", + ) + preprocessing.add_argument( + '-c', + '--clean', + action='store_true', + help="Clean pages from scanning artifacts before performing OCR, and send " + "the cleaned page to OCR, but do not include the cleaned page in " + "the output", + ) + preprocessing.add_argument( + '-i', + '--clean-final', + action='store_true', + help="Clean page as above, and incorporate the cleaned image in the final " + "PDF. Might remove desired content.", + ) + preprocessing.add_argument( + '--unpaper-args', + type=str, + default=None, + help="A quoted string of arguments to pass to unpaper. Requires --clean. " + "Example: --unpaper-args '--layout double'.", + ) + preprocessing.add_argument( + '--oversample', + metavar='DPI', + type=numeric(int, 0, 5000), + default=0, + help="Oversample images to at least the specified DPI, to improve OCR " + "results slightly", + ) + preprocessing.add_argument( + '--remove-vectors', + action='store_true', + help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they " + "will not be included in OCR. This can eliminate false characters.", + ) + preprocessing.add_argument( + '--threshold', + action='store_true', + help=( + "EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract " + "for OCR. Can improve OCR quality compared to Tesseract's thresholder." + ), + ) -ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied") -ocrsettings.add_argument( - '-f', - '--force-ocr', - action='store_true', - help="Rasterize any text or vector objects on each page, apply OCR, and " - "save the rastered output (this rewrites the PDF)", -) -ocrsettings.add_argument( - '-s', - '--skip-text', - action='store_true', - help="Skip OCR on any pages that already contain text, but include the " - "page in final output; useful for PDFs that contain a mix of " - "images, text pages, and/or previously OCRed pages", -) -ocrsettings.add_argument( - '--redo-ocr', - action='store_true', - help="Attempt to detect and remove the hidden OCR layer from files that " - "were previously OCRed with OCRmyPDF or another program. Apply OCR " - "to text found in raster images. Existing visible text objects will " - "not be changed. If there is no existing OCR, OCR will be added.", -) -ocrsettings.add_argument( - '--skip-big', - type=numeric(float, 0, 5000), - metavar='MPixels', - help="Skip OCR on pages larger than the specified amount of megapixels, " - "but include skipped pages in final output", -) + ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied") + ocrsettings.add_argument( + '-f', + '--force-ocr', + action='store_true', + help="Rasterize any text or vector objects on each page, apply OCR, and " + "save the rastered output (this rewrites the PDF)", + ) + ocrsettings.add_argument( + '-s', + '--skip-text', + action='store_true', + help="Skip OCR on any pages that already contain text, but include the " + "page in final output; useful for PDFs that contain a mix of " + "images, text pages, and/or previously OCRed pages", + ) + ocrsettings.add_argument( + '--redo-ocr', + action='store_true', + help="Attempt to detect and remove the hidden OCR layer from files that " + "were previously OCRed with OCRmyPDF or another program. Apply OCR " + "to text found in raster images. Existing visible text objects will " + "not be changed. If there is no existing OCR, OCR will be added.", + ) + ocrsettings.add_argument( + '--skip-big', + type=numeric(float, 0, 5000), + metavar='MPixels', + help="Skip OCR on pages larger than the specified amount of megapixels, " + "but include skipped pages in final output", + ) -optimizing = parser.add_argument_group( - "Optimization options", "Control how the PDF is optimized after OCR" -) -optimizing.add_argument( - '-O', - '--optimize', - type=int, - choices=range(0, 4), - default=1, - help=( - "Control how PDF is optimized after processing:" - "0 - do not optimize; " - "1 - do safe, lossless optimizations (default); " - "2 - do some lossy optimizations; " - "3 - do aggressive lossy optimizations (including lossy JBIG2)" - ), -) -optimizing.add_argument( - '--jpeg-quality', - type=numeric(int, 0, 100), - default=0, - metavar='Q', - help=( - "Adjust JPEG quality level for JPEG optimization. " - "100 is best quality and largest output size; " - "1 is lowest quality and smallest output; " - "0 uses the default." - ), -) -optimizing.add_argument( - '--jpg-quality', - type=numeric(int, 0, 100), - default=0, - metavar='Q', - dest='jpeg_quality', - help=argparse.SUPPRESS, # Alias for --jpeg-quality -) -optimizing.add_argument( - '--png-quality', - type=numeric(int, 0, 100), - default=0, - metavar='Q', - help=( - "Adjust PNG quality level to use when quantizing PNGs. " - "Values have same meaning as with --jpeg-quality" - ), -) -optimizing.add_argument( - '--jbig2-lossy', - action='store_true', - help=( - "Enable JBIG2 lossy mode (better compression, not suitable for some " - "use cases - see documentation)." - ), -) -optimizing.add_argument( - '--jbig2-page-group-size', - type=numeric(int, 1, 10000), - default=0, - metavar='N', - # Adjust number of pages to consider at once for JBIG2 compression - help=argparse.SUPPRESS, -) + optimizing = parser.add_argument_group( + "Optimization options", "Control how the PDF is optimized after OCR" + ) + optimizing.add_argument( + '-O', + '--optimize', + type=int, + choices=range(0, 4), + default=1, + help=( + "Control how PDF is optimized after processing:" + "0 - do not optimize; " + "1 - do safe, lossless optimizations (default); " + "2 - do some lossy optimizations; " + "3 - do aggressive lossy optimizations (including lossy JBIG2)" + ), + ) + optimizing.add_argument( + '--jpeg-quality', + type=numeric(int, 0, 100), + default=0, + metavar='Q', + help=( + "Adjust JPEG quality level for JPEG optimization. " + "100 is best quality and largest output size; " + "1 is lowest quality and smallest output; " + "0 uses the default." + ), + ) + optimizing.add_argument( + '--jpg-quality', + type=numeric(int, 0, 100), + default=0, + metavar='Q', + dest='jpeg_quality', + help=argparse.SUPPRESS, # Alias for --jpeg-quality + ) + optimizing.add_argument( + '--png-quality', + type=numeric(int, 0, 100), + default=0, + metavar='Q', + help=( + "Adjust PNG quality level to use when quantizing PNGs. " + "Values have same meaning as with --jpeg-quality" + ), + ) + optimizing.add_argument( + '--jbig2-lossy', + action='store_true', + help=( + "Enable JBIG2 lossy mode (better compression, not suitable for some " + "use cases - see documentation)." + ), + ) + optimizing.add_argument( + '--jbig2-page-group-size', + type=numeric(int, 1, 10000), + default=0, + metavar='N', + # Adjust number of pages to consider at once for JBIG2 compression + help=argparse.SUPPRESS, + ) -advanced = parser.add_argument_group( - "Advanced", "Advanced options to control Tesseract's OCR behavior" -) -advanced.add_argument( - '--pages', - type=str, - help="Limit OCR to the specified pages (ranges or comma separated), skipping others", -) -advanced.add_argument( - '--max-image-mpixels', - action='store', - type=numeric(float, 0), - metavar='MPixels', - help="Set maximum number of pixels to unpack before treating an image as a " - "decompression bomb", - default=128.0, -) -advanced.add_argument( - '--tesseract-config', - action='append', - metavar='CFG', - default=[], - help="Additional Tesseract configuration files -- see documentation", -) -advanced.add_argument( - '--tesseract-pagesegmode', - action='store', - type=int, - metavar='PSM', - choices=range(0, 14), - help="Set Tesseract page segmentation mode (see tesseract --help)", -) -advanced.add_argument( - '--tesseract-oem', - action='store', - type=int, - metavar='MODE', - choices=range(0, 4), - help=( - "Set Tesseract 4.0 OCR engine mode: " - "0 - original Tesseract only; " - "1 - neural nets LSTM only; " - "2 - Tesseract + LSTM; " - "3 - default." - ), -) -advanced.add_argument( - '--pdf-renderer', - choices=['auto', 'hocr', 'sandwich'], - default='auto', - help="Choose OCR PDF renderer - the default option is to let OCRmyPDF " - "choose. See documentation for discussion.", -) -advanced.add_argument( - '--tesseract-timeout', - default=180.0, - type=numeric(float, 0), - metavar='SECONDS', - help='Give up on OCR after the timeout, but copy the preprocessed page ' - 'into the final output', -) -advanced.add_argument( - '--rotate-pages-threshold', - default=14.0, - type=numeric(float, 0, 1000), - metavar='CONFIDENCE', - help="Only rotate pages when confidence is above this value (arbitrary " - "units reported by tesseract)", -) -advanced.add_argument( - '--pdfa-image-compression', - choices=['auto', 'jpeg', 'lossless'], - default='auto', - help="Specify how to compress images in the output PDF/A. 'auto' lets " - "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to " - "JPEG compression. 'lossless' uses PNG-style lossless compression " - "for all images. Monochrome images are always compressed using a " - "lossless codec. Compression settings " - "are applied to all pages, including those for which OCR was " - "skipped. Not supported for --output-type=pdf ; that setting " - "preserves the original compression of all images.", -) -advanced.add_argument( - '--user-words', - metavar='FILE', - help="Specify the location of the Tesseract user words file. This is a " - "list of words Tesseract should consider while performing OCR in " - "addition to its standard language dictionaries. This can improve " - "OCR quality especially for specialized and technical documents.", -) -advanced.add_argument( - '--user-patterns', - metavar='FILE', - help="Specify the location of the Tesseract user patterns file.", -) -advanced.add_argument( - '--fast-web-view', - type=numeric(float, 0), - default=1.0, - metavar="MEGABYTES", - help="If the size of file is more than this threshold (in MB), then " - "linearize the PDF for fast web viewing. This allows the PDF to be " - "displayed before it is fully downloaded in web browsers, but increases " - "the space required slightly. By default we skip this for small files " - "which do not benefit. If the threshold is 0 it will be apply to all files. " - "Set the threshold very high to disable.", -) -advanced.add_argument( - '--plugins', - action='append', - default=[], - help="Path to a folder than contains plugins.", -) + advanced = parser.add_argument_group( + "Advanced", "Advanced options to control Tesseract's OCR behavior" + ) + advanced.add_argument( + '--pages', + type=str, + help=( + "Limit OCR to the specified pages (ranges or comma separated), " + "skipping others", + ), + ) + advanced.add_argument( + '--max-image-mpixels', + action='store', + type=numeric(float, 0), + metavar='MPixels', + help="Set maximum number of pixels to unpack before treating an image as a " + "decompression bomb", + default=128.0, + ) + advanced.add_argument( + '--tesseract-config', + action='append', + metavar='CFG', + default=[], + help="Additional Tesseract configuration files -- see documentation", + ) + advanced.add_argument( + '--tesseract-pagesegmode', + action='store', + type=int, + metavar='PSM', + choices=range(0, 14), + help="Set Tesseract page segmentation mode (see tesseract --help)", + ) + advanced.add_argument( + '--tesseract-oem', + action='store', + type=int, + metavar='MODE', + choices=range(0, 4), + help=( + "Set Tesseract 4.0 OCR engine mode: " + "0 - original Tesseract only; " + "1 - neural nets LSTM only; " + "2 - Tesseract + LSTM; " + "3 - default." + ), + ) + advanced.add_argument( + '--pdf-renderer', + choices=['auto', 'hocr', 'sandwich'], + default='auto', + help="Choose OCR PDF renderer - the default option is to let OCRmyPDF " + "choose. See documentation for discussion.", + ) + advanced.add_argument( + '--tesseract-timeout', + default=180.0, + type=numeric(float, 0), + metavar='SECONDS', + help='Give up on OCR after the timeout, but copy the preprocessed page ' + 'into the final output', + ) + advanced.add_argument( + '--rotate-pages-threshold', + default=14.0, + type=numeric(float, 0, 1000), + metavar='CONFIDENCE', + help="Only rotate pages when confidence is above this value (arbitrary " + "units reported by tesseract)", + ) + advanced.add_argument( + '--pdfa-image-compression', + choices=['auto', 'jpeg', 'lossless'], + default='auto', + help="Specify how to compress images in the output PDF/A. 'auto' lets " + "OCRmyPDF decide. 'jpeg' changes all grayscale and color images to " + "JPEG compression. 'lossless' uses PNG-style lossless compression " + "for all images. Monochrome images are always compressed using a " + "lossless codec. Compression settings " + "are applied to all pages, including those for which OCR was " + "skipped. Not supported for --output-type=pdf ; that setting " + "preserves the original compression of all images.", + ) + advanced.add_argument( + '--user-words', + metavar='FILE', + help="Specify the location of the Tesseract user words file. This is a " + "list of words Tesseract should consider while performing OCR in " + "addition to its standard language dictionaries. This can improve " + "OCR quality especially for specialized and technical documents.", + ) + advanced.add_argument( + '--user-patterns', + metavar='FILE', + help="Specify the location of the Tesseract user patterns file.", + ) + advanced.add_argument( + '--fast-web-view', + type=numeric(float, 0), + default=1.0, + metavar="MEGABYTES", + help="If the size of file is more than this threshold (in MB), then " + "linearize the PDF for fast web viewing. This allows the PDF to be " + "displayed before it is fully downloaded in web browsers, but increases " + "the space required slightly. By default we skip this for small files " + "which do not benefit. If the threshold is 0 it will be apply to all files. " + "Set the threshold very high to disable.", + ) + advanced.add_argument( + '--plugins', + action='append', + default=[], + help="Path to a folder than contains plugins.", + ) + + debugging = parser.add_argument_group( + "Debugging", "Arguments to help with troubleshooting and debugging" + ) + debugging.add_argument( + '-k', + '--keep-temporary-files', + action='store_true', + help="Keep temporary files (helpful for debugging)", + ) + debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS) + return parser -debugging = parser.add_argument_group( - "Debugging", "Arguments to help with troubleshooting and debugging" -) -debugging.add_argument( - '-k', - '--keep-temporary-files', - action='store_true', - help="Keep temporary files (helpful for debugging)", -) -debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS) plugins_only_parser = ArgumentParser( prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index e6f0bd3f..9459b70e 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -582,7 +582,7 @@ def main(infile, outfile, level, jobs=1): ) with TemporaryDirectory() as td: - context = PDFContext(options, td, infile, None) + context = PDFContext(options, td, infile, None, None) tmpout = Path(td) / 'out.pdf' optimize( infile, diff --git a/tests/conftest.py b/tests/conftest.py index 8724ead5..54960cdc 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -214,7 +214,7 @@ def no_outpdf(tmp_path): def check_ocrmypdf(input_file, output_file, *args, env=None): """Run ocrmypdf and confirmed that a valid file was created""" - options = cli.parser.parse_args( + options = cli.get_parser().parse_args( [str(input_file), str(output_file)] + [str(arg) for arg in args if arg is not None] ) @@ -222,7 +222,7 @@ def check_ocrmypdf(input_file, output_file, *args, env=None): if env: options.tesseract_env = env options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file) - result = api.run_pipeline(options, api=True) + result = api.run_pipeline(options, plugin_manager=None, api=True) assert result == 0 assert os.path.exists(str(output_file)), "Output file not created" @@ -238,7 +238,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None): Does not currently have a way to manipulate the PATH except for Tesseract. """ - options = cli.parser.parse_args( + options = cli.get_parser().parse_args( [str(input_file), str(output_file)] + [str(arg) for arg in args if arg is not None] ) @@ -253,7 +253,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None): if options.tesseract_env: assert all(isinstance(v, (str, bytes)) for v in options.tesseract_env.values()) - return api.run_pipeline(options, api=False) + return api.run_pipeline(options, plugin_manager=None, api=False) @pytest.helpers.register diff --git a/tests/test_metadata.py b/tests/test_metadata.py index 270a8b62..79e63604 100644 --- a/tests/test_metadata.py +++ b/tests/test_metadata.py @@ -32,7 +32,7 @@ from pikepdf.models.metadata import decode_pdf_date from ocrmypdf._jobcontext import PDFContext from ocrmypdf._pipeline import convert_to_pdfa -from ocrmypdf.cli import parser +from ocrmypdf.cli import get_parser from ocrmypdf.exceptions import ExitCode from ocrmypdf.pdfa import SRGB_ICC_PROFILE, file_claims_pdfa, generate_pdfa_ps from ocrmypdf.pdfinfo import PdfInfo @@ -290,16 +290,15 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop): def test_metadata_fixup_warning(resources, outdir, caplog): - from ocrmypdf.__main__ import parser from ocrmypdf._pipeline import metadata_fixup - options = parser.parse_args( + options = get_parser().parse_args( args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf'] ) copyfile(resources / 'graph.pdf', outdir / 'graph.pdf') - context = PDFContext(options, outdir, outdir / 'graph.pdf', None) + context = PDFContext(options, outdir, outdir / 'graph.pdf', None, None) metadata_fixup(working_file=outdir / 'graph.pdf', context=context) for record in caplog.records: assert record.levelname != 'WARNING' @@ -310,7 +309,7 @@ def test_metadata_fixup_warning(resources, outdir, caplog): meta['prism2:publicationName'] = 'OCRmyPDF Test' graph.save(outdir / 'graph_mod.pdf') - context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None) + context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None, None) metadata_fixup(working_file=outdir / 'graph.pdf', context=context) assert any(record.levelname == 'WARNING' for record in caplog.records) @@ -326,11 +325,11 @@ def test_prevent_gs_invalid_xml(resources, outdir): Title=b'String with trailing nul\x00' ) - options = parser.parse_args( + options = get_parser().parse_args( args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'] ) pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf') - context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo) + context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None) convert_to_pdfa( str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context @@ -357,11 +356,11 @@ def test_malformed_docinfo(caplog, resources, outdir): pike.trailer.Info = pikepdf.Stream(pike, b"") pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False) - options = parser.parse_args( + options = get_parser().parse_args( args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf'] ) pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf') - context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo) + context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None) convert_to_pdfa( str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context diff --git a/tests/test_unpaper.py b/tests/test_unpaper.py index f0e90b80..7753d340 100644 --- a/tests/test_unpaper.py +++ b/tests/test_unpaper.py @@ -21,7 +21,7 @@ from unittest.mock import patch import pytest from ocrmypdf._validation import check_options -from ocrmypdf.cli import parser +from ocrmypdf.cli import get_parser from ocrmypdf.exceptions import ExitCode, MissingDependencyError from ocrmypdf.exec import unpaper @@ -51,7 +51,7 @@ def spoof_unpaper_oldversion(tmp_path_factory): def test_no_unpaper(resources, no_outpdf): input_ = fspath(resources / "c02-22.pdf") output = fspath(no_outpdf) - options = parser.parse_args(args=["--clean", input_, output]) + options = get_parser().parse_args(args=["--clean", input_, output]) with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version: mock_unpaper_version.side_effect = FileNotFoundError("unpaper") diff --git a/tests/test_validation.py b/tests/test_validation.py index f183aa46..1a453e78 100644 --- a/tests/test_validation.py +++ b/tests/test_validation.py @@ -23,6 +23,7 @@ import pytest import ocrmypdf._validation as vd from ocrmypdf.api import create_options +from ocrmypdf.cli import get_parser from ocrmypdf.exceptions import BadArgsError, MissingDependencyError from ocrmypdf.pdfinfo import PdfInfo @@ -30,7 +31,9 @@ from ocrmypdf.pdfinfo import PdfInfo def make_opts(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs): if language is not None: kwargs['language'] = language - return create_options(input_file=input_file, output_file=output_file, **kwargs) + return create_options( + input_file=input_file, output_file=output_file, parser=get_parser(), **kwargs + ) def test_hocr_notlatin_warning(caplog):