mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-19 12:04:44 -04:00
Support plugin invocation with API
This commit is contained in:
@@ -26,7 +26,7 @@ from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._sync import run_pipeline
|
||||
from ocrmypdf._validation import check_closed_streams, check_options
|
||||
from ocrmypdf.api import Verbosity, configure_logging
|
||||
from ocrmypdf.cli import parser, plugins_only_parser
|
||||
from ocrmypdf.cli import get_parser, plugins_only_parser
|
||||
from ocrmypdf.exceptions import BadArgsError, ExitCode, MissingDependencyError
|
||||
|
||||
log = logging.getLogger('ocrmypdf')
|
||||
@@ -34,9 +34,10 @@ log = logging.getLogger('ocrmypdf')
|
||||
|
||||
def run(args=None):
|
||||
pre_options, _unused = plugins_only_parser.parse_known_args(args=args)
|
||||
if pre_options.plugins:
|
||||
pm = get_plugin_manager(pre_options)
|
||||
pm.hook.install_cli(parser=parser)
|
||||
plugin_manager = get_plugin_manager(pre_options.plugins)
|
||||
|
||||
parser = get_parser()
|
||||
plugin_manager.hook.install_cli(parser=parser)
|
||||
|
||||
options = parser.parse_args(args=args)
|
||||
|
||||
@@ -68,7 +69,7 @@ def run(args=None):
|
||||
log.error(e)
|
||||
return ExitCode.missing_dependency
|
||||
|
||||
result = run_pipeline(options=options)
|
||||
result = run_pipeline(options=options, plugin_manager=plugin_manager)
|
||||
return result
|
||||
|
||||
|
||||
|
||||
@@ -69,13 +69,19 @@ class PageContext:
|
||||
|
||||
def __getstate__(self):
    """Return a copy of this object's attributes for pickling.

    When a plugin manager is attached, it is removed from the copy and
    replaced by a ``construct_plugin_manager`` entry: a zero-argument
    callable that calls ``get_plugin_manager`` with ``self.options.plugins``.
    When ``plugin_manager`` is ``None`` the copy is returned unchanged.

    Returns:
        dict: the attribute dictionary to be pickled.
    """
    state = self.__dict__.copy()
    if state['plugin_manager'] is not None:
        # NOTE(review): presumably the live manager cannot be pickled, so only
        # the recipe for rebuilding it travels with the state -- confirm.
        del state['plugin_manager']
        state['construct_plugin_manager'] = partial(
            get_plugin_manager, self.options.plugins
        )
    return state
|
||||
|
||||
def __setstate__(self, state):
    """Restore attributes from *state* and rebuild the plugin manager.

    Args:
        state: attribute dictionary. It may contain a
            ``construct_plugin_manager`` entry, a zero-argument callable
            that returns the plugin manager to attach.

    After the call, ``self.plugin_manager`` is the callable's result, or
    ``None`` when *state* carried no such entry. The
    ``construct_plugin_manager`` entry never remains on the instance.
    """
    self.__dict__.update(state)
    # Use pop() with a default rather than an unconditional ``del``: a state
    # without the factory entry must not raise KeyError here.
    construct_plugin_manager = self.__dict__.pop('construct_plugin_manager', None)
    if construct_plugin_manager is not None:
        self.plugin_manager = construct_plugin_manager()
    else:
        self.plugin_manager = None
|
||||
|
||||
|
||||
|
||||
@@ -522,9 +522,11 @@ def create_ocr_image(image, page_context):
|
||||
|
||||
del draw
|
||||
|
||||
im = page_context.plugin_manager.hook.filter_ocr_image(
|
||||
filter_im = page_context.plugin_manager.hook.filter_ocr_image(
|
||||
page=page_context, image=im
|
||||
)
|
||||
if filter_im is not None:
|
||||
im = filter_im
|
||||
|
||||
# Pillow requires integer DPI
|
||||
dpi = tuple(round(coord) for coord in im.info['dpi'])
|
||||
|
||||
@@ -16,17 +16,17 @@
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
import importlib
|
||||
from typing import List
|
||||
|
||||
import pluggy
|
||||
|
||||
from ocrmypdf import pluginspec
|
||||
|
||||
|
||||
def get_plugin_manager(plugins: List[str]):
    """Create the 'ocrmypdf' pluggy plugin manager and register *plugins*.

    Args:
        plugins: module names; each one is imported with
            ``importlib.import_module`` and registered, in the order given.
            An empty list yields a manager with no plugins registered.

    Returns:
        The ``pluggy.PluginManager`` with the ``pluginspec`` hook
        specifications added and every named module registered.
    """
    pm = pluggy.PluginManager('ocrmypdf')
    pm.add_hookspecs(pluginspec)
    for name in plugins:
        module = importlib.import_module(name)
        pm.register(module)
    return pm
|
||||
|
||||
@@ -293,12 +293,14 @@ def configure_debug_logging(log_filename, prefix=''):
|
||||
return log_file_handler
|
||||
|
||||
|
||||
def run_pipeline(options, api=False):
|
||||
def run_pipeline(options, *, plugin_manager, api=False):
|
||||
# Any changes to options will not take effect for options that are already
|
||||
# bound to function parameters in the pipeline. (For example
|
||||
# options.input_file, options.pdf_renderer are already bound.)
|
||||
if not options.jobs:
|
||||
options.jobs = available_cpu_count()
|
||||
if not plugin_manager:
|
||||
plugin_manager = get_plugin_manager([])
|
||||
|
||||
work_folder = mkdtemp(prefix="com.github.ocrmypdf.")
|
||||
debug_log_handler = None
|
||||
@@ -307,7 +309,6 @@ def run_pipeline(options, api=False):
|
||||
):
|
||||
debug_log_handler = configure_debug_logging(Path(work_folder) / "debug.log")
|
||||
|
||||
pm = get_plugin_manager(options)
|
||||
try:
|
||||
check_requested_output_file(options)
|
||||
start_input_file, original_filename = create_input_file(options, work_folder)
|
||||
@@ -320,7 +321,7 @@ def run_pipeline(options, api=False):
|
||||
options,
|
||||
)
|
||||
|
||||
pm.hook.prepare(options=options)
|
||||
plugin_manager.hook.prepare(options=options)
|
||||
|
||||
# Gather pdfinfo and create context
|
||||
pdfinfo = get_pdfinfo(
|
||||
@@ -329,7 +330,7 @@ def run_pipeline(options, api=False):
|
||||
max_workers=options.jobs if not options.use_threads else 1, # To help debug
|
||||
)
|
||||
|
||||
context = PDFContext(options, work_folder, origin_pdf, pdfinfo, pm)
|
||||
context = PDFContext(options, work_folder, origin_pdf, pdfinfo, plugin_manager)
|
||||
|
||||
# Validate options are okay for this pdf
|
||||
validate_pdfinfo_options(context)
|
||||
|
||||
@@ -18,15 +18,17 @@
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from argparse import ArgumentParser
|
||||
from contextlib import suppress
|
||||
from enum import IntEnum
|
||||
from pathlib import Path
|
||||
from typing import Dict, Iterable
|
||||
|
||||
from ._logging import PageNumberFilter, TqdmConsole
|
||||
from ._sync import run_pipeline
|
||||
from ._validation import check_options
|
||||
from .cli import parser
|
||||
from ocrmypdf._logging import PageNumberFilter, TqdmConsole
|
||||
from ocrmypdf._plugin_manager import get_plugin_manager
|
||||
from ocrmypdf._sync import run_pipeline
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import get_parser, plugins_only_parser
|
||||
|
||||
try:
|
||||
import coloredlogs
|
||||
@@ -125,7 +127,13 @@ def configure_logging(
|
||||
return log
|
||||
|
||||
|
||||
def create_options(*, input_file: os.PathLike, output_file: os.PathLike, **kwargs):
|
||||
def create_options(
|
||||
*,
|
||||
input_file: os.PathLike,
|
||||
output_file: os.PathLike,
|
||||
parser: ArgumentParser,
|
||||
**kwargs,
|
||||
):
|
||||
cmdline = []
|
||||
deferred = []
|
||||
|
||||
@@ -223,9 +231,11 @@ def ocr( # pylint: disable=unused-argument
|
||||
user_words: os.PathLike = None,
|
||||
user_patterns: os.PathLike = None,
|
||||
fast_web_view: float = None,
|
||||
plugins: Iterable[str] = None,
|
||||
keep_temporary_files: bool = None,
|
||||
progress_bar: bool = None,
|
||||
tesseract_env: Dict[str, str] = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""Run OCRmyPDF on one PDF or image.
|
||||
|
||||
@@ -260,7 +270,15 @@ def ocr( # pylint: disable=unused-argument
|
||||
Returns:
|
||||
:class:`ocrmypdf.ExitCode`
|
||||
"""
|
||||
if not plugins:
|
||||
plugins = []
|
||||
|
||||
options = create_options(**locals())
|
||||
parser = get_parser()
|
||||
_plugin_manager = get_plugin_manager(plugins)
|
||||
_plugin_manager.hook.install_cli(parser=parser)
|
||||
|
||||
options = create_options(
|
||||
**{k: v for k, v in locals().items() if not k.startswith('_')}
|
||||
)
|
||||
check_options(options)
|
||||
return run_pipeline(options, api=True)
|
||||
return run_pipeline(options=options, plugin_manager=_plugin_manager, api=True)
|
||||
|
||||
@@ -17,10 +17,8 @@
|
||||
|
||||
import argparse
|
||||
|
||||
from ._version import PROGRAM_NAME as _PROGRAM_NAME
|
||||
from ._version import __version__ as _VERSION
|
||||
|
||||
__all__ = ['parser']
|
||||
from ocrmypdf._version import PROGRAM_NAME as _PROGRAM_NAME
|
||||
from ocrmypdf._version import __version__ as _VERSION
|
||||
|
||||
|
||||
def numeric(basetype, min_=None, max_=None):
|
||||
@@ -56,18 +54,19 @@ class ArgumentParser(argparse.ArgumentParser):
|
||||
raise ValueError(message)
|
||||
|
||||
|
||||
parser = ArgumentParser(
|
||||
prog=_PROGRAM_NAME,
|
||||
fromfile_prefix_chars='@',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description="""\
|
||||
def get_parser():
|
||||
parser = ArgumentParser(
|
||||
prog=_PROGRAM_NAME,
|
||||
fromfile_prefix_chars='@',
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
description="""\
|
||||
Generates a searchable PDF or PDF/A from a regular PDF.
|
||||
|
||||
OCRmyPDF rasterizes each page of the input PDF, optionally corrects page
|
||||
rotation and performs image processing, runs the Tesseract OCR engine on the
|
||||
image, and then creates a PDF from the OCR information.
|
||||
""",
|
||||
epilog="""\
|
||||
epilog="""\
|
||||
OCRmyPDF attempts to keep the output file at about the same size. If a file
|
||||
contains losslessly compressed images, and output file will be losslessly
|
||||
compressed as well.
|
||||
@@ -108,395 +107,409 @@ Online documentation is located at:
|
||||
https://ocrmypdf.readthedocs.io/en/latest/introduction.html
|
||||
|
||||
""",
|
||||
)
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'input_file',
|
||||
metavar="input_pdf_or_image",
|
||||
help="PDF file containing the images to be OCRed (or '-' to read from "
|
||||
"standard input)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'output_file',
|
||||
metavar="output_pdf",
|
||||
help="Output searchable PDF file (or '-' to write to standard output). "
|
||||
"Existing files will be ovewritten. If same as input file, the "
|
||||
"input file will be updated only if processing is successful.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'-l',
|
||||
'--language',
|
||||
action='append',
|
||||
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
|
||||
"all language packs installed in your system). Use -l eng+deu for "
|
||||
"multiple languages.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--image-dpi',
|
||||
metavar='DPI',
|
||||
type=int,
|
||||
help="For input image instead of PDF, use this DPI instead of file's.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output-type',
|
||||
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
|
||||
default='pdfa',
|
||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||
"long term archiving (default, recommended) but may not suitable "
|
||||
"for users who want their file altered as little as possible. 'pdfa' "
|
||||
"also has problems with full Unicode text. 'pdf' attempts to "
|
||||
"preserve file contents as much as possible. 'pdf-a1' creates a "
|
||||
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
|
||||
"PDF/A3-b file.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'input_file',
|
||||
metavar="input_pdf_or_image",
|
||||
help="PDF file containing the images to be OCRed (or '-' to read from "
|
||||
"standard input)",
|
||||
)
|
||||
parser.add_argument(
|
||||
'output_file',
|
||||
metavar="output_pdf",
|
||||
help="Output searchable PDF file (or '-' to write to standard output). "
|
||||
"Existing files will be ovewritten. If same as input file, the "
|
||||
"input file will be updated only if processing is successful.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'-l',
|
||||
'--language',
|
||||
action='append',
|
||||
help="Language(s) of the file to be OCRed (see tesseract --list-langs for "
|
||||
"all language packs installed in your system). Use -l eng+deu for "
|
||||
"multiple languages.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--image-dpi',
|
||||
metavar='DPI',
|
||||
type=int,
|
||||
help="For input image instead of PDF, use this DPI instead of file's.",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--output-type',
|
||||
choices=['pdfa', 'pdf', 'pdfa-1', 'pdfa-2', 'pdfa-3'],
|
||||
default='pdfa',
|
||||
help="Choose output type. 'pdfa' creates a PDF/A-2b compliant file for "
|
||||
"long term archiving (default, recommended) but may not suitable "
|
||||
"for users who want their file altered as little as possible. 'pdfa' "
|
||||
"also has problems with full Unicode text. 'pdf' attempts to "
|
||||
"preserve file contents as much as possible. 'pdf-a1' creates a "
|
||||
"PDF/A1-b file. 'pdf-a2' is equivalent to 'pdfa'. 'pdf-a3' creates a "
|
||||
"PDF/A3-b file.",
|
||||
)
|
||||
|
||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||
# since that is the only invalid character for filepaths on all platforms
|
||||
# bool('\0') is True in Python
|
||||
parser.add_argument(
|
||||
'--sidecar',
|
||||
nargs='?',
|
||||
const='\0',
|
||||
default=None,
|
||||
metavar='FILE',
|
||||
help="Generate sidecar text files that contain the same text recognized "
|
||||
"by Tesseract. This may be useful for building a OCR text database. "
|
||||
"If FILE is omitted, the sidecar file be named {output_file}.txt "
|
||||
"If FILE is set to '-', the sidecar is written to stdout (a "
|
||||
"convenient way to preview OCR quality). The output file and sidecar "
|
||||
"may not both use stdout at the same time.",
|
||||
)
|
||||
# Use null string '\0' as sentinel to indicate the user supplied no argument,
|
||||
# since that is the only invalid character for filepaths on all platforms
|
||||
# bool('\0') is True in Python
|
||||
parser.add_argument(
|
||||
'--sidecar',
|
||||
nargs='?',
|
||||
const='\0',
|
||||
default=None,
|
||||
metavar='FILE',
|
||||
help="Generate sidecar text files that contain the same text recognized "
|
||||
"by Tesseract. This may be useful for building a OCR text database. "
|
||||
"If FILE is omitted, the sidecar file be named {output_file}.txt "
|
||||
"If FILE is set to '-', the sidecar is written to stdout (a "
|
||||
"convenient way to preview OCR quality). The output file and sidecar "
|
||||
"may not both use stdout at the same time.",
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--version',
|
||||
action='version',
|
||||
version=_VERSION,
|
||||
help="Print program version and exit",
|
||||
)
|
||||
parser.add_argument(
|
||||
'--version',
|
||||
action='version',
|
||||
version=_VERSION,
|
||||
help="Print program version and exit",
|
||||
)
|
||||
|
||||
jobcontrol = parser.add_argument_group("Job control options")
|
||||
jobcontrol.add_argument(
|
||||
'-j',
|
||||
'--jobs',
|
||||
metavar='N',
|
||||
type=numeric(int, 0, 256),
|
||||
help="Use up to N CPU cores simultaneously (default: use all).",
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'-q', '--quiet', action='store_true', help="Suppress INFO messages"
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'-v',
|
||||
'--verbose',
|
||||
type=numeric(int, 0, 2),
|
||||
default=0,
|
||||
const=1,
|
||||
nargs='?',
|
||||
help="Print more verbose messages for each additional verbose level. Use "
|
||||
"`-v 1` typically for much more detailed logging. Higher numbers "
|
||||
"are probably only useful in debugging.",
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'--no-progress-bar',
|
||||
action='store_false',
|
||||
dest='progress_bar',
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
jobcontrol.add_argument('--use-threads', action='store_true', help=argparse.SUPPRESS)
|
||||
jobcontrol = parser.add_argument_group("Job control options")
|
||||
jobcontrol.add_argument(
|
||||
'-j',
|
||||
'--jobs',
|
||||
metavar='N',
|
||||
type=numeric(int, 0, 256),
|
||||
help="Use up to N CPU cores simultaneously (default: use all).",
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'-q', '--quiet', action='store_true', help="Suppress INFO messages"
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'-v',
|
||||
'--verbose',
|
||||
type=numeric(int, 0, 2),
|
||||
default=0,
|
||||
const=1,
|
||||
nargs='?',
|
||||
help="Print more verbose messages for each additional verbose level. Use "
|
||||
"`-v 1` typically for much more detailed logging. Higher numbers "
|
||||
"are probably only useful in debugging.",
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'--no-progress-bar',
|
||||
action='store_false',
|
||||
dest='progress_bar',
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
jobcontrol.add_argument(
|
||||
'--use-threads', action='store_true', help=argparse.SUPPRESS
|
||||
)
|
||||
|
||||
metadata = parser.add_argument_group(
|
||||
"Metadata options",
|
||||
"Set output PDF/A metadata (default: copy input document's metadata)",
|
||||
)
|
||||
metadata.add_argument(
|
||||
'--title', type=str, help="Set document title (place multiple words in quotes)"
|
||||
)
|
||||
metadata.add_argument('--author', type=str, help="Set document author")
|
||||
metadata.add_argument('--subject', type=str, help="Set document subject description")
|
||||
metadata.add_argument('--keywords', type=str, help="Set document keywords")
|
||||
metadata = parser.add_argument_group(
|
||||
"Metadata options",
|
||||
"Set output PDF/A metadata (default: copy input document's metadata)",
|
||||
)
|
||||
metadata.add_argument(
|
||||
'--title', type=str, help="Set document title (place multiple words in quotes)"
|
||||
)
|
||||
metadata.add_argument('--author', type=str, help="Set document author")
|
||||
metadata.add_argument(
|
||||
'--subject', type=str, help="Set document subject description"
|
||||
)
|
||||
metadata.add_argument('--keywords', type=str, help="Set document keywords")
|
||||
|
||||
preprocessing = parser.add_argument_group(
|
||||
"Image preprocessing options",
|
||||
"Options to improve the quality of the final PDF and OCR",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-r',
|
||||
'--rotate-pages',
|
||||
action='store_true',
|
||||
help="Automatically rotate pages based on detected text orientation",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--remove-background',
|
||||
action='store_true',
|
||||
help="Attempt to remove background from gray or color pages, setting it "
|
||||
"to white ",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-d', '--deskew', action='store_true', help="Deskew each page before performing OCR"
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-c',
|
||||
'--clean',
|
||||
action='store_true',
|
||||
help="Clean pages from scanning artifacts before performing OCR, and send "
|
||||
"the cleaned page to OCR, but do not include the cleaned page in "
|
||||
"the output",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-i',
|
||||
'--clean-final',
|
||||
action='store_true',
|
||||
help="Clean page as above, and incorporate the cleaned image in the final "
|
||||
"PDF. Might remove desired content.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--unpaper-args',
|
||||
type=str,
|
||||
default=None,
|
||||
help="A quoted string of arguments to pass to unpaper. Requires --clean. "
|
||||
"Example: --unpaper-args '--layout double'.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--oversample',
|
||||
metavar='DPI',
|
||||
type=numeric(int, 0, 5000),
|
||||
default=0,
|
||||
help="Oversample images to at least the specified DPI, to improve OCR "
|
||||
"results slightly",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--remove-vectors',
|
||||
action='store_true',
|
||||
help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
|
||||
"will not be included in OCR. This can eliminate false characters.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--threshold',
|
||||
action='store_true',
|
||||
help="EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract for OCR. Can "
|
||||
"improve OCR quality compared to Tesseract's thresholder.",
|
||||
)
|
||||
preprocessing = parser.add_argument_group(
|
||||
"Image preprocessing options",
|
||||
"Options to improve the quality of the final PDF and OCR",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-r',
|
||||
'--rotate-pages',
|
||||
action='store_true',
|
||||
help="Automatically rotate pages based on detected text orientation",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--remove-background',
|
||||
action='store_true',
|
||||
help="Attempt to remove background from gray or color pages, setting it "
|
||||
"to white ",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-d',
|
||||
'--deskew',
|
||||
action='store_true',
|
||||
help="Deskew each page before performing OCR",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-c',
|
||||
'--clean',
|
||||
action='store_true',
|
||||
help="Clean pages from scanning artifacts before performing OCR, and send "
|
||||
"the cleaned page to OCR, but do not include the cleaned page in "
|
||||
"the output",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'-i',
|
||||
'--clean-final',
|
||||
action='store_true',
|
||||
help="Clean page as above, and incorporate the cleaned image in the final "
|
||||
"PDF. Might remove desired content.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--unpaper-args',
|
||||
type=str,
|
||||
default=None,
|
||||
help="A quoted string of arguments to pass to unpaper. Requires --clean. "
|
||||
"Example: --unpaper-args '--layout double'.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--oversample',
|
||||
metavar='DPI',
|
||||
type=numeric(int, 0, 5000),
|
||||
default=0,
|
||||
help="Oversample images to at least the specified DPI, to improve OCR "
|
||||
"results slightly",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--remove-vectors',
|
||||
action='store_true',
|
||||
help="EXPERIMENTAL. Mask out any vector objects in the PDF so that they "
|
||||
"will not be included in OCR. This can eliminate false characters.",
|
||||
)
|
||||
preprocessing.add_argument(
|
||||
'--threshold',
|
||||
action='store_true',
|
||||
help=(
|
||||
"EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract "
|
||||
"for OCR. Can improve OCR quality compared to Tesseract's thresholder."
|
||||
),
|
||||
)
|
||||
|
||||
ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
|
||||
ocrsettings.add_argument(
|
||||
'-f',
|
||||
'--force-ocr',
|
||||
action='store_true',
|
||||
help="Rasterize any text or vector objects on each page, apply OCR, and "
|
||||
"save the rastered output (this rewrites the PDF)",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'-s',
|
||||
'--skip-text',
|
||||
action='store_true',
|
||||
help="Skip OCR on any pages that already contain text, but include the "
|
||||
"page in final output; useful for PDFs that contain a mix of "
|
||||
"images, text pages, and/or previously OCRed pages",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'--redo-ocr',
|
||||
action='store_true',
|
||||
help="Attempt to detect and remove the hidden OCR layer from files that "
|
||||
"were previously OCRed with OCRmyPDF or another program. Apply OCR "
|
||||
"to text found in raster images. Existing visible text objects will "
|
||||
"not be changed. If there is no existing OCR, OCR will be added.",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'--skip-big',
|
||||
type=numeric(float, 0, 5000),
|
||||
metavar='MPixels',
|
||||
help="Skip OCR on pages larger than the specified amount of megapixels, "
|
||||
"but include skipped pages in final output",
|
||||
)
|
||||
ocrsettings = parser.add_argument_group("OCR options", "Control how OCR is applied")
|
||||
ocrsettings.add_argument(
|
||||
'-f',
|
||||
'--force-ocr',
|
||||
action='store_true',
|
||||
help="Rasterize any text or vector objects on each page, apply OCR, and "
|
||||
"save the rastered output (this rewrites the PDF)",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'-s',
|
||||
'--skip-text',
|
||||
action='store_true',
|
||||
help="Skip OCR on any pages that already contain text, but include the "
|
||||
"page in final output; useful for PDFs that contain a mix of "
|
||||
"images, text pages, and/or previously OCRed pages",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'--redo-ocr',
|
||||
action='store_true',
|
||||
help="Attempt to detect and remove the hidden OCR layer from files that "
|
||||
"were previously OCRed with OCRmyPDF or another program. Apply OCR "
|
||||
"to text found in raster images. Existing visible text objects will "
|
||||
"not be changed. If there is no existing OCR, OCR will be added.",
|
||||
)
|
||||
ocrsettings.add_argument(
|
||||
'--skip-big',
|
||||
type=numeric(float, 0, 5000),
|
||||
metavar='MPixels',
|
||||
help="Skip OCR on pages larger than the specified amount of megapixels, "
|
||||
"but include skipped pages in final output",
|
||||
)
|
||||
|
||||
optimizing = parser.add_argument_group(
|
||||
"Optimization options", "Control how the PDF is optimized after OCR"
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'-O',
|
||||
'--optimize',
|
||||
type=int,
|
||||
choices=range(0, 4),
|
||||
default=1,
|
||||
help=(
|
||||
"Control how PDF is optimized after processing:"
|
||||
"0 - do not optimize; "
|
||||
"1 - do safe, lossless optimizations (default); "
|
||||
"2 - do some lossy optimizations; "
|
||||
"3 - do aggressive lossy optimizations (including lossy JBIG2)"
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jpeg-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
help=(
|
||||
"Adjust JPEG quality level for JPEG optimization. "
|
||||
"100 is best quality and largest output size; "
|
||||
"1 is lowest quality and smallest output; "
|
||||
"0 uses the default."
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jpg-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
dest='jpeg_quality',
|
||||
help=argparse.SUPPRESS, # Alias for --jpeg-quality
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--png-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
help=(
|
||||
"Adjust PNG quality level to use when quantizing PNGs. "
|
||||
"Values have same meaning as with --jpeg-quality"
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jbig2-lossy',
|
||||
action='store_true',
|
||||
help=(
|
||||
"Enable JBIG2 lossy mode (better compression, not suitable for some "
|
||||
"use cases - see documentation)."
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jbig2-page-group-size',
|
||||
type=numeric(int, 1, 10000),
|
||||
default=0,
|
||||
metavar='N',
|
||||
# Adjust number of pages to consider at once for JBIG2 compression
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
optimizing = parser.add_argument_group(
|
||||
"Optimization options", "Control how the PDF is optimized after OCR"
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'-O',
|
||||
'--optimize',
|
||||
type=int,
|
||||
choices=range(0, 4),
|
||||
default=1,
|
||||
help=(
|
||||
"Control how PDF is optimized after processing:"
|
||||
"0 - do not optimize; "
|
||||
"1 - do safe, lossless optimizations (default); "
|
||||
"2 - do some lossy optimizations; "
|
||||
"3 - do aggressive lossy optimizations (including lossy JBIG2)"
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jpeg-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
help=(
|
||||
"Adjust JPEG quality level for JPEG optimization. "
|
||||
"100 is best quality and largest output size; "
|
||||
"1 is lowest quality and smallest output; "
|
||||
"0 uses the default."
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jpg-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
dest='jpeg_quality',
|
||||
help=argparse.SUPPRESS, # Alias for --jpeg-quality
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--png-quality',
|
||||
type=numeric(int, 0, 100),
|
||||
default=0,
|
||||
metavar='Q',
|
||||
help=(
|
||||
"Adjust PNG quality level to use when quantizing PNGs. "
|
||||
"Values have same meaning as with --jpeg-quality"
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jbig2-lossy',
|
||||
action='store_true',
|
||||
help=(
|
||||
"Enable JBIG2 lossy mode (better compression, not suitable for some "
|
||||
"use cases - see documentation)."
|
||||
),
|
||||
)
|
||||
optimizing.add_argument(
|
||||
'--jbig2-page-group-size',
|
||||
type=numeric(int, 1, 10000),
|
||||
default=0,
|
||||
metavar='N',
|
||||
# Adjust number of pages to consider at once for JBIG2 compression
|
||||
help=argparse.SUPPRESS,
|
||||
)
|
||||
|
||||
advanced = parser.add_argument_group(
|
||||
"Advanced", "Advanced options to control Tesseract's OCR behavior"
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pages',
|
||||
type=str,
|
||||
help="Limit OCR to the specified pages (ranges or comma separated), skipping others",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--max-image-mpixels',
|
||||
action='store',
|
||||
type=numeric(float, 0),
|
||||
metavar='MPixels',
|
||||
help="Set maximum number of pixels to unpack before treating an image as a "
|
||||
"decompression bomb",
|
||||
default=128.0,
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-config',
|
||||
action='append',
|
||||
metavar='CFG',
|
||||
default=[],
|
||||
help="Additional Tesseract configuration files -- see documentation",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-pagesegmode',
|
||||
action='store',
|
||||
type=int,
|
||||
metavar='PSM',
|
||||
choices=range(0, 14),
|
||||
help="Set Tesseract page segmentation mode (see tesseract --help)",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-oem',
|
||||
action='store',
|
||||
type=int,
|
||||
metavar='MODE',
|
||||
choices=range(0, 4),
|
||||
help=(
|
||||
"Set Tesseract 4.0 OCR engine mode: "
|
||||
"0 - original Tesseract only; "
|
||||
"1 - neural nets LSTM only; "
|
||||
"2 - Tesseract + LSTM; "
|
||||
"3 - default."
|
||||
),
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdf-renderer',
|
||||
choices=['auto', 'hocr', 'sandwich'],
|
||||
default='auto',
|
||||
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
|
||||
"choose. See documentation for discussion.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-timeout',
|
||||
default=180.0,
|
||||
type=numeric(float, 0),
|
||||
metavar='SECONDS',
|
||||
help='Give up on OCR after the timeout, but copy the preprocessed page '
|
||||
'into the final output',
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--rotate-pages-threshold',
|
||||
default=14.0,
|
||||
type=numeric(float, 0, 1000),
|
||||
metavar='CONFIDENCE',
|
||||
help="Only rotate pages when confidence is above this value (arbitrary "
|
||||
"units reported by tesseract)",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdfa-image-compression',
|
||||
choices=['auto', 'jpeg', 'lossless'],
|
||||
default='auto',
|
||||
help="Specify how to compress images in the output PDF/A. 'auto' lets "
|
||||
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
|
||||
"JPEG compression. 'lossless' uses PNG-style lossless compression "
|
||||
"for all images. Monochrome images are always compressed using a "
|
||||
"lossless codec. Compression settings "
|
||||
"are applied to all pages, including those for which OCR was "
|
||||
"skipped. Not supported for --output-type=pdf ; that setting "
|
||||
"preserves the original compression of all images.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--user-words',
|
||||
metavar='FILE',
|
||||
help="Specify the location of the Tesseract user words file. This is a "
|
||||
"list of words Tesseract should consider while performing OCR in "
|
||||
"addition to its standard language dictionaries. This can improve "
|
||||
"OCR quality especially for specialized and technical documents.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--user-patterns',
|
||||
metavar='FILE',
|
||||
help="Specify the location of the Tesseract user patterns file.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--fast-web-view',
|
||||
type=numeric(float, 0),
|
||||
default=1.0,
|
||||
metavar="MEGABYTES",
|
||||
help="If the size of file is more than this threshold (in MB), then "
|
||||
"linearize the PDF for fast web viewing. This allows the PDF to be "
|
||||
"displayed before it is fully downloaded in web browsers, but increases "
|
||||
"the space required slightly. By default we skip this for small files "
|
||||
"which do not benefit. If the threshold is 0 it will be apply to all files. "
|
||||
"Set the threshold very high to disable.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--plugins',
|
||||
action='append',
|
||||
default=[],
|
||||
help="Path to a folder than contains plugins.",
|
||||
)
|
||||
advanced = parser.add_argument_group(
|
||||
"Advanced", "Advanced options to control Tesseract's OCR behavior"
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pages',
|
||||
type=str,
|
||||
help=(
|
||||
"Limit OCR to the specified pages (ranges or comma separated), "
|
||||
"skipping others",
|
||||
),
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--max-image-mpixels',
|
||||
action='store',
|
||||
type=numeric(float, 0),
|
||||
metavar='MPixels',
|
||||
help="Set maximum number of pixels to unpack before treating an image as a "
|
||||
"decompression bomb",
|
||||
default=128.0,
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-config',
|
||||
action='append',
|
||||
metavar='CFG',
|
||||
default=[],
|
||||
help="Additional Tesseract configuration files -- see documentation",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-pagesegmode',
|
||||
action='store',
|
||||
type=int,
|
||||
metavar='PSM',
|
||||
choices=range(0, 14),
|
||||
help="Set Tesseract page segmentation mode (see tesseract --help)",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-oem',
|
||||
action='store',
|
||||
type=int,
|
||||
metavar='MODE',
|
||||
choices=range(0, 4),
|
||||
help=(
|
||||
"Set Tesseract 4.0 OCR engine mode: "
|
||||
"0 - original Tesseract only; "
|
||||
"1 - neural nets LSTM only; "
|
||||
"2 - Tesseract + LSTM; "
|
||||
"3 - default."
|
||||
),
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdf-renderer',
|
||||
choices=['auto', 'hocr', 'sandwich'],
|
||||
default='auto',
|
||||
help="Choose OCR PDF renderer - the default option is to let OCRmyPDF "
|
||||
"choose. See documentation for discussion.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--tesseract-timeout',
|
||||
default=180.0,
|
||||
type=numeric(float, 0),
|
||||
metavar='SECONDS',
|
||||
help='Give up on OCR after the timeout, but copy the preprocessed page '
|
||||
'into the final output',
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--rotate-pages-threshold',
|
||||
default=14.0,
|
||||
type=numeric(float, 0, 1000),
|
||||
metavar='CONFIDENCE',
|
||||
help="Only rotate pages when confidence is above this value (arbitrary "
|
||||
"units reported by tesseract)",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--pdfa-image-compression',
|
||||
choices=['auto', 'jpeg', 'lossless'],
|
||||
default='auto',
|
||||
help="Specify how to compress images in the output PDF/A. 'auto' lets "
|
||||
"OCRmyPDF decide. 'jpeg' changes all grayscale and color images to "
|
||||
"JPEG compression. 'lossless' uses PNG-style lossless compression "
|
||||
"for all images. Monochrome images are always compressed using a "
|
||||
"lossless codec. Compression settings "
|
||||
"are applied to all pages, including those for which OCR was "
|
||||
"skipped. Not supported for --output-type=pdf ; that setting "
|
||||
"preserves the original compression of all images.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--user-words',
|
||||
metavar='FILE',
|
||||
help="Specify the location of the Tesseract user words file. This is a "
|
||||
"list of words Tesseract should consider while performing OCR in "
|
||||
"addition to its standard language dictionaries. This can improve "
|
||||
"OCR quality especially for specialized and technical documents.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--user-patterns',
|
||||
metavar='FILE',
|
||||
help="Specify the location of the Tesseract user patterns file.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--fast-web-view',
|
||||
type=numeric(float, 0),
|
||||
default=1.0,
|
||||
metavar="MEGABYTES",
|
||||
help="If the size of file is more than this threshold (in MB), then "
|
||||
"linearize the PDF for fast web viewing. This allows the PDF to be "
|
||||
"displayed before it is fully downloaded in web browsers, but increases "
|
||||
"the space required slightly. By default we skip this for small files "
|
||||
"which do not benefit. If the threshold is 0 it will be apply to all files. "
|
||||
"Set the threshold very high to disable.",
|
||||
)
|
||||
advanced.add_argument(
|
||||
'--plugins',
|
||||
action='append',
|
||||
default=[],
|
||||
help="Path to a folder than contains plugins.",
|
||||
)
|
||||
|
||||
debugging = parser.add_argument_group(
|
||||
"Debugging", "Arguments to help with troubleshooting and debugging"
|
||||
)
|
||||
debugging.add_argument(
|
||||
'-k',
|
||||
'--keep-temporary-files',
|
||||
action='store_true',
|
||||
help="Keep temporary files (helpful for debugging)",
|
||||
)
|
||||
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
|
||||
return parser
|
||||
|
||||
debugging = parser.add_argument_group(
|
||||
"Debugging", "Arguments to help with troubleshooting and debugging"
|
||||
)
|
||||
debugging.add_argument(
|
||||
'-k',
|
||||
'--keep-temporary-files',
|
||||
action='store_true',
|
||||
help="Keep temporary files (helpful for debugging)",
|
||||
)
|
||||
debugging.add_argument('--tesseract-env', type=str, help=argparse.SUPPRESS)
|
||||
|
||||
plugins_only_parser = ArgumentParser(
|
||||
prog=_PROGRAM_NAME, fromfile_prefix_chars='@', add_help=False, allow_abbrev=False
|
||||
|
||||
@@ -582,7 +582,7 @@ def main(infile, outfile, level, jobs=1):
|
||||
)
|
||||
|
||||
with TemporaryDirectory() as td:
|
||||
context = PDFContext(options, td, infile, None)
|
||||
context = PDFContext(options, td, infile, None, None)
|
||||
tmpout = Path(td) / 'out.pdf'
|
||||
optimize(
|
||||
infile,
|
||||
|
||||
@@ -214,7 +214,7 @@ def no_outpdf(tmp_path):
|
||||
def check_ocrmypdf(input_file, output_file, *args, env=None):
|
||||
"""Run ocrmypdf and confirmed that a valid file was created"""
|
||||
|
||||
options = cli.parser.parse_args(
|
||||
options = cli.get_parser().parse_args(
|
||||
[str(input_file), str(output_file)]
|
||||
+ [str(arg) for arg in args if arg is not None]
|
||||
)
|
||||
@@ -222,7 +222,7 @@ def check_ocrmypdf(input_file, output_file, *args, env=None):
|
||||
if env:
|
||||
options.tesseract_env = env
|
||||
options.tesseract_env['_OCRMYPDF_TEST_INFILE'] = os.fspath(input_file)
|
||||
result = api.run_pipeline(options, api=True)
|
||||
result = api.run_pipeline(options, plugin_manager=None, api=True)
|
||||
|
||||
assert result == 0
|
||||
assert os.path.exists(str(output_file)), "Output file not created"
|
||||
@@ -238,7 +238,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
|
||||
Does not currently have a way to manipulate the PATH except for Tesseract.
|
||||
"""
|
||||
|
||||
options = cli.parser.parse_args(
|
||||
options = cli.get_parser().parse_args(
|
||||
[str(input_file), str(output_file)]
|
||||
+ [str(arg) for arg in args if arg is not None]
|
||||
)
|
||||
@@ -253,7 +253,7 @@ def run_ocrmypdf_api(input_file, output_file, *args, env=None):
|
||||
if options.tesseract_env:
|
||||
assert all(isinstance(v, (str, bytes)) for v in options.tesseract_env.values())
|
||||
|
||||
return api.run_pipeline(options, api=False)
|
||||
return api.run_pipeline(options, plugin_manager=None, api=False)
|
||||
|
||||
|
||||
@pytest.helpers.register
|
||||
|
||||
@@ -32,7 +32,7 @@ from pikepdf.models.metadata import decode_pdf_date
|
||||
|
||||
from ocrmypdf._jobcontext import PDFContext
|
||||
from ocrmypdf._pipeline import convert_to_pdfa
|
||||
from ocrmypdf.cli import parser
|
||||
from ocrmypdf.cli import get_parser
|
||||
from ocrmypdf.exceptions import ExitCode
|
||||
from ocrmypdf.pdfa import SRGB_ICC_PROFILE, file_claims_pdfa, generate_pdfa_ps
|
||||
from ocrmypdf.pdfinfo import PdfInfo
|
||||
@@ -290,16 +290,15 @@ def test_kodak_toc(resources, outpdf, spoof_tesseract_noop):
|
||||
|
||||
|
||||
def test_metadata_fixup_warning(resources, outdir, caplog):
|
||||
from ocrmypdf.__main__ import parser
|
||||
from ocrmypdf._pipeline import metadata_fixup
|
||||
|
||||
options = parser.parse_args(
|
||||
options = get_parser().parse_args(
|
||||
args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
|
||||
)
|
||||
|
||||
copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')
|
||||
|
||||
context = PDFContext(options, outdir, outdir / 'graph.pdf', None)
|
||||
context = PDFContext(options, outdir, outdir / 'graph.pdf', None, None)
|
||||
metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
|
||||
for record in caplog.records:
|
||||
assert record.levelname != 'WARNING'
|
||||
@@ -310,7 +309,7 @@ def test_metadata_fixup_warning(resources, outdir, caplog):
|
||||
meta['prism2:publicationName'] = 'OCRmyPDF Test'
|
||||
graph.save(outdir / 'graph_mod.pdf')
|
||||
|
||||
context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None)
|
||||
context = PDFContext(options, outdir, outdir / 'graph_mod.pdf', None, None)
|
||||
metadata_fixup(working_file=outdir / 'graph.pdf', context=context)
|
||||
assert any(record.levelname == 'WARNING' for record in caplog.records)
|
||||
|
||||
@@ -326,11 +325,11 @@ def test_prevent_gs_invalid_xml(resources, outdir):
|
||||
Title=b'String with trailing nul\x00'
|
||||
)
|
||||
|
||||
options = parser.parse_args(
|
||||
options = get_parser().parse_args(
|
||||
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
|
||||
)
|
||||
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
|
||||
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
|
||||
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
|
||||
|
||||
convert_to_pdfa(
|
||||
str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
|
||||
@@ -357,11 +356,11 @@ def test_malformed_docinfo(caplog, resources, outdir):
|
||||
pike.trailer.Info = pikepdf.Stream(pike, b"<xml></xml>")
|
||||
pike.save(outdir / 'layers.rendered.pdf', fix_metadata_version=False)
|
||||
|
||||
options = parser.parse_args(
|
||||
options = get_parser().parse_args(
|
||||
args=['-j', '1', '--output-type', 'pdfa-2', 'a.pdf', 'b.pdf']
|
||||
)
|
||||
pdfinfo = PdfInfo(outdir / 'layers.rendered.pdf')
|
||||
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo)
|
||||
context = PDFContext(options, outdir, outdir / 'layers.rendered.pdf', pdfinfo, None)
|
||||
|
||||
convert_to_pdfa(
|
||||
str(outdir / 'layers.rendered.pdf'), str(outdir / 'pdfa.ps'), context
|
||||
|
||||
@@ -21,7 +21,7 @@ from unittest.mock import patch
|
||||
import pytest
|
||||
|
||||
from ocrmypdf._validation import check_options
|
||||
from ocrmypdf.cli import parser
|
||||
from ocrmypdf.cli import get_parser
|
||||
from ocrmypdf.exceptions import ExitCode, MissingDependencyError
|
||||
from ocrmypdf.exec import unpaper
|
||||
|
||||
@@ -51,7 +51,7 @@ def spoof_unpaper_oldversion(tmp_path_factory):
|
||||
def test_no_unpaper(resources, no_outpdf):
|
||||
input_ = fspath(resources / "c02-22.pdf")
|
||||
output = fspath(no_outpdf)
|
||||
options = parser.parse_args(args=["--clean", input_, output])
|
||||
options = get_parser().parse_args(args=["--clean", input_, output])
|
||||
|
||||
with patch("ocrmypdf.exec.unpaper.version") as mock_unpaper_version:
|
||||
mock_unpaper_version.side_effect = FileNotFoundError("unpaper")
|
||||
|
||||
@@ -23,6 +23,7 @@ import pytest
|
||||
|
||||
import ocrmypdf._validation as vd
|
||||
from ocrmypdf.api import create_options
|
||||
from ocrmypdf.cli import get_parser
|
||||
from ocrmypdf.exceptions import BadArgsError, MissingDependencyError
|
||||
from ocrmypdf.pdfinfo import PdfInfo
|
||||
|
||||
@@ -30,7 +31,9 @@ from ocrmypdf.pdfinfo import PdfInfo
|
||||
def make_opts(input_file='a.pdf', output_file='b.pdf', language='eng', **kwargs):
|
||||
if language is not None:
|
||||
kwargs['language'] = language
|
||||
return create_options(input_file=input_file, output_file=output_file, **kwargs)
|
||||
return create_options(
|
||||
input_file=input_file, output_file=output_file, parser=get_parser(), **kwargs
|
||||
)
|
||||
|
||||
|
||||
def test_hocr_notlatin_warning(caplog):
|
||||
|
||||
Reference in New Issue
Block a user