mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-05 13:16:55 -04:00
Pull JobContext out of pipeline.py to avoid circular reference
This commit is contained in:
@@ -36,8 +36,8 @@ import ruffus.ruffus_exceptions as ruffus_exceptions
|
||||
import ruffus.cmdline as cmdline
|
||||
import ruffus.proxy_logger as proxy_logger
|
||||
|
||||
from .pipeline import JobContext, JobContextManager, \
|
||||
cleanup_working_files, build_pipeline
|
||||
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
|
||||
from .pipeline import build_pipeline
|
||||
from .pdfa import file_claims_pdfa
|
||||
from .helpers import is_iterable_notstr, re_symlink, is_file_writable, \
|
||||
available_cpu_count
|
||||
|
||||
84
src/ocrmypdf/_jobcontext.py
Normal file
84
src/ocrmypdf/_jobcontext.py
Normal file
@@ -0,0 +1,84 @@
|
||||
# © 2018 James R. Barlow: github.com/jbarlow83
|
||||
#
|
||||
# This file is part of OCRmyPDF.
|
||||
#
|
||||
# OCRmyPDF is free software: you can redistribute it and/or modify
|
||||
# it under the terms of the GNU General Public License as published by
|
||||
# the Free Software Foundation, either version 3 of the License, or
|
||||
# (at your option) any later version.
|
||||
#
|
||||
# OCRmyPDF is distributed in the hope that it will be useful,
|
||||
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
# GNU General Public License for more details.
|
||||
#
|
||||
# You should have received a copy of the GNU General Public License
|
||||
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
|
||||
|
||||
from contextlib import suppress
|
||||
from multiprocessing.managers import SyncManager
|
||||
import sys
|
||||
import shutil
|
||||
|
||||
from .pdfinfo import PdfInfo
|
||||
|
||||
|
||||
class JobContext:
    """Holds our context for a particular run of the pipeline.

    A multiprocessing manager effectively creates a separate process
    that keeps the master job context object.  Other threads access
    job context via multiprocessing proxy objects.

    While this would naturally lend itself to @property's, it seems to
    make a little more sense to use functions to make it explicit that
    the invocation requires marshalling data across a process boundary.
    """

    def __init__(self):
        self.pdfinfo = None      # filled in by generate_pdfinfo()/set_pdfinfo()
        self.options = None      # filled in by set_options()
        self.work_folder = None  # filled in by set_work_folder()
        self.rotations = {}      # pageno -> value; missing pages read as 0

    def generate_pdfinfo(self, infile):
        """Construct a PdfInfo from *infile* and store it on the context."""
        self.pdfinfo = PdfInfo(infile)

    def get_pdfinfo(self):
        """Return what we know about the input PDF (None until set)."""
        return self.pdfinfo

    def set_pdfinfo(self, pdfinfo):
        """Replace the stored pdfinfo object."""
        self.pdfinfo = pdfinfo

    def get_options(self):
        """Return the stored options object (None until set)."""
        return self.options

    def set_options(self, options):
        """Store the options object for this run."""
        self.options = options

    def get_work_folder(self):
        """Return the stored work folder (None until set)."""
        return self.work_folder

    def set_work_folder(self, work_folder):
        """Store the work folder for this run."""
        self.work_folder = work_folder

    def get_rotation(self, pageno):
        """Return the rotation recorded for *pageno*, or 0 if none was set."""
        return self.rotations.get(pageno, 0)

    def set_rotation(self, pageno, value):
        """Record *value* as the rotation for *pageno*."""
        self.rotations[pageno] = value
|
||||
|
||||
|
||||
class JobContextManager(SyncManager):
    """SyncManager subclass that adds nothing of its own.

    NOTE(review): presumably JobContext is registered on this manager at
    the call site so it can be shared across processes -- confirm there.
    """
|
||||
|
||||
|
||||
def cleanup_working_files(work_folder, options):
    """Delete the work folder, or report its location if it is being kept.

    Args:
        work_folder: path of the working directory to remove.
        options: object exposing a ``keep_temporary_files`` attribute; when
            it is truthy the folder is left in place and its path is
            printed to stderr.
    """
    if options.keep_temporary_files:
        print("Temporary working files saved at:\n{0}".format(work_folder),
              file=sys.stderr)
    else:
        # A folder that is already gone is not an error.
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
|
||||
@@ -18,14 +18,18 @@
|
||||
from pathlib import Path
|
||||
from subprocess import CalledProcessError
|
||||
import concurrent.futures
|
||||
from collections import defaultdict
|
||||
from collections import defaultdict, namedtuple
|
||||
import struct
|
||||
import logging
|
||||
import sys
|
||||
|
||||
from io import BytesIO
|
||||
from PIL import Image
|
||||
|
||||
from .lib import fitz
|
||||
import pikepdf
|
||||
|
||||
from ._jobcontext import JobContext
|
||||
from . import leptonica
|
||||
from .helpers import re_symlink
|
||||
from .exec import pngquant, jbig2enc
|
||||
@@ -446,18 +450,18 @@ def optimize(
|
||||
re_symlink(target_file, output_file, log)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
import logging
|
||||
import sys
|
||||
from .pipeline import JobContext
|
||||
from collections import namedtuple
|
||||
def main(infile, outfile, level, jobs=1):
    """Run optimize() on one file with a minimal, ad-hoc job context.

    Args:
        infile: input path forwarded to optimize().
        outfile: output path forwarded to optimize().
        level: optimization level; converted with int() before use.
        jobs: value stored in the ``jobs`` option (defaults to 1).
    """
    # Only the two fields set here are provided on the options object.
    Options = namedtuple('Options', 'jobs optimize')

    logging.basicConfig(level=logging.DEBUG)
    log = logging.getLogger()

    ctx = JobContext()
    # Use the caller's arguments; the stale hard-coded
    # Options(jobs=4, optimize=3) and the sys.argv-based optimize() call
    # are dropped so optimize() runs exactly once with these parameters.
    options = Options(jobs=jobs, optimize=int(level))
    ctx.set_options(options)

    optimize(infile, outfile, log, ctx)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main(sys.argv[1], sys.argv[2], sys.argv[3])
|
||||
@@ -48,76 +48,10 @@ from ._optimize import optimize
|
||||
|
||||
VECTOR_PAGE_DPI = 400
|
||||
|
||||
# -------------
|
||||
# Pipeline state manager
|
||||
|
||||
class JobContext:
    """Holds our context for a particular run of the pipeline.

    A multiprocessing manager effectively creates a separate process
    that keeps the master job context object.  Other threads access
    job context via multiprocessing proxy objects.

    While this would naturally lend itself to @property's, it seems to
    make a little more sense to use functions to make it explicit that
    the invocation requires marshalling data across a process boundary.
    """

    def __init__(self):
        self.pdfinfo = None      # filled in by generate_pdfinfo()/set_pdfinfo()
        self.options = None      # filled in by set_options()
        self.work_folder = None  # filled in by set_work_folder()
        self.rotations = {}      # pageno -> value; missing pages read as 0

    def generate_pdfinfo(self, infile):
        """Construct a PdfInfo from *infile* and store it on the context."""
        self.pdfinfo = PdfInfo(infile)

    def get_pdfinfo(self):
        """Return what we know about the input PDF (None until set)."""
        return self.pdfinfo

    def set_pdfinfo(self, pdfinfo):
        """Replace the stored pdfinfo object."""
        self.pdfinfo = pdfinfo

    def get_options(self):
        """Return the stored options object (None until set)."""
        return self.options

    def set_options(self, options):
        """Store the options object for this run."""
        self.options = options

    def get_work_folder(self):
        """Return the stored work folder (None until set)."""
        return self.work_folder

    def set_work_folder(self, work_folder):
        """Store the work folder for this run."""
        self.work_folder = work_folder

    def get_rotation(self, pageno):
        """Return the rotation recorded for *pageno*, or 0 if none was set."""
        return self.rotations.get(pageno, 0)

    def set_rotation(self, pageno, value):
        """Record *value* as the rotation for *pageno*."""
        self.rotations[pageno] = value
|
||||
|
||||
|
||||
from multiprocessing.managers import SyncManager
|
||||
class JobContextManager(SyncManager):
    """SyncManager subclass that adds nothing of its own.

    NOTE(review): presumably JobContext is registered on this manager at
    the call site so it can be shared across processes -- confirm there.
    """
|
||||
|
||||
|
||||
def cleanup_working_files(work_folder, options):
    """Delete the work folder, or report its location if it is being kept.

    Args:
        work_folder: path of the working directory to remove.
        options: object exposing a ``keep_temporary_files`` attribute; when
            it is truthy the folder is left in place and its path is
            printed to stderr.
    """
    if options.keep_temporary_files:
        print("Temporary working files saved at:\n{0}".format(work_folder),
              file=sys.stderr)
    else:
        # A folder that is already gone is not an error.
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
|
||||
|
||||
|
||||
#
|
||||
# The Pipeline
|
||||
#
|
||||
|
||||
|
||||
def triage_image_file(input_file, output_file, log, options):
|
||||
try:
|
||||
log.info("Input file is not a PDF, checking if it is an image...")
|
||||
|
||||
Reference in New Issue
Block a user