Pull JobContext out of pipeline.py to avoid a circular import

This commit is contained in:
James R. Barlow
2018-05-14 14:01:25 -07:00
parent 87a7d4d1a8
commit f0a56592e2
4 changed files with 98 additions and 76 deletions

View File

@@ -36,8 +36,8 @@ import ruffus.ruffus_exceptions as ruffus_exceptions
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
from .pipeline import JobContext, JobContextManager, \
cleanup_working_files, build_pipeline
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
from .pipeline import build_pipeline
from .pdfa import file_claims_pdfa
from .helpers import is_iterable_notstr, re_symlink, is_file_writable, \
available_cpu_count

View File

@@ -0,0 +1,84 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from contextlib import suppress
from multiprocessing.managers import SyncManager
import sys
import shutil
from .pdfinfo import PdfInfo
class JobContext:
    """Shared state for one run of the pipeline, hosted in a manager process.

    A multiprocessing manager spawns a separate process that owns the
    master copy of this object; worker processes reach it only through
    proxy objects.  Plain getter/setter methods are used instead of
    @property to keep it obvious at every call site that each access
    marshals data across a process boundary.
    """

    def __init__(self):
        self.pdfinfo = None
        self.options = None
        self.work_folder = None
        self.rotations = {}

    def generate_pdfinfo(self, infile):
        """Parse *infile* and record the resulting PdfInfo."""
        self.pdfinfo = PdfInfo(infile)

    def get_pdfinfo(self):
        """Return what we know about the input PDF."""
        return self.pdfinfo

    def set_pdfinfo(self, pdfinfo):
        self.pdfinfo = pdfinfo

    def get_options(self):
        """Return the options object for this run."""
        return self.options

    def set_options(self, options):
        self.options = options

    def get_work_folder(self):
        """Return the path of the temporary working folder."""
        return self.work_folder

    def set_work_folder(self, work_folder):
        self.work_folder = work_folder

    def get_rotation(self, pageno):
        """Return the rotation recorded for *pageno*; 0 if none was set."""
        if pageno in self.rotations:
            return self.rotations[pageno]
        return 0

    def set_rotation(self, pageno, value):
        self.rotations[pageno] = value
class JobContextManager(SyncManager):
    """Multiprocessing manager used to host the shared JobContext."""
def cleanup_working_files(work_folder, options):
    """Delete the temporary working folder unless the user wants it kept.

    When options.keep_temporary_files is set, report the folder's
    location on stderr instead of removing it.  A folder that is
    already gone is not an error.
    """
    if not options.keep_temporary_files:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
        return
    print(f"Temporary working files saved at:\n{work_folder}", file=sys.stderr)

View File

@@ -18,14 +18,18 @@
from pathlib import Path
from subprocess import CalledProcessError
import concurrent.futures
from collections import defaultdict
from collections import defaultdict, namedtuple
import struct
import logging
import sys
from io import BytesIO
from PIL import Image
from .lib import fitz
import pikepdf
from ._jobcontext import JobContext
from . import leptonica
from .helpers import re_symlink
from .exec import pngquant, jbig2enc
@@ -446,18 +450,18 @@ def optimize(
re_symlink(target_file, output_file, log)
if __name__ == '__main__':
import logging
import sys
from .pipeline import JobContext
from collections import namedtuple
def main(infile, outfile, level, jobs=1):
Options = namedtuple('Options', 'jobs optimize')
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()
ctx = JobContext()
options = Options(jobs=4, optimize=3)
options = Options(jobs=jobs, optimize=int(level))
ctx.set_options(options)
optimize(sys.argv[1], sys.argv[2], log, ctx)
optimize(infile, outfile, log, ctx)
if __name__ == '__main__':
main(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -48,76 +48,10 @@ from ._optimize import optimize
VECTOR_PAGE_DPI = 400
# -------------
# Pipeline state manager
class JobContext:
"""Holds our context for a particular run of the pipeline
A multiprocessing manager effectively creates a separate process
that keeps the master job context object. Other threads access
job context via multiprocessing proxy objects.
While this would naturally lend itself @property's it seems to make
a little more sense to use functions to make it explicitly that the
invocation requires marshalling data across a process boundary.
"""
def __init__(self):
self.pdfinfo = None
self.options = None
self.work_folder = None
self.rotations = {}
def generate_pdfinfo(self, infile):
self.pdfinfo = PdfInfo(infile)
def get_pdfinfo(self):
"What we know about the input PDF"
return self.pdfinfo
def set_pdfinfo(self, pdfinfo):
self.pdfinfo = pdfinfo
def get_options(self):
return self.options
def set_options(self, options):
self.options = options
def get_work_folder(self):
return self.work_folder
def set_work_folder(self, work_folder):
self.work_folder = work_folder
def get_rotation(self, pageno):
return self.rotations.get(pageno, 0)
def set_rotation(self, pageno, value):
self.rotations[pageno] = value
from multiprocessing.managers import SyncManager
class JobContextManager(SyncManager):
pass
def cleanup_working_files(work_folder, options):
if options.keep_temporary_files:
print("Temporary working files saved at:\n{0}".format(work_folder),
file=sys.stderr)
else:
with suppress(FileNotFoundError):
shutil.rmtree(work_folder)
#
# The Pipeline
#
def triage_image_file(input_file, output_file, log, options):
try:
log.info("Input file is not a PDF, checking if it is an image...")