From f0a56592e294d66cef919df7f9bb26f862ee482d Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 14 May 2018 14:01:25 -0700 Subject: [PATCH] Pull JobContext out of pipeline.py to avoid circular reference --- src/ocrmypdf/__main__.py | 4 +- src/ocrmypdf/_jobcontext.py | 84 +++++++++++++++++++++++++++++++++++++ src/ocrmypdf/_optimize.py | 20 +++++---- src/ocrmypdf/pipeline.py | 66 ----------------------------- 4 files changed, 98 insertions(+), 76 deletions(-) create mode 100644 src/ocrmypdf/_jobcontext.py diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index f0cd9f7a..d549f5dc 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -36,8 +36,8 @@ import ruffus.ruffus_exceptions as ruffus_exceptions import ruffus.cmdline as cmdline import ruffus.proxy_logger as proxy_logger -from .pipeline import JobContext, JobContextManager, \ - cleanup_working_files, build_pipeline +from ._jobcontext import JobContext, JobContextManager, cleanup_working_files +from .pipeline import build_pipeline from .pdfa import file_claims_pdfa from .helpers import is_iterable_notstr, re_symlink, is_file_writable, \ available_cpu_count diff --git a/src/ocrmypdf/_jobcontext.py b/src/ocrmypdf/_jobcontext.py new file mode 100644 index 00000000..7d21c213 --- /dev/null +++ b/src/ocrmypdf/_jobcontext.py @@ -0,0 +1,84 @@ +# © 2018 James R. Barlow: github.com/jbarlow83 +# +# This file is part of OCRmyPDF. +# +# OCRmyPDF is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# OCRmyPDF is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with OCRmyPDF. If not, see . + +from contextlib import suppress +from multiprocessing.managers import SyncManager +import sys +import shutil + +from .pdfinfo import PdfInfo + + +class JobContext: + """Holds our context for a particular run of the pipeline + + A multiprocessing manager effectively creates a separate process + that keeps the master job context object. Other threads access + job context via multiprocessing proxy objects. + + While this would naturally lend itself @property's it seems to make + a little more sense to use functions to make it explicitly that the + invocation requires marshalling data across a process boundary. + + """ + + def __init__(self): + self.pdfinfo = None + self.options = None + self.work_folder = None + self.rotations = {} + + def generate_pdfinfo(self, infile): + self.pdfinfo = PdfInfo(infile) + + def get_pdfinfo(self): + "What we know about the input PDF" + return self.pdfinfo + + def set_pdfinfo(self, pdfinfo): + self.pdfinfo = pdfinfo + + def get_options(self): + return self.options + + def set_options(self, options): + self.options = options + + def get_work_folder(self): + return self.work_folder + + def set_work_folder(self, work_folder): + self.work_folder = work_folder + + def get_rotation(self, pageno): + return self.rotations.get(pageno, 0) + + def set_rotation(self, pageno, value): + self.rotations[pageno] = value + + +class JobContextManager(SyncManager): + pass + + +def cleanup_working_files(work_folder, options): + if options.keep_temporary_files: + print("Temporary working files saved at:\n{0}".format(work_folder), + file=sys.stderr) + else: + with suppress(FileNotFoundError): + shutil.rmtree(work_folder) diff --git a/src/ocrmypdf/_optimize.py b/src/ocrmypdf/_optimize.py index c3b00d3b..c1ebdd90 100644 --- a/src/ocrmypdf/_optimize.py +++ b/src/ocrmypdf/_optimize.py @@ -18,14 +18,18 @@ from pathlib import Path from subprocess import CalledProcessError import concurrent.futures -from collections import defaultdict +from collections import defaultdict, namedtuple import struct +import logging +import sys + from io import BytesIO from PIL import Image from .lib import fitz import pikepdf +from ._jobcontext import JobContext from . import leptonica from .helpers import re_symlink from .exec import pngquant, jbig2enc @@ -446,18 +450,18 @@ def optimize( re_symlink(target_file, output_file, log) -if __name__ == '__main__': - import logging - import sys - from .pipeline import JobContext - from collections import namedtuple +def main(infile, outfile, level, jobs=1): Options = namedtuple('Options', 'jobs optimize') logging.basicConfig(level=logging.DEBUG) log = logging.getLogger() ctx = JobContext() - options = Options(jobs=4, optimize=3) + options = Options(jobs=jobs, optimize=int(level)) ctx.set_options(options) - optimize(sys.argv[1], sys.argv[2], log, ctx) \ No newline at end of file + optimize(infile, outfile, log, ctx) + + +if __name__ == '__main__': + main(sys.argv[1], sys.argv[2], sys.argv[3]) \ No newline at end of file diff --git a/src/ocrmypdf/pipeline.py b/src/ocrmypdf/pipeline.py index 7ed102d2..c7a38707 100644 --- a/src/ocrmypdf/pipeline.py +++ b/src/ocrmypdf/pipeline.py @@ -48,76 +48,10 @@ from ._optimize import optimize VECTOR_PAGE_DPI = 400 -# ------------- -# Pipeline state manager - -class JobContext: - """Holds our context for a particular run of the pipeline - - A multiprocessing manager effectively creates a separate process - that keeps the master job context object. Other threads access - job context via multiprocessing proxy objects. - - While this would naturally lend itself @property's it seems to make - a little more sense to use functions to make it explicitly that the - invocation requires marshalling data across a process boundary. - - """ - - def __init__(self): - self.pdfinfo = None - self.options = None - self.work_folder = None - self.rotations = {} - - def generate_pdfinfo(self, infile): - self.pdfinfo = PdfInfo(infile) - - def get_pdfinfo(self): - "What we know about the input PDF" - return self.pdfinfo - - def set_pdfinfo(self, pdfinfo): - self.pdfinfo = pdfinfo - - def get_options(self): - return self.options - - def set_options(self, options): - self.options = options - - def get_work_folder(self): - return self.work_folder - - def set_work_folder(self, work_folder): - self.work_folder = work_folder - - def get_rotation(self, pageno): - return self.rotations.get(pageno, 0) - - def set_rotation(self, pageno, value): - self.rotations[pageno] = value - - -from multiprocessing.managers import SyncManager -class JobContextManager(SyncManager): - pass - - -def cleanup_working_files(work_folder, options): - if options.keep_temporary_files: - print("Temporary working files saved at:\n{0}".format(work_folder), - file=sys.stderr) - else: - with suppress(FileNotFoundError): - shutil.rmtree(work_folder) - - # # The Pipeline # - def triage_image_file(input_file, output_file, log, options): try: log.info("Input file is not a PDF, checking if it is an image...")