Pull JobContext out of pipeline.py to avoid a circular import

This commit is contained in:
James R. Barlow
2018-05-14 14:01:25 -07:00
parent 87a7d4d1a8
commit f0a56592e2
4 changed files with 98 additions and 76 deletions

View File

@@ -36,8 +36,8 @@ import ruffus.ruffus_exceptions as ruffus_exceptions
import ruffus.cmdline as cmdline
import ruffus.proxy_logger as proxy_logger
from .pipeline import JobContext, JobContextManager, \
cleanup_working_files, build_pipeline
from ._jobcontext import JobContext, JobContextManager, cleanup_working_files
from .pipeline import build_pipeline
from .pdfa import file_claims_pdfa
from .helpers import is_iterable_notstr, re_symlink, is_file_writable, \
available_cpu_count

View File

@@ -0,0 +1,84 @@
# © 2018 James R. Barlow: github.com/jbarlow83
#
# This file is part of OCRmyPDF.
#
# OCRmyPDF is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# OCRmyPDF is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with OCRmyPDF. If not, see <http://www.gnu.org/licenses/>.
from contextlib import suppress
from multiprocessing.managers import SyncManager
import sys
import shutil
from .pdfinfo import PdfInfo
class JobContext:
    """Shared state for one run of the pipeline, hosted in a manager process.

    A multiprocessing manager spawns a separate process that owns the
    master copy of this object; worker processes reach it only through
    proxy objects.  Plain getter/setter methods are used instead of
    @property to keep it obvious at every call site that each access
    marshals data across a process boundary.
    """

    def __init__(self):
        self.pdfinfo = None
        self.options = None
        self.work_folder = None
        self.rotations = {}

    def generate_pdfinfo(self, infile):
        """Parse *infile* and record the resulting PdfInfo."""
        self.pdfinfo = PdfInfo(infile)

    def get_pdfinfo(self):
        """Return what we know about the input PDF."""
        return self.pdfinfo

    def set_pdfinfo(self, pdfinfo):
        self.pdfinfo = pdfinfo

    def get_options(self):
        """Return the options object for this run."""
        return self.options

    def set_options(self, options):
        self.options = options

    def get_work_folder(self):
        """Return the path of the temporary working folder."""
        return self.work_folder

    def set_work_folder(self, work_folder):
        self.work_folder = work_folder

    def get_rotation(self, pageno):
        """Return the rotation recorded for *pageno*; 0 if none was set."""
        if pageno in self.rotations:
            return self.rotations[pageno]
        return 0

    def set_rotation(self, pageno, value):
        self.rotations[pageno] = value
class JobContextManager(SyncManager):
    """Multiprocessing manager used to host the shared JobContext."""
def cleanup_working_files(work_folder, options):
    """Delete the temporary working folder unless the user wants it kept.

    When options.keep_temporary_files is set, report the folder's
    location on stderr instead of removing it.  A folder that is
    already gone is not an error.
    """
    if not options.keep_temporary_files:
        with suppress(FileNotFoundError):
            shutil.rmtree(work_folder)
        return
    print(f"Temporary working files saved at:\n{work_folder}", file=sys.stderr)

View File

@@ -18,14 +18,18 @@
from pathlib import Path
from subprocess import CalledProcessError
import concurrent.futures
from collections import defaultdict
from collections import defaultdict, namedtuple
import struct
import logging
import sys
from io import BytesIO
from PIL import Image
from .lib import fitz
import pikepdf
from ._jobcontext import JobContext
from . import leptonica
from .helpers import re_symlink
from .exec import pngquant, jbig2enc
@@ -446,18 +450,18 @@ def optimize(
re_symlink(target_file, output_file, log)
if __name__ == '__main__':
import logging
import sys
from .pipeline import JobContext
from collections import namedtuple
def main(infile, outfile, level, jobs=1):
Options = namedtuple('Options', 'jobs optimize')
logging.basicConfig(level=logging.DEBUG)
log = logging.getLogger()
ctx = JobContext()
options = Options(jobs=4, optimize=3)
options = Options(jobs=jobs, optimize=int(level))
ctx.set_options(options)
optimize(sys.argv[1], sys.argv[2], log, ctx)
optimize(infile, outfile, log, ctx)
if __name__ == '__main__':
main(sys.argv[1], sys.argv[2], sys.argv[3])

View File

@@ -48,76 +48,10 @@ from ._optimize import optimize
VECTOR_PAGE_DPI = 400
# -------------
# Pipeline state manager
class JobContext:
"""Holds our context for a particular run of the pipeline
A multiprocessing manager effectively creates a separate process
that keeps the master job context object. Other threads access
job context via multiprocessing proxy objects.
While this would naturally lend itself @property's it seems to make
a little more sense to use functions to make it explicitly that the
invocation requires marshalling data across a process boundary.
"""
def __init__(self):
self.pdfinfo = None
self.options = None
self.work_folder = None
self.rotations = {}
def generate_pdfinfo(self, infile):
self.pdfinfo = PdfInfo(infile)
def get_pdfinfo(self):
"What we know about the input PDF"
return self.pdfinfo
def set_pdfinfo(self, pdfinfo):
self.pdfinfo = pdfinfo
def get_options(self):
return self.options
def set_options(self, options):
self.options = options
def get_work_folder(self):
return self.work_folder
def set_work_folder(self, work_folder):
self.work_folder = work_folder
def get_rotation(self, pageno):
return self.rotations.get(pageno, 0)
def set_rotation(self, pageno, value):
self.rotations[pageno] = value
from multiprocessing.managers import SyncManager
class JobContextManager(SyncManager):
pass
def cleanup_working_files(work_folder, options):
if options.keep_temporary_files:
print("Temporary working files saved at:\n{0}".format(work_folder),
file=sys.stderr)
else:
with suppress(FileNotFoundError):
shutil.rmtree(work_folder)
#
# The Pipeline
#
def triage_image_file(input_file, output_file, log, options):
try:
log.info("Input file is not a PDF, checking if it is an image...")