optimize plugin: solve linearization and "is optimization enabled?" issues

2026-05-05 13:16:55 -04:00 · 2022-06-13 00:59:41 -07:00
parent 61069660a2
commit 13d11e76e5
5 changed files with 53 additions and 13 deletions
--- a/src/ocrmypdf/_pipeline.py
+++ b/src/ocrmypdf/_pipeline.py
@@ -817,13 +817,14 @@ def metadata_fixup(working_file: Path, context: PdfContext):
                missing = set(meta_original.keys()) - set(meta.keys())
                report_on_metadata(missing)

+        optimizing = context.plugin_manager.hook.is_optimization_enabled(
+            context=context
+        )
        pdf.save(
            output_file,
            **get_pdf_save_settings(options.output_type),
            linearize=(  # Don't linearize if optimize() will be linearizing too
-                should_linearize(working_file, context)
-                if hasattr(options, 'optimize') and options.optimize == 0
-                else False
+                not optimizing and should_linearize(working_file, context)
            ),
        )

@@ -833,7 +834,11 @@ def metadata_fixup(working_file: Path, context: PdfContext):
 def optimize_pdf(input_file: Path, context: PdfContext, executor: Executor):
    output_file = context.get_path('optimize.pdf')
    output_pdf = context.plugin_manager.hook.optimize_pdf(
-        input_pdf=input_file, output_pdf=output_file, context=context, executor=executor
+        input_pdf=input_file,
+        output_pdf=output_file,
+        context=context,
+        executor=executor,
+        linearize=should_linearize(input_file, context),
    )

    input_size = input_file.stat().st_size
--- a/src/ocrmypdf/_plugin_manager.py
+++ b/src/ocrmypdf/_plugin_manager.py
@@ -120,3 +120,6 @@ def get_parser_options_plugins(

    options = parser.parse_args(args=args)
    return parser, options, plugin_manager
+
+
+__all__ = ['OcrmypdfPluginManager', 'get_plugin_manager', 'get_parser_options_plugins']
--- a/src/ocrmypdf/builtin_plugins/optimize.py
+++ b/src/ocrmypdf/builtin_plugins/optimize.py
@@ -14,7 +14,7 @@ from pathlib import Path
 from ocrmypdf import PdfContext, hookimpl
 from ocrmypdf._concurrent import Executor
 from ocrmypdf._exec import jbig2enc, pngquant
-from ocrmypdf._pipeline import get_pdf_save_settings, should_linearize
+from ocrmypdf._pipeline import get_pdf_save_settings
 from ocrmypdf.cli import numeric
 from ocrmypdf.optimize import optimize
 from ocrmypdf.subprocess import check_external_program
@@ -125,11 +125,20 @@ def check_options(options):

@hookimpl
 def optimize_pdf(
-    input_pdf: Path, output_pdf: Path, context: PdfContext, executor: Executor
+    input_pdf: Path,
+    output_pdf: Path,
+    context: PdfContext,
+    executor: Executor,
+    linearize: bool,
 ) -> Path:
    save_settings = dict(
-        linearize=should_linearize(input_pdf, context),
+        linearize=linearize,
        **get_pdf_save_settings(context.options.output_type),
    )
    optimize(input_pdf, output_pdf, context, save_settings, executor)
    return output_pdf
+
+
+@hookimpl
+def is_optimization_enabled(context: PdfContext) -> bool:
+    return context.options.optimize != 0
--- a/src/ocrmypdf/pluginspec.py
+++ b/src/ocrmypdf/pluginspec.py
@@ -462,12 +462,16 @@ def generate_pdfa(

@hookspec(firstresult=True)
 def optimize_pdf(
-    input_pdf: Path, output_pdf: Path, context: PdfContext, executor: Executor
+    input_pdf: Path,
+    output_pdf: Path,
+    context: PdfContext,
+    executor: Executor,
+    linearize: bool,
 ) -> Path:
    """Optimize a PDF after image, OCR and metadata processing.

-    If the input_pdf is a PDF/A, the plugin must only modify input_pdf in a way
-    that preserves the PDF/A status.
+    If the input_pdf is a PDF/A, the plugin should modify input_pdf in a way
+    that preserves the PDF/A status, or report to the user when this is not possible.

    If the implementation fails to produce a smaller file than the input file, it
    should return input_pdf instead.
@@ -479,6 +483,8 @@ def optimize_pdf(
        context: The current context.
        executor: An initialized executor which may be used during optimization,
            to distribute optimization tasks.
+        linearize: If True, OCRmyPDF requires ``optimize_pdf`` to return a linearized
+            (also known as "fast web view") PDF.

    Returns:
        Path: If optimization is successful, the hook should return ``output_file``.
@@ -488,3 +494,20 @@ def optimize_pdf(
    Note:
        This is a :ref:`firstresult hook<firstresult>`.
    """
+
+
+@hookspec(firstresult=True)
+def is_optimization_enabled(context: PdfContext) -> bool:
+    """For a given PdfContext, OCRmyPDF asks the plugin if optimization is enabled.
+
+    It is assumed that an optimization plugin might be installed but could be
+    disabled by user settings.
+
+    If this returns False, OCRmyPDF finalizes the PDF itself (e.g. linearizing it).
+
+    Returns:
+        True if the plugin's optimization is enabled.
+
+    Note:
+        This is a :ref:`firstresult hook<firstresult>`.
+    """
--- a/tests/test_metadata.py
+++ b/tests/test_metadata.py
@@ -16,7 +16,7 @@ from pikepdf.models.metadata import decode_pdf_date

 from ocrmypdf._jobcontext import PdfContext
 from ocrmypdf._pipeline import convert_to_pdfa, metadata_fixup
-from ocrmypdf._plugin_manager import get_plugin_manager
+from ocrmypdf._plugin_manager import get_parser_options_plugins, get_plugin_manager
 from ocrmypdf.cli import get_parser
 from ocrmypdf.exceptions import ExitCode
 from ocrmypdf.pdfa import file_claims_pdfa, generate_pdfa_ps
@@ -299,8 +299,8 @@ def test_kodak_toc(resources, outpdf):


 def test_metadata_fixup_warning(resources, outdir, caplog):
-    options = get_parser().parse_args(
-        args=['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
+    _parser, options, _pm = get_parser_options_plugins(
+        ['--output-type', 'pdfa-2', 'graph.pdf', 'out.pdf']
    )

    copyfile(resources / 'graph.pdf', outdir / 'graph.pdf')