optimize: use pikepdf to save PIL images

Eliminates another usage of PyMuPDF in the main path.
2026-05-05 05:05:44 -04:00 · 2018-05-18 00:18:44 -07:00
parent f4571e2508
commit b52eb95cf8
1 changed file with 22 additions and 26 deletions
--- a/src/ocrmypdf/_optimize.py
+++ b/src/ocrmypdf/_optimize.py
@@ -55,7 +55,7 @@ def tif_name(root, xref):
    return img_name(root, xref, '.tif')


-def extract_image(*, doc, pike, root, log, image, xref, jbig2s, 
+def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
                  pngs, jpegs, options):
    if image.Subtype != '/Image':
        return False
@@ -85,7 +85,7 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
                ext = pim.extract_to(stream=f)
            imgname.rename(imgname.with_suffix(ext))
        except pikepdf.UnsupportedImageTypeError:
-            return False        
+            return False
        jbig2s.append((xref, ext))
    elif filtdp[0] == '/DCTDecode' \
            and options.optimize >= 2:
@@ -111,7 +111,7 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
                ext = pim.extract_to(stream=f)
            imgname.rename(imgname.with_suffix(ext))
        except pikepdf.UnsupportedImageTypeError:
-            return False        
+            return False
        jpegs.append(xref)
    elif pim.indexed \
            and pim.colorspace in pim.SIMPLE_COLORSPACES \
@@ -121,19 +121,15 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
        # fruit in most cases
        pix = fitz.Pixmap(doc, xref)
        pix.writePNG(png_name(root, xref), savealpha=False)
-        pngs.append(xref)        
-    elif pim.colorspace in pim.SIMPLE_COLORSPACES and fitz:
-        # For any 'inferior' filter including /FlateDecode we extract
-        # and recode as /FlateDecode
-        # raw_png = pike._get_object_id(xref, 0)
-        # raw_png_data = raw_png.read_raw_bytes()
-        # (root / '{:08d}.png'.format(xref)).write_bytes(raw_png_data)
-        pix = fitz.Pixmap(doc, xref)
-        pix.writePNG(png_name(root, xref), savealpha=False)
+        pngs.append(xref)
+    elif pim.colorspace in pim.SIMPLE_COLORSPACES:
+        # An optimization opportunity here, not currently taken, is directly
+        # generating a PNG from compressed data
+        pim.as_pil_image().save(png_name(root, xref))
        pngs.append(xref)
    else:
        return False
-    
+
    return True


@@ -156,8 +152,8 @@ def extract_images(doc, pike, root, log, options):
                continue  # Don't improve same image twice
            try:
                result = extract_image(
-                    doc=doc, pike=pike, root=root, log=log, image=image, 
-                    xref=xref, jbig2s=jbig2_groups[group], pngs=pngs, 
+                    doc=doc, pike=pike, root=root, log=log, image=image,
+                    xref=xref, jbig2s=jbig2_groups[group], pngs=pngs,
                    jpegs=jpegs, options=options
                )
                if result:
@@ -171,7 +167,7 @@ def extract_images(doc, pike, root, log, options):
    jbig2_groups = {group: xrefs for group, xrefs in jbig2_groups.items()
                    if len(xrefs) > 0}
    log.debug(
-        "Optimizable images: " 
+        "Optimizable images: "
        "JBIG2 groups: {} JPEGs: {} PNGs: {} Errors: {}".format(
            len(jbig2_groups), len(jpegs), len(pngs), errors
    ))
@@ -186,7 +182,7 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
    We use a group because JBIG2 works best with a symbol dictionary that spans
    multiple pages. When inserted back into the PDF, each JBIG2 must reference
    the symbol dictionary it is associated with. So convert a group at a time,
-    and replace their streams with a parameter set that points to the 
+    and replace their streams with a parameter set that points to the
    appropriate dictionary.

    If too many pages shared the same dictionary JBIG2 encoding becomes more
@@ -199,7 +195,7 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
        for group, xref_exts in jbig2_groups.items():
            prefix = 'group{:08d}'.format(group)
            future = executor.submit(
-                jbig2enc.convert_group, 
+                jbig2enc.convert_group,
                cwd=str(root),
                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                out_prefix=prefix
@@ -220,7 +216,7 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
            jbig2_im_data = jbig2_im_file.read_bytes()
            im_obj = pike._get_object_id(xref, 0)
            im_obj.write(
-                jbig2_im_data, pikepdf.Name('/JBIG2Decode'), 
+                jbig2_im_data, pikepdf.Name('/JBIG2Decode'),
                pikepdf.Dictionary({
                    '/JBIG2Globals': jbig2_globals
                })
@@ -237,7 +233,7 @@ def transcode_jpegs(pike, jpegs, root, log, options):
        # 'close'.  Seems to be mostly harmless
        # https://github.com/python-pillow/Pillow/issues/1144
        with Image.open(str(in_jpg)) as im:
-            im.save(str(opt_jpg), 
+            im.save(str(opt_jpg),
                    optimize=True,
                    quality=JPEG_QUALITY)
        if opt_jpg.stat().st_size > in_jpg.stat().st_size:
@@ -258,8 +254,8 @@ def transcode_pngs(pike, pngs, root, log, options):
                max_workers=options.jobs) as executor:
            for xref in pngs:
                executor.submit(
-                    pngquant.quantize, 
-                    png_name(root, xref), png_name(root, xref), 
+                    pngquant.quantize,
+                    png_name(root, xref), png_name(root, xref),
                    PNG_QUALITY[0], PNG_QUALITY[1])

    for xref in pngs:
@@ -274,7 +270,7 @@ def transcode_pngs(pike, pngs, root, log, options):
        except leptonica.LeptonicaError as e:
            log.error(e)
            continue
-        
+
        # This is what we should be doing: open the compressed data without
        # transcoding. However this shifts each pixel row by one for some
        # reason.
@@ -351,13 +347,13 @@ def optimize(
    savings = 1 - output_size / input_size
    log.info("Optimize ratio: {:.2f} savings: {:.1f}%".format(
        ratio, 100 * savings))
-    
+
    if savings < 0:
        log.info("Optimize did not improve the file - discarded")
        re_symlink(input_file, output_file, log)
    else:
        re_symlink(target_file, output_file, log)
-        
+

 def main(infile, outfile, level, jobs=1):
    Options = namedtuple('Options', 'jobs optimize')
@@ -373,4 +369,4 @@ def main(infile, outfile, level, jobs=1):


 if __name__ == '__main__':
-    main(sys.argv[1], sys.argv[2], sys.argv[3])
+    main(sys.argv[1], sys.argv[2], sys.argv[3])