From 58282ea0fb15249b88e793d0dae68aba54149186 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 4 Oct 2018 13:44:51 -0700 Subject: [PATCH] optimize: more refactoring Now properly generalized/specialized where it should be --- src/ocrmypdf/optimize.py | 142 +++++++++++++++++---------------------- 1 file changed, 60 insertions(+), 82 deletions(-) diff --git a/src/ocrmypdf/optimize.py b/src/ocrmypdf/optimize.py index abe18b75..973864a8 100644 --- a/src/ocrmypdf/optimize.py +++ b/src/ocrmypdf/optimize.py @@ -50,17 +50,17 @@ def tif_name(root, xref): return img_name(root, xref, '.tif') -def extract_image_jbig2(*, pike, root, log, image, xref, options): +def extract_image_filter(pike, root, log, image, xref): if image.Subtype != '/Image': return None if image.Length < 100: - log.debug("Skipping small image, xref {}".format(xref)) + log.debug("Skipping small image, xref %s", xref) return None pim = pikepdf.PdfImage(image) if len(pim.filter_decodeparms) > 1: - log.debug("Skipping multiply filtered, xref {}".format(xref)) + log.debug("Skipping multiply filtered, xref %s", xref) return None filtdp = pim.filter_decodeparms[0] @@ -70,6 +70,15 @@ def extract_image_jbig2(*, pike, root, log, image, xref, options): if filtdp[0] == '/JPXDecode': return None # Don't do JPEG2000 + return pim, filtdp + + +def extract_image_jbig2(*, pike, root, log, image, xref, options): + result = extract_image_filter(pike, root, log, image, xref) + if result is None: + return None + pim, filtdp = result + if pim.bits_per_component == 1 \ and filtdp != '/JBIG2Decode' \ and jbig2enc.available(): @@ -84,27 +93,12 @@ def extract_image_jbig2(*, pike, root, log, image, xref, options): return None -def extract_image(*, pike, root, log, image, xref, options): - if image.Subtype != '/Image': - return None - if image.Length < 100: - log.debug("Skipping small image, xref {}".format(xref)) +def extract_image_generic(*, pike, root, log, image, xref, options): + result = extract_image_filter(pike, root, log, image, xref) + if result is None: return None + pim, filtdp = result - pim = pikepdf.PdfImage(image) - - if len(pim.filter_decodeparms) > 1: - log.debug("Skipping multiply filtered, xref {}".format(xref)) - return None - filtdp = pim.filter_decodeparms[0] - - if pim.bits_per_component > 8: - return None # Don't mess with wide gamut images - - if filtdp[0] == '/JPXDecode': - return None # Don't do JPEG2000 - - outname = '' if filtdp[0] == '/DCTDecode' \ and options.optimize >= 2: # This is a simple heuristic derived from some training data, that has @@ -149,56 +143,13 @@ def extract_image(*, pike, root, log, image, xref, options): return True -def extract_images_jbig2(pike, root, log, options): - """Extract any image that we think we can improve""" +def extract_images(pike, root, log, options, extract_fn): + """Extract image using extract_fn + + extract_fn decides where the image is interesting in this case + """ changed_xrefs = set() - jbig2_groups = defaultdict(list) - errors = 0 - for pageno, page in enumerate(pike.pages): - group, _ = divmod(pageno, options.jbig2_page_group_size) - try: - xobjs = page.Resources.XObject - except AttributeError: - continue - for imname, image in dict(xobjs).items(): - if image.objgen[1] != 0: - continue # Ignore images in an incremental PDF - xref = image.objgen[0] - if xref in changed_xrefs: - continue # Don't improve same image twice - try: - result = extract_image_jbig2( - pike=pike, root=root, log=log, image=image, - xref=xref, options=options - ) - except Exception as e: - log.debug("Image {} xref {}".format(imname, xref)) - log.debug(repr(e)) - errors += 1 - else: - if result: - jbig2_groups[group].append(result) - changed_xrefs.add(xref) - - # Elide empty groups - jbig2_groups = {group: xrefs for group, xrefs in jbig2_groups.items() - if len(xrefs) > 0} - log.debug( - "Optimizable images: " - "JBIG2 groups: {} Errors: {}".format( - len(jbig2_groups), errors - )) - - return jbig2_groups - - -def extract_images(pike, root, log, options): - """Extract any image that we think we can improve""" - - changed_xrefs = set() - jpegs = [] - pngs = [] errors = 0 for pageno, page in enumerate(pike.pages): try: @@ -212,32 +163,59 @@ def extract_images(pike, root, log, options): if xref in changed_xrefs: continue # Don't improve same image twice try: - result = extract_image( + result = extract_fn( pike=pike, root=root, log=log, image=image, xref=xref, options=options ) except Exception as e: - log.debug("Image {} xref {}".format(imname, xref)) + log.debug("Image %s xref %s", imname, xref) log.debug(repr(e)) errors += 1 else: if result: changed_xrefs.add(xref) _, ext = result - if ext == '.png': - pngs.append(xref) - elif ext == '.jpg': - jpegs.append(xref) + yield pageno, xref, ext + +def extract_images_generic(pike, root, log, options): + """Extract any >=2bpp image we think we can improve""" + + jpegs = [] + pngs = [] + for _, xref, ext in extract_images( + pike, root, log, options, extract_image_generic): + log.debug('xref = %s ext = %s', xref, ext) + if ext == '.png': + pngs.append(xref) + elif ext == '.jpg': + jpegs.append(xref) log.debug( "Optimizable images: " - "JPEGs: {} PNGs: {} Errors: {}".format( - len(jpegs), len(pngs), errors - )) - + "JPEGs: %s PNGs: %s", len(jpegs), len(pngs) + ) return jpegs, pngs +def extract_images_jbig2(pike, root, log, options): + """Extract any bitonal image that we think we can improve as JBIG2""" + + jbig2_groups = defaultdict(list) + for pageno, xref, ext in extract_images( + pike, root, log, options, extract_image_jbig2): + group = pageno // options.jbig2_page_group_size + jbig2_groups[group].append((xref, ext)) + + # Elide empty groups + jbig2_groups = {group: xrefs for group, xrefs in jbig2_groups.items() + if len(xrefs) > 0} + log.debug( + "Optimizable images: " + "JBIG2 groups: %s", (len(jbig2_groups),) + ) + return jbig2_groups + + def _produce_jbig2_images(jbig2_groups, root, log, options): """Produce JBIG2 images from their groups""" @@ -335,7 +313,7 @@ def transcode_jpegs(pike, jpegs, root, log, options): quality=options.jpeg_quality) # pylint: disable=no-member if opt_jpg.stat().st_size > in_jpg.stat().st_size: - log.debug("xref {}, jpeg, made larger - skip".format(xref)) + log.debug("xref %s, jpeg, made larger - skip", xref) continue compdata = leptonica.CompressedData.open(opt_jpg) @@ -436,7 +414,7 @@ def optimize( root = Path(output_file).parent / 'images' root.mkdir(exist_ok=True) # pylint: disable=no-member - jpegs, pngs = extract_images(pike, root, log, options) + jpegs, pngs = extract_images_generic(pike, root, log, options) transcode_jpegs(pike, jpegs, root, log, options) transcode_pngs(pike, pngs, root, log, options)