From 08bf651ef25fd5ca1b84879ef57738e62a0e4af2 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Tue, 15 May 2018 15:32:15 -0700
Subject: [PATCH] Refactor JBIG2 path for non-CCITT monochrome images

---
Notes (not part of the commit message):

What the hunks below do:
  * Add img_name(root, xref, ext), which formats the xref as an 8-digit
    zero-padded file name under `root` with the given extension.
    png_name/jpg_name/tif_name now delegate to it.
  * extract_image: both the 1-bit branch and the /DCTDecode branch now
    write to an extension-less file named after the xref, call
    pim.extract(f), and rename the file to carry the extension that
    pim.extract returned. pikepdf.UnsupportedImageTypeError is turned
    into `return False`.
  * jbig2s now collects (xref, ext) tuples instead of bare xrefs, and
    convert_to_jbig2 builds its input file names with
    img_name(root, xref, ext) instead of png_name(root, xref).

Review notes:
  * In the /DCTDecode branch `ext` is only used for the rename; jpegs
    still receives the bare xref.
  * In the second loop of convert_to_jbig2, `ext` is unpacked but not
    used in the lines shown here.
  * When pim.extract raises, the extension-less file has already been
    created and is not removed in the lines shown -- TODO confirm whether
    cleanup of `root` happens elsewhere.
  * The line structure and indentation of this patch were restored by
    hand from a whitespace-collapsed copy. Hunk line counts match the
    @@ headers and the diffstat, but the exact indentation of context
    lines should be verified against the target file before applying.

 src/ocrmypdf/_optimize.py | 41 +++++++++++++++++++++++++--------------
 1 file changed, 26 insertions(+), 15 deletions(-)

diff --git a/src/ocrmypdf/_optimize.py b/src/ocrmypdf/_optimize.py
index 45571128..a3ddc42c 100644
--- a/src/ocrmypdf/_optimize.py
+++ b/src/ocrmypdf/_optimize.py
@@ -39,16 +39,20 @@ JPEG_QUALITY = 75
 PNG_QUALITY = (65, 75)
 
 
+def img_name(root, xref, ext):
+    return str(root / '{:08d}{}'.format(xref, ext))
+
+
 def png_name(root, xref):
-    return str(root / '{:08d}.png'.format(xref))
+    return img_name(root, xref, '.png')
 
 
 def jpg_name(root, xref):
-    return str(root / '{:08d}.jpg'.format(xref))
+    return img_name(root, xref, '.jpg')
 
 
 def tif_name(root, xref):
-    return str(root / '{:08d}.tif'.format(xref))
+    return img_name(root, xref, '.tif')
 
 
 def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
@@ -75,11 +79,14 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
     if pim.bits_per_component == 1 \
             and filtdp != '/JBIG2Decode' \
             and jbig2enc.available():
-        with Path(tif_name(root, xref)).open('wb') as f:
-            result = pim.write_stream(f)
-        if not result:
+        try:
+            imgname = Path(root / '{:08d}'.format(xref))
+            with imgname.open('wb') as f:
+                ext = pim.extract(f)
+            imgname.rename(imgname.with_suffix(ext))
+        except pikepdf.UnsupportedImageTypeError:
             return False
-        jbig2s.append(xref)
+        jbig2s.append((xref, ext))
     elif filtdp[0] == '/DCTDecode' \
             and options.optimize >= 2:
         # This is a simple heuristic derived from some training data, that has
@@ -98,10 +105,13 @@ def extract_image(*, doc, pike, root, log, image, xref, jbig2s,
         #     iccbytes = icc.read_bytes()
         #     with Image.open(stream) as im:
         #         im.save(jpg_name(root, xref), icc_profile=iccbytes)
-        with Path(jpg_name(root, xref)).open('wb') as f:
-            result = pim.write_stream(f)
-        if not result:
-            return False
+        try:
+            imgname = Path(root / '{:08d}'.format(xref))
+            with imgname.open('wb') as f:
+                ext = pim.extract(f)
+            imgname.rename(imgname.with_suffix(ext))
+        except pikepdf.UnsupportedImageTypeError:
+            return False
         jpegs.append(xref)
     elif pim.indexed \
             and pim.colorspace in pim.SIMPLE_COLORSPACES \
@@ -186,12 +196,12 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
     with concurrent.futures.ThreadPoolExecutor(
             max_workers=options.jobs) as executor:
         futures = []
-        for group, xrefs in jbig2_groups.items():
+        for group, xref_exts in jbig2_groups.items():
             prefix = 'group{:08d}'.format(group)
             future = executor.submit(
                 jbig2enc.convert_group,
                 cwd=str(root),
-                infiles=(png_name(root, xref) for xref in xrefs),
+                infiles=(img_name(root, xref, ext) for xref, ext in xref_exts),
                 out_prefix=prefix
             )
             futures.append(future)
@@ -199,12 +209,13 @@ def convert_to_jbig2(pike, jbig2_groups, root, log, options):
             proc = future.result()
             log.debug(proc.stderr.decode())
 
-    for group, xrefs in jbig2_groups.items():
+    for group, xref_exts in jbig2_groups.items():
         prefix = 'group{:08d}'.format(group)
         jbig2_globals_data = (root / (prefix + '.sym')).read_bytes()
         jbig2_globals = pikepdf.Stream(pike, jbig2_globals_data)
 
-        for n, xref in enumerate(xrefs):
+        for n, xref_ext in enumerate(xref_exts):
+            xref, ext = xref_ext
             jbig2_im_file = root / (prefix + '.{:04d}'.format(n))
             jbig2_im_data = jbig2_im_file.read_bytes()
             im_obj = pike._get_object_id(xref, 0)