From 56d2aae96387d2c3cf123578f68acd3cd2de4ab5 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 18 May 2017 18:39:14 -0700 Subject: [PATCH] Refactor from ImageInfo index to attribute accessing --- ocrmypdf/pageinfo.py | 4 ++++ ocrmypdf/pipeline.py | 10 +++++----- tests/test_main.py | 23 ++++++++++++----------- tests/test_pageinfo.py | 21 +++++++++++---------- tests/test_tess4.py | 2 +- 5 files changed, 33 insertions(+), 27 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index 61b9f6d4..53a21b61 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -619,6 +619,10 @@ class PageInfo(MutableMapping): def yres(self): return self._pageinfo['yres'] + @property + def images(self): + return self._pageinfo['images'] + def __getitem__(self, item): warnings.warn("pageinfo[item] is deprecated", DeprecationWarning) return self._pageinfo[item] diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 196cfe79..7accb1ab 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -397,13 +397,13 @@ def rasterize_with_ghostscript( device = 'png16m' # 24-bit if pageinfo.images: - if all(image['comp'] == 1 for image in pageinfo.images): - if all(image['bpc'] == 1 for image in pageinfo.images): + if all(image.comp == 1 for image in pageinfo.images): + if all(image.bpc == 1 for image in pageinfo.images): device = 'pngmono' - elif all(image['bpc'] > 1 and image['color'] == 'index' + elif all(image.bpc > 1 and image.color == 'index' for image in pageinfo.images): device = 'png256' - elif all(image['bpc'] > 1 and image['color'] == 'gray' + elif all(image.bpc > 1 and image.color == 'gray' for image in pageinfo.images): device = 'pnggray' @@ -430,7 +430,7 @@ def preprocess_remove_background( pageinfo = get_pageinfo(input_file, context) - if any(image['bpc'] > 1 for image in pageinfo.images): + if any(image.bpc > 1 for image in pageinfo.images): leptonica.remove_background(input_file, output_file) else: log.info("{0:4d}: background removal skipped on mono page".format( diff --git a/tests/test_main.py b/tests/test_main.py index c1cfea60..bdb00f2a 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -12,6 +12,7 @@ from ocrmypdf import leptonica from ocrmypdf.pdfa import file_claims_pdfa from ocrmypdf.exec import ghostscript import logging +from math import isclose check_ocrmypdf = pytest.helpers.check_ocrmypdf @@ -629,7 +630,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf): env=spoof_tesseract_cache) out_pageinfo = PdfInfo(out) - assert out_pageinfo[0].images[0]['enc'] == 'jbig2' + assert out_pageinfo[0].images[0].enc == 'jbig2' def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf): @@ -746,8 +747,8 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf): pdfinfo = PdfInfo(outpdf) image = pdfinfo[0].images[0] - assert image['dpi_w'] == image['dpi_h'] - assert image['dpi_w'] == 2400 + assert isclose(image.xres, image.yres) + assert isclose(image.xres, 2400) def test_overlay(spoof_tesseract_noop, resources, outpdf): @@ -905,16 +906,16 @@ def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec, pdfimage = pdfinfo[0].images[0] if input_file.endswith('.png'): - assert pdfimage['enc'] != 'jpeg', \ + assert pdfimage.enc != 'jpeg', \ "Lossless compression changed to lossy!" elif input_file.endswith('.jpg'): - assert pdfimage['enc'] == 'jpeg', \ + assert pdfimage.enc == 'jpeg', \ "Lossy compression changed to lossless!" if im.mode.startswith('RGB') or im.mode.startswith('BGR'): - assert pdfimage['color'] == 'rgb', \ + assert pdfimage.color == 'rgb', \ "Colorspace changed" elif im.mode.startswith('L'): - assert pdfimage['color'] == 'gray', \ + assert pdfimage.color == 'gray', \ "Colorspace changed" @@ -950,15 +951,15 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec, pdfimage = pdfinfo[0].images[0] if compression == 'jpeg': - assert pdfimage['enc'] == 'jpeg' + assert pdfimage.enc == 'jpeg' elif compression == 'lossless': - assert pdfimage['enc'] == 'image' + assert pdfimage.enc == 'image' if im.mode.startswith('RGB') or im.mode.startswith('BGR'): - assert pdfimage['color'] == 'rgb', \ + assert pdfimage.color == 'rgb', \ "Colorspace changed" elif im.mode.startswith('L'): - assert pdfimage['color'] == 'gray', \ + assert pdfimage.color == 'gray', \ "Colorspace changed" diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py index dbaafbf4..663380e5 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pageinfo.py @@ -5,6 +5,7 @@ from ocrmypdf import pageinfo from reportlab.pdfgen.canvas import Canvas from PIL import Image from tempfile import NamedTemporaryFile +from math import isclose from contextlib import suppress import os import shutil @@ -62,12 +63,12 @@ def test_single_page_image(outdir): assert len(page.images) == 1 pdfimage = page.images[0] - assert pdfimage['width'] == 8 - assert pdfimage['color'] == 'gray' + assert pdfimage.width == 8 + assert pdfimage.color == 'gray' # DPI in a 1"x1" is the image width - assert abs(pdfimage['dpi_w'] - 8) < 1e-5 - assert abs(pdfimage['dpi_h'] - 8) < 1e-5 + assert isclose(pdfimage.xres, 8) + assert isclose(pdfimage.yres, 8) def test_single_page_inline_image(outdir): @@ -86,9 +87,9 @@ def test_single_page_inline_image(outdir): pdfinfo = pageinfo.PdfInfo(filename) print(pdfinfo) pdfimage = pdfinfo[0].images[0] - assert (pdfimage['dpi_w'] - 8) < 1e-5 - assert pdfimage['color'] != '-' - assert pdfimage['width'] == 8 + assert isclose(pdfimage.xres, 8) + assert pdfimage.color != '-' + assert pdfimage.width == 8 def test_jpeg(resources, outdir): @@ -97,8 +98,8 @@ def test_jpeg(resources, outdir): pdfinfo = pageinfo.PdfInfo(filename) pdfimage = pdfinfo[0].images[0] - assert pdfimage['enc'] == 'jpeg' - assert (pdfimage['dpi_w'] - 150) < 1e-5 + assert pdfimage.enc == 'jpeg' + assert isclose(pdfimage.xres, 150) def test_form_xobject(resources): @@ -106,7 +107,7 @@ def test_form_xobject(resources): pdfinfo = pageinfo.PdfInfo(filename) pdfimage = pdfinfo[0].images[0] - assert pdfimage['width'] == 50 + assert pdfimage.width == 50 def test_no_contents(resources): diff --git a/tests/test_tess4.py b/tests/test_tess4.py index 47cad81f..b45d8f0f 100644 --- a/tests/test_tess4.py +++ b/tests/test_tess4.py @@ -117,4 +117,4 @@ def test_content_preservation(ensure_tess4, resources, outpdf): info = pageinfo.PdfInfo(outpdf) page = info[0] - assert len(page['images']) > 1, "masked were rasterized" \ No newline at end of file + assert len(page.images) > 1, "masked were rasterized" \ No newline at end of file