Access PageInfo fields via attributes instead of dictionary-style lookups

2026-05-07 14:14:41 -04:00 · 2017-05-18 17:12:04 -07:00
parent 6c12e7e944
commit caee5b1428
4 changed files with 60 additions and 53 deletions
--- a/ocrmypdf/pageinfo.py
+++ b/ocrmypdf/pageinfo.py
@@ -551,6 +551,14 @@ class PageInfo(MutableMapping):
    def height_inches(self):
        return self._pageinfo['height_inches']

+    @property
+    def width_pixels(self):
+        return int(round(self.width_inches * self.xres))
+
+    @property
+    def height_pixels(self):
+        return int(round(self.height_inches * self.yres))
+
    @property
    def rotation(self):
        return self._pageinfo['rotate']
@@ -594,7 +602,6 @@ class PdfInfo:
        return self._pages[index]

    def __getitem__(self, item):
-        warnings.warn("pageinfo[item] is deprecated", DeprecationWarning)
        return self._pages[item]

    def __len__(self):
--- a/ocrmypdf/pipeline.py
+++ b/ocrmypdf/pipeline.py
@@ -210,9 +210,9 @@ def get_page_square_dpi(pageinfo, options):


 def is_ocr_required(pageinfo, log, options):
-    page = pageinfo['pageno'] + 1
+    page = pageinfo.pageno + 1
    ocr_required = True
-    if not pageinfo['images']:
+    if not pageinfo.images:
        if options.force_ocr and options.oversample:
            # The user really wants to reprocess this file
            log.info(
@@ -234,7 +234,7 @@ def is_ocr_required(pageinfo, log, options):
                "skipping all processing on this page".format(page))
            ocr_required = False

-    elif pageinfo['has_text']:
+    elif pageinfo.has_text:
        msg = "{0:4d}: page already has text! – {1}"

        if not options.force_ocr and not options.skip_text:
@@ -250,8 +250,8 @@ def is_ocr_required(pageinfo, log, options):
                                "skipping all processing on this page"))
            ocr_required = False

-    if ocr_required and options.skip_big and pageinfo['images']:
-        pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels']
+    if ocr_required and options.skip_big and pageinfo.images:
+        pixel_count = pageinfo.width_pixels * pageinfo.height_pixels
        if pixel_count > (options.skip_big * 1000000):
            ocr_required = False
            log.warning(
@@ -396,15 +396,15 @@ def rasterize_with_ghostscript(
    pageinfo = get_pageinfo(input_file, context)

    device = 'png16m'  # 24-bit
-    if pageinfo['images']:
-        if all(image['comp'] == 1 for image in pageinfo['images']):
-            if all(image['bpc'] == 1 for image in pageinfo['images']):
+    if pageinfo.images:
+        if all(image['comp'] == 1 for image in pageinfo.images):
+            if all(image['bpc'] == 1 for image in pageinfo.images):
                device = 'pngmono'
            elif all(image['bpc'] > 1 and image['color'] == 'index'
-                     for image in pageinfo['images']):
+                     for image in pageinfo.images):
                device = 'png256'
            elif all(image['bpc'] > 1 and image['color'] == 'gray'
-                     for image in pageinfo['images']):
+                     for image in pageinfo.images):
                device = 'pnggray'

    log.debug("Rasterize {0} with {1}".format(
@@ -430,11 +430,11 @@ def preprocess_remove_background(

    pageinfo = get_pageinfo(input_file, context)

-    if any(image['bpc'] > 1 for image in pageinfo['images']):
+    if any(image['bpc'] > 1 for image in pageinfo.images):
        leptonica.remove_background(input_file, output_file)
    else:
        log.info("{0:4d}: background removal skipped on mono page".format(
-            pageinfo['pageno']))
+            pageinfo.pageno))
        re_symlink(input_file, output_file, log)


@@ -521,8 +521,8 @@ def select_visible_page_image(
    image = next(ii for ii in infiles if ii.endswith(image_suffix))

    pageinfo = get_pageinfo(image, context)
-    if pageinfo['images'] and \
-            all(im['enc'] == 'jpeg' for im in pageinfo['images']):
+    if pageinfo.images and \
+            all(im['enc'] == 'jpeg' for im in pageinfo.images):
        log.debug('{:4d}: JPEG input -> JPEG output'.format(
            page_number(image)))
        # If all images were JPEGs originally, produce a JPEG as output
--- a/tests/test_main.py
+++ b/tests/test_main.py
@@ -232,8 +232,8 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf):

    pdfinfo = PdfInfo(oversampled_pdf)

-    print(pdfinfo[0]['xres'])
-    assert abs(pdfinfo[0]['xres'] - 350) < 1
+    print(pdfinfo[0].xres)
+    assert abs(pdfinfo[0].xres - 350) < 1


 def test_repeat_ocr(resources, no_outpdf):
@@ -245,7 +245,7 @@ def test_force_ocr(spoof_tesseract_cache, resources, outpdf):
    out = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-f',
                         env=spoof_tesseract_cache)
    pdfinfo = PdfInfo(out)
-    assert pdfinfo[0]['has_text']
+    assert pdfinfo[0].has_text


 def test_skip_ocr(spoof_tesseract_cache, resources, outpdf):
@@ -359,14 +359,14 @@ def test_ocr_timeout(renderer, resources, outpdf):
    out = check_ocrmypdf(resources / 'skew.pdf', outpdf,
                         '--tesseract-timeout', '1.0')
    pdfinfo = PdfInfo(out)
-    assert not pdfinfo[0]['has_text']
+    assert not pdfinfo[0].has_text


 def test_skip_big(spoof_tesseract_cache, resources, outpdf):
    out = check_ocrmypdf(resources / 'enormous.pdf', outpdf,
                         '--skip-big', '10', env=spoof_tesseract_cache)
    pdfinfo = PdfInfo(out)
-    assert not pdfinfo[0]['has_text']
+    assert not pdfinfo[0].has_text


@pytest.mark.parametrize('renderer', ['hocr', 'tesseract'])
@@ -563,7 +563,7 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
                               resources, outpdf):
    # Confirm input image is non-square resolution
    in_pageinfo = PdfInfo(resources / 'aspect.pdf')
-    assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
+    assert in_pageinfo[0].xres != in_pageinfo[0].yres

    check_ocrmypdf(
        resources / 'aspect.pdf', outpdf,
@@ -572,8 +572,8 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache,
    out_pageinfo = PdfInfo(outpdf)

    # Confirm resolution was kept the same
-    assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres']
-    assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres']
+    assert in_pageinfo[0].xres == out_pageinfo[0].xres
+    assert in_pageinfo[0].yres == out_pageinfo[0].yres


@pytest.mark.parametrize('renderer', [
@@ -586,7 +586,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,

    # Confirm input image is non-square resolution
    in_pageinfo = PdfInfo(resources / 'aspect.pdf')
-    assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres']
+    assert in_pageinfo[0].xres != in_pageinfo[0].yres

    # --force-ocr requires means forced conversion to square resolution
    check_ocrmypdf(
@@ -599,20 +599,20 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache,
    in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0]

    # Resolution show now be equal
-    assert out_p0['xres'] == out_p0['yres']
+    assert out_p0.xres == out_p0.yres

    # Page size should match input page size
-    assert isclose(in_p0['width_inches'],
-                   out_p0['width_inches'])
-    assert isclose(in_p0['height_inches'],
-                   out_p0['height_inches'])
+    assert isclose(in_p0.width_inches,
+                   out_p0.width_inches)
+    assert isclose(in_p0.height_inches,
+                   out_p0.height_inches)

    # Because we rasterized the page to produce a new image, it should occupy
    # the entire page
-    out_im_w = out_p0['images'][0]['width'] / out_p0['images'][0]['dpi_w']
-    out_im_h = out_p0['images'][0]['height'] / out_p0['images'][0]['dpi_h']
-    assert isclose(out_p0['width_inches'], out_im_w)
-    assert isclose(out_p0['height_inches'], out_im_h)
+    out_im_w = out_p0.images[0]['width'] / out_p0.images[0]['dpi_w']
+    out_im_h = out_p0.images[0]['height'] / out_p0.images[0]['dpi_h']
+    assert isclose(out_p0.width_inches, out_im_w)
+    assert isclose(out_p0.height_inches, out_im_h)


 def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf):
@@ -629,7 +629,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf):
        env=spoof_tesseract_cache)

    out_pageinfo = PdfInfo(out)
-    assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2'
+    assert out_pageinfo[0].images[0]['enc'] == 'jbig2'


 def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf):
@@ -704,9 +704,9 @@ def test_rotated_skew_timeout(resources, outpdf):
    input_file = str(resources / 'rotated_skew.pdf')
    in_pageinfo = PdfInfo(input_file)[0]

-    assert in_pageinfo['height_pixels'] < in_pageinfo['width_pixels'], \
+    assert in_pageinfo.height_pixels < in_pageinfo.width_pixels, \
        "Expected the input page to be landscape"
-    assert in_pageinfo['rotate'] == 90, "Expected a rotated page"
+    assert in_pageinfo.rotation == 90, "Expected a rotated page"

    out = check_ocrmypdf(
        input_file, outpdf,
@@ -715,14 +715,14 @@ def test_rotated_skew_timeout(resources, outpdf):

    out_pageinfo = PdfInfo(out)[0]

-    assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \
+    assert out_pageinfo.height_pixels > out_pageinfo.width_pixels, \
        "Expected the output page to be portrait"

-    assert out_pageinfo['rotate'] == 0, \
+    assert out_pageinfo.rotation == 0, \
        "Expected no page rotation for output"

-    assert in_pageinfo['width_pixels'] == out_pageinfo['height_pixels'] and \
-        in_pageinfo['height_pixels'] == out_pageinfo['width_pixels'], \
+    assert in_pageinfo.width_pixels == out_pageinfo.height_pixels and \
+        in_pageinfo.height_pixels == out_pageinfo.width_pixels, \
        "Expected page rotation to be baked in"


@@ -745,7 +745,7 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf):
                   env=spoof_tesseract_cache)
    pdfinfo = PdfInfo(outpdf)

-    image = pdfinfo[0]['images'][0]
+    image = pdfinfo[0].images[0]
    assert image['dpi_w'] == image['dpi_h']
    assert image['dpi_w'] == 2400

@@ -902,7 +902,7 @@ def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec,

    pdfinfo = PdfInfo(output_file)

-    pdfimage = pdfinfo[0]['images'][0]
+    pdfimage = pdfinfo[0].images[0]

    if input_file.endswith('.png'):
        assert pdfimage['enc'] != 'jpeg', \
@@ -947,7 +947,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec,

    pdfinfo = PdfInfo(output_file)

-    pdfimage = pdfinfo[0]['images'][0]
+    pdfimage = pdfinfo[0].images[0]

    if compression == 'jpeg':
        assert pdfimage['enc'] == 'jpeg'
--- a/tests/test_pageinfo.py
+++ b/tests/test_pageinfo.py
@@ -31,8 +31,8 @@ def test_single_page_text(outdir):
    assert len(pdfinfo) == 1
    page = pdfinfo[0]

-    assert page['has_text']
-    assert len(page['images']) == 0
+    assert page.has_text
+    assert len(page.images) == 0


 def test_single_page_image(outdir):
@@ -58,10 +58,10 @@ def test_single_page_image(outdir):
    assert len(pdfinfo) == 1
    page = pdfinfo[0]

-    assert not page['has_text']
-    assert len(page['images']) == 1
+    assert not page.has_text
+    assert len(page.images) == 1

-    pdfimage = page['images'][0]
+    pdfimage = page.images[0]
    assert pdfimage['width'] == 8
    assert pdfimage['color'] == 'gray'

@@ -85,7 +85,7 @@ def test_single_page_inline_image(outdir):

    pdfinfo = pageinfo.PdfInfo(filename)
    print(pdfinfo)
-    pdfimage = pdfinfo[0]['images'][0]
+    pdfimage = pdfinfo[0].images[0]
    assert (pdfimage['dpi_w'] - 8) < 1e-5
    assert pdfimage['color'] != '-'
    assert pdfimage['width'] == 8
@@ -96,7 +96,7 @@ def test_jpeg(resources, outdir):

    pdfinfo = pageinfo.PdfInfo(filename)

-    pdfimage = pdfinfo[0]['images'][0]
+    pdfimage = pdfinfo[0].images[0]
    assert pdfimage['enc'] == 'jpeg'
    assert (pdfimage['dpi_w'] - 150) < 1e-5

@@ -105,7 +105,7 @@ def test_form_xobject(resources):
    filename = resources / 'formxobject.pdf'

    pdfinfo = pageinfo.PdfInfo(filename)
-    pdfimage = pdfinfo[0]['images'][0]
+    pdfimage = pdfinfo[0].images[0]
    assert pdfimage['width'] == 50


@@ -113,5 +113,5 @@ def test_no_contents(resources):
    filename = resources / 'no_contents.pdf'

    pdfinfo = pageinfo.PdfInfo(filename)
-    assert len(pdfinfo[0]['images']) == 0
-    assert pdfinfo[0]['has_text'] == False
+    assert len(pdfinfo[0].images) == 0
+    assert pdfinfo[0].has_text == False