diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index 5876fc5c..1f9ab6e2 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -551,6 +551,14 @@ class PageInfo(MutableMapping): def height_inches(self): return self._pageinfo['height_inches'] + @property + def width_pixels(self): + return int(round(self.width_inches * self.xres)) + + @property + def height_pixels(self): + return int(round(self.height_inches * self.yres)) + @property def rotation(self): return self._pageinfo['rotate'] @@ -594,7 +602,6 @@ class PdfInfo: return self._pages[index] def __getitem__(self, item): - warnings.warn("pageinfo[item] is deprecated", DeprecationWarning) return self._pages[item] def __len__(self): diff --git a/ocrmypdf/pipeline.py b/ocrmypdf/pipeline.py index 77a7bc34..196cfe79 100644 --- a/ocrmypdf/pipeline.py +++ b/ocrmypdf/pipeline.py @@ -210,9 +210,9 @@ def get_page_square_dpi(pageinfo, options): def is_ocr_required(pageinfo, log, options): - page = pageinfo['pageno'] + 1 + page = pageinfo.pageno + 1 ocr_required = True - if not pageinfo['images']: + if not pageinfo.images: if options.force_ocr and options.oversample: # The user really wants to reprocess this file log.info( @@ -234,7 +234,7 @@ def is_ocr_required(pageinfo, log, options): "skipping all processing on this page".format(page)) ocr_required = False - elif pageinfo['has_text']: + elif pageinfo.has_text: msg = "{0:4d}: page already has text! – {1}" if not options.force_ocr and not options.skip_text: @@ -250,8 +250,8 @@ def is_ocr_required(pageinfo, log, options): "skipping all processing on this page")) ocr_required = False - if ocr_required and options.skip_big and pageinfo['images']: - pixel_count = pageinfo['width_pixels'] * pageinfo['height_pixels'] + if ocr_required and options.skip_big and pageinfo.images: + pixel_count = pageinfo.width_pixels * pageinfo.height_pixels if pixel_count > (options.skip_big * 1000000): ocr_required = False log.warning( @@ -396,15 +396,15 @@ def rasterize_with_ghostscript( pageinfo = get_pageinfo(input_file, context) device = 'png16m' # 24-bit - if pageinfo['images']: - if all(image['comp'] == 1 for image in pageinfo['images']): - if all(image['bpc'] == 1 for image in pageinfo['images']): + if pageinfo.images: + if all(image['comp'] == 1 for image in pageinfo.images): + if all(image['bpc'] == 1 for image in pageinfo.images): device = 'pngmono' elif all(image['bpc'] > 1 and image['color'] == 'index' - for image in pageinfo['images']): + for image in pageinfo.images): device = 'png256' elif all(image['bpc'] > 1 and image['color'] == 'gray' - for image in pageinfo['images']): + for image in pageinfo.images): device = 'pnggray' log.debug("Rasterize {0} with {1}".format( @@ -430,11 +430,11 @@ def preprocess_remove_background( pageinfo = get_pageinfo(input_file, context) - if any(image['bpc'] > 1 for image in pageinfo['images']): + if any(image['bpc'] > 1 for image in pageinfo.images): leptonica.remove_background(input_file, output_file) else: log.info("{0:4d}: background removal skipped on mono page".format( - pageinfo['pageno'])) + pageinfo.pageno)) re_symlink(input_file, output_file, log) @@ -521,8 +521,8 @@ def select_visible_page_image( image = next(ii for ii in infiles if ii.endswith(image_suffix)) pageinfo = get_pageinfo(image, context) - if pageinfo['images'] and \ - all(im['enc'] == 'jpeg' for im in pageinfo['images']): + if pageinfo.images and \ + all(im['enc'] == 'jpeg' for im in pageinfo.images): log.debug('{:4d}: JPEG input -> JPEG output'.format( page_number(image))) # If all images were JPEGs originally, produce a JPEG as output diff --git a/tests/test_main.py b/tests/test_main.py index bfe8bef2..c1cfea60 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -232,8 +232,8 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf): pdfinfo = PdfInfo(oversampled_pdf) - print(pdfinfo[0]['xres']) - assert abs(pdfinfo[0]['xres'] - 350) < 1 + print(pdfinfo[0].xres) + assert abs(pdfinfo[0].xres - 350) < 1 def test_repeat_ocr(resources, no_outpdf): @@ -245,7 +245,7 @@ def test_force_ocr(spoof_tesseract_cache, resources, outpdf): out = check_ocrmypdf(resources / 'graph_ocred.pdf', outpdf, '-f', env=spoof_tesseract_cache) pdfinfo = PdfInfo(out) - assert pdfinfo[0]['has_text'] + assert pdfinfo[0].has_text def test_skip_ocr(spoof_tesseract_cache, resources, outpdf): @@ -359,14 +359,14 @@ def test_ocr_timeout(renderer, resources, outpdf): out = check_ocrmypdf(resources / 'skew.pdf', outpdf, '--tesseract-timeout', '1.0') pdfinfo = PdfInfo(out) - assert not pdfinfo[0]['has_text'] + assert not pdfinfo[0].has_text def test_skip_big(spoof_tesseract_cache, resources, outpdf): out = check_ocrmypdf(resources / 'enormous.pdf', outpdf, '--skip-big', '10', env=spoof_tesseract_cache) pdfinfo = PdfInfo(out) - assert not pdfinfo[0]['has_text'] + assert not pdfinfo[0].has_text @pytest.mark.parametrize('renderer', ['hocr', 'tesseract']) @@ -563,7 +563,7 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache, resources, outpdf): # Confirm input image is non-square resolution in_pageinfo = PdfInfo(resources / 'aspect.pdf') - assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres'] + assert in_pageinfo[0].xres != in_pageinfo[0].yres check_ocrmypdf( resources / 'aspect.pdf', outpdf, @@ -572,8 +572,8 @@ def test_non_square_resolution(renderer, spoof_tesseract_cache, out_pageinfo = PdfInfo(outpdf) # Confirm resolution was kept the same - assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres'] - assert in_pageinfo[0]['yres'] == out_pageinfo[0]['yres'] + assert in_pageinfo[0].xres == out_pageinfo[0].xres + assert in_pageinfo[0].yres == out_pageinfo[0].yres @pytest.mark.parametrize('renderer', [ @@ -586,7 +586,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache, # Confirm input image is non-square resolution in_pageinfo = PdfInfo(resources / 'aspect.pdf') - assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres'] + assert in_pageinfo[0].xres != in_pageinfo[0].yres # --force-ocr requires means forced conversion to square resolution check_ocrmypdf( @@ -599,20 +599,20 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache, in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0] # Resolution show now be equal - assert out_p0['xres'] == out_p0['yres'] + assert out_p0.xres == out_p0.yres # Page size should match input page size - assert isclose(in_p0['width_inches'], - out_p0['width_inches']) - assert isclose(in_p0['height_inches'], - out_p0['height_inches']) + assert isclose(in_p0.width_inches, + out_p0.width_inches) + assert isclose(in_p0.height_inches, + out_p0.height_inches) # Because we rasterized the page to produce a new image, it should occupy # the entire page - out_im_w = out_p0['images'][0]['width'] / out_p0['images'][0]['dpi_w'] - out_im_h = out_p0['images'][0]['height'] / out_p0['images'][0]['dpi_h'] - assert isclose(out_p0['width_inches'], out_im_w) - assert isclose(out_p0['height_inches'], out_im_h) + out_im_w = out_p0.images[0]['width'] / out_p0.images[0]['dpi_w'] + out_im_h = out_p0.images[0]['height'] / out_p0.images[0]['dpi_h'] + assert isclose(out_p0.width_inches, out_im_w) + assert isclose(out_p0.height_inches, out_im_h) def test_image_to_pdf(spoof_tesseract_noop, resources, outpdf): @@ -629,7 +629,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf): env=spoof_tesseract_cache) out_pageinfo = PdfInfo(out) - assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2' + assert out_pageinfo[0].images[0]['enc'] == 'jbig2' def test_stdin(spoof_tesseract_noop, ocrmypdf_exec, resources, outpdf): @@ -704,9 +704,9 @@ def test_rotated_skew_timeout(resources, outpdf): input_file = str(resources / 'rotated_skew.pdf') in_pageinfo = PdfInfo(input_file)[0] - assert in_pageinfo['height_pixels'] < in_pageinfo['width_pixels'], \ + assert in_pageinfo.height_pixels < in_pageinfo.width_pixels, \ "Expected the input page to be landscape" - assert in_pageinfo['rotate'] == 90, "Expected a rotated page" + assert in_pageinfo.rotation == 90, "Expected a rotated page" out = check_ocrmypdf( input_file, outpdf, @@ -715,14 +715,14 @@ def test_rotated_skew_timeout(resources, outpdf): out_pageinfo = PdfInfo(out)[0] - assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \ + assert out_pageinfo.height_pixels > out_pageinfo.width_pixels, \ "Expected the output page to be portrait" - assert out_pageinfo['rotate'] == 0, \ + assert out_pageinfo.rotation == 0, \ "Expected no page rotation for output" - assert in_pageinfo['width_pixels'] == out_pageinfo['height_pixels'] and \ - in_pageinfo['height_pixels'] == out_pageinfo['width_pixels'], \ + assert in_pageinfo.width_pixels == out_pageinfo.height_pixels and \ + in_pageinfo.height_pixels == out_pageinfo.width_pixels, \ "Expected page rotation to be baked in" @@ -745,7 +745,7 @@ def test_very_high_dpi(spoof_tesseract_cache, resources, outpdf): env=spoof_tesseract_cache) pdfinfo = PdfInfo(outpdf) - image = pdfinfo[0]['images'][0] + image = pdfinfo[0].images[0] assert image['dpi_w'] == image['dpi_h'] assert image['dpi_w'] == 2400 @@ -902,7 +902,7 @@ def test_compression_preserved(spoof_tesseract_noop, ocrmypdf_exec, pdfinfo = PdfInfo(output_file) - pdfimage = pdfinfo[0]['images'][0] + pdfimage = pdfinfo[0].images[0] if input_file.endswith('.png'): assert pdfimage['enc'] != 'jpeg', \ @@ -947,7 +947,7 @@ def test_compression_changed(spoof_tesseract_noop, ocrmypdf_exec, pdfinfo = PdfInfo(output_file) - pdfimage = pdfinfo[0]['images'][0] + pdfimage = pdfinfo[0].images[0] if compression == 'jpeg': assert pdfimage['enc'] == 'jpeg' diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py index d6bd0daf..dbaafbf4 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pageinfo.py @@ -31,8 +31,8 @@ def test_single_page_text(outdir): assert len(pdfinfo) == 1 page = pdfinfo[0] - assert page['has_text'] - assert len(page['images']) == 0 + assert page.has_text + assert len(page.images) == 0 def test_single_page_image(outdir): @@ -58,10 +58,10 @@ def test_single_page_image(outdir): assert len(pdfinfo) == 1 page = pdfinfo[0] - assert not page['has_text'] - assert len(page['images']) == 1 + assert not page.has_text + assert len(page.images) == 1 - pdfimage = page['images'][0] + pdfimage = page.images[0] assert pdfimage['width'] == 8 assert pdfimage['color'] == 'gray' @@ -85,7 +85,7 @@ def test_single_page_inline_image(outdir): pdfinfo = pageinfo.PdfInfo(filename) print(pdfinfo) - pdfimage = pdfinfo[0]['images'][0] + pdfimage = pdfinfo[0].images[0] assert (pdfimage['dpi_w'] - 8) < 1e-5 assert pdfimage['color'] != '-' assert pdfimage['width'] == 8 @@ -96,7 +96,7 @@ def test_jpeg(resources, outdir): pdfinfo = pageinfo.PdfInfo(filename) - pdfimage = pdfinfo[0]['images'][0] + pdfimage = pdfinfo[0].images[0] assert pdfimage['enc'] == 'jpeg' assert (pdfimage['dpi_w'] - 150) < 1e-5 @@ -105,7 +105,7 @@ def test_form_xobject(resources): filename = resources / 'formxobject.pdf' pdfinfo = pageinfo.PdfInfo(filename) - pdfimage = pdfinfo[0]['images'][0] + pdfimage = pdfinfo[0].images[0] assert pdfimage['width'] == 50 @@ -113,5 +113,5 @@ def test_no_contents(resources): filename = resources / 'no_contents.pdf' pdfinfo = pageinfo.PdfInfo(filename) - assert len(pdfinfo[0]['images']) == 0 - assert pdfinfo[0]['has_text'] == False \ No newline at end of file + assert len(pdfinfo[0].images) == 0 + assert pdfinfo[0].has_text == False \ No newline at end of file