From cd04ae6949c0ea2cdfc999d9bc87a1049fc5f05e Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 18 May 2017 16:43:50 -0700 Subject: [PATCH] Refactor PdfInfo(str(filename)) -> PdfInfo(filename) --- ocrmypdf/pageinfo.py | 3 +++ tests/conftest.py | 2 +- tests/test_main.py | 20 ++++++++++---------- tests/test_pageinfo.py | 12 ++++++------ tests/test_tess4.py | 6 +++--- 5 files changed, 23 insertions(+), 20 deletions(-) diff --git a/ocrmypdf/pageinfo.py b/ocrmypdf/pageinfo.py index b8467f38..e2a32ec5 100644 --- a/ocrmypdf/pageinfo.py +++ b/ocrmypdf/pageinfo.py @@ -9,6 +9,7 @@ import sys import PyPDF2 as pypdf from collections import namedtuple import warnings +from pathlib import Path matrix_mult = pypdf.pdf.utils.matrixMultiply @@ -517,6 +518,8 @@ def _pdf_get_pageinfo(infile, pageno: int): def pdf_get_all_pageinfo(infile): + if isinstance(infile, Path): + infile = str(infile) pdf = pypdf.PdfFileReader(infile) return [_pdf_get_pageinfo(infile, n) for n in range(pdf.numPages)] diff --git a/tests/conftest.py b/tests/conftest.py index be0a6d82..24ea1035 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -130,6 +130,6 @@ def run_ocrmypdf(input_file, output_file, *args, env=None): @pytest.helpers.register def first_page_dimensions(pdf): from ocrmypdf import pageinfo - info = pageinfo.pdf_get_all_pageinfo(str(pdf)) + info = pageinfo.PdfInfo(pdf) page0 = info[0] return (page0['width_inches'], page0['height_inches']) diff --git a/tests/test_main.py b/tests/test_main.py index bb2eb122..bfe8bef2 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -230,7 +230,7 @@ def test_oversample(spoof_tesseract_cache, renderer, resources, outpdf): '-f', '--pdf-renderer', renderer, env=spoof_tesseract_cache) - pdfinfo = PdfInfo(str(oversampled_pdf)) + pdfinfo = PdfInfo(oversampled_pdf) print(pdfinfo[0]['xres']) assert abs(pdfinfo[0]['xres'] - 350) < 1 @@ -358,14 +358,14 @@ def test_autorotate_threshold( def test_ocr_timeout(renderer, resources, outpdf): out = check_ocrmypdf(resources / 'skew.pdf', outpdf, '--tesseract-timeout', '1.0') - pdfinfo = PdfInfo(str(out)) + pdfinfo = PdfInfo(out) assert not pdfinfo[0]['has_text'] def test_skip_big(spoof_tesseract_cache, resources, outpdf): out = check_ocrmypdf(resources / 'enormous.pdf', outpdf, '--skip-big', '10', env=spoof_tesseract_cache) - pdfinfo = PdfInfo(str(out)) + pdfinfo = PdfInfo(out) assert not pdfinfo[0]['has_text'] @@ -562,14 +562,14 @@ def test_algo4(resources, no_outpdf): def test_non_square_resolution(renderer, spoof_tesseract_cache, resources, outpdf): # Confirm input image is non-square resolution - in_pageinfo = PdfInfo(str(resources / 'aspect.pdf')) + in_pageinfo = PdfInfo(resources / 'aspect.pdf') assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres'] check_ocrmypdf( resources / 'aspect.pdf', outpdf, '--pdf-renderer', renderer, env=spoof_tesseract_cache) - out_pageinfo = PdfInfo(str(outpdf)) + out_pageinfo = PdfInfo(outpdf) # Confirm resolution was kept the same assert in_pageinfo[0]['xres'] == out_pageinfo[0]['xres'] @@ -585,7 +585,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache, from math import isclose # Confirm input image is non-square resolution - in_pageinfo = PdfInfo(str(resources / 'aspect.pdf')) + in_pageinfo = PdfInfo(resources / 'aspect.pdf') assert in_pageinfo[0]['xres'] != in_pageinfo[0]['yres'] # --force-ocr requires means forced conversion to square resolution @@ -594,7 +594,7 @@ def test_convert_to_square_resolution(renderer, spoof_tesseract_cache, '--force-ocr', '--pdf-renderer', renderer, env=spoof_tesseract_cache) - out_pageinfo = PdfInfo(str(outpdf)) + out_pageinfo = PdfInfo(outpdf) in_p0, out_p0 = in_pageinfo[0], out_pageinfo[0] @@ -628,7 +628,7 @@ def test_jbig2_passthrough(spoof_tesseract_cache, resources, outpdf): '--pdf-renderer', 'hocr', env=spoof_tesseract_cache) - out_pageinfo = PdfInfo(str(out)) + out_pageinfo = PdfInfo(out) assert out_pageinfo[0]['images'][0]['enc'] == 'jbig2' @@ -713,7 +713,7 @@ def test_rotated_skew_timeout(resources, outpdf): '--pdf-renderer', 'hocr', '--deskew', '--tesseract-timeout', '0') - out_pageinfo = PdfInfo(str(out))[0] + out_pageinfo = PdfInfo(out)[0] assert out_pageinfo['height_pixels'] > out_pageinfo['width_pixels'], \ "Expected the output page to be portrait" @@ -970,7 +970,7 @@ def test_sidecar_pagecount(spoof_tesseract_cache, resources, outpdf): '--sidecar', sidecar, env=spoof_tesseract_cache) - pdfinfo = PdfInfo(str(resources / 'multipage.pdf')) + pdfinfo = PdfInfo(resources / 'multipage.pdf') num_pages = len(pdfinfo) with open(sidecar, 'r') as f: diff --git a/tests/test_pageinfo.py b/tests/test_pageinfo.py index a860cfb9..d6bd0daf 100644 --- a/tests/test_pageinfo.py +++ b/tests/test_pageinfo.py @@ -26,7 +26,7 @@ def test_single_page_text(outdir): pdf.showPage() pdf.save() - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) assert len(pdfinfo) == 1 page = pdfinfo[0] @@ -53,7 +53,7 @@ def test_single_page_image(outdir): layout_fun=layout_fun) filename.write_bytes(pdf_bytes) - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) assert len(pdfinfo) == 1 page = pdfinfo[0] @@ -83,7 +83,7 @@ def test_single_page_inline_image(outdir): pdf.showPage() pdf.save() - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) print(pdfinfo) pdfimage = pdfinfo[0]['images'][0] assert (pdfimage['dpi_w'] - 8) < 1e-5 @@ -94,7 +94,7 @@ def test_single_page_inline_image(outdir): def test_jpeg(resources, outdir): filename = resources / 'c02-22.pdf' - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) pdfimage = pdfinfo[0]['images'][0] assert pdfimage['enc'] == 'jpeg' @@ -104,7 +104,7 @@ def test_jpeg(resources, outdir): def test_form_xobject(resources): filename = resources / 'formxobject.pdf' - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) pdfimage = pdfinfo[0]['images'][0] assert pdfimage['width'] == 50 @@ -112,6 +112,6 @@ def test_form_xobject(resources): def test_no_contents(resources): filename = resources / 'no_contents.pdf' - pdfinfo = pageinfo.PdfInfo(str(filename)) + pdfinfo = pageinfo.PdfInfo(filename) assert len(pdfinfo[0]['images']) == 0 assert pdfinfo[0]['has_text'] == False \ No newline at end of file diff --git a/tests/test_tess4.py b/tests/test_tess4.py index b6d60860..47cad81f 100644 --- a/tests/test_tess4.py +++ b/tests/test_tess4.py @@ -96,9 +96,9 @@ def test_skip_pages_does_not_replicate( env=ensure_tess4 ) - info_in = pageinfo.pdf_get_all_pageinfo(str(infile)) + info_in = pageinfo.PdfInfo(infile) - info = pageinfo.pdf_get_all_pageinfo(str(outpdf)) + info = pageinfo.PdfInfo(outpdf) for page in info: assert len(page['images']) == 1, "skipped page was replicated" @@ -115,6 +115,6 @@ def test_content_preservation(ensure_tess4, resources, outpdf): env=ensure_tess4 ) - info = pageinfo.pdf_get_all_pageinfo(str(outpdf)) + info = pageinfo.PdfInfo(outpdf) page = info[0] assert len(page['images']) > 1, "masked were rasterized" \ No newline at end of file