From b40eec4cb0b07d30b8dec32b6ebfbe1efc6c9559 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Mon, 27 Jul 2015 17:18:02 -0700 Subject: [PATCH] Add --oversample test for hocr rendering --- flowchart.svg | 230 ------------------------------------- ocrmypdf/main.py | 6 +- pipeline.svg | 274 ++++++++++++++++++++++++--------------------- tests/test_main.py | 12 ++ 4 files changed, 162 insertions(+), 360 deletions(-) delete mode 100644 flowchart.svg diff --git a/flowchart.svg b/flowchart.svg deleted file mode 100644 index 65f3fe16..00000000 --- a/flowchart.svg +++ /dev/null @@ -1,230 +0,0 @@ - - - - - - -Pipeline: - -clustertasks - -Pipeline: - - -t0 - - - - -repair_pdf - - -t1 - - -split_pages - - -t0->t1 - - - - -t10 - - - - -generate_postscript_stub - - -t0->t10 - - - - -t2 - - - - -rasterize_with_ghostscript - - -t1->t2 - - - - -t11 - - - - -skip_page - - -t1->t11 - - - - -t3 - - - - -preprocess_deskew - - -t2->t3 - - - - -t6 - - - - -select_image_for_pdf - - -t2->t6 - - - - -t4 - - - - -preprocess_clean - - -t3->t4 - - - - -t3->t6 - - - - -t4->t6 - - - - -t5 - - - - -ocr_tesseract_hocr - - -t4->t5 - - - - -t9 - - - - -tesseract_ocr_and_render_pdf - - -t4->t9 - - - - -t7 - - - - -render_hocr_page - - -t6->t7 - - - - -t8 - - - - -render_hocr_debug_page - - -t6->t8 - - - - -t5->t7 - - - - -t5->t8 - - - - -t12 - - -merge_pages - - -t7->t12 - - - - -t8->t12 - - - - -t11->t12 - - - - -t9->t12 - - - - -t10->t12 - - - - -t13 - - - - -validate_pdfa - - -t12->t13 - - - - - diff --git a/ocrmypdf/main.py b/ocrmypdf/main.py index b7a8605e..ca740eb6 100755 --- a/ocrmypdf/main.py +++ b/ocrmypdf/main.py @@ -162,6 +162,8 @@ if _argv[0].startswith('python'): _argv = _argv[1:] if _argv[0].endswith('.py'): _argv = _argv[1:] +if _argv[0].startswith('ocrmypdf'): + _argv = _argv[1:] options = parser.parse_args(_argv) @@ -576,7 +578,7 @@ def render_hocr_page( image = next(ii for ii in infiles if ii.endswith('.image')) pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) - dpi = round(max(pageinfo['xres'], pageinfo['yres'])) + dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample)) hocrtransform = HocrTransform(hocr, dpi) hocrtransform.to_pdf(output_file, imageFileName=image, @@ -600,7 +602,7 @@ def render_hocr_debug_page( image = next(ii for ii in infiles if ii.endswith('.image')) pageinfo = get_pageinfo(image, pdfinfo, pdfinfo_lock) - dpi = round(max(pageinfo['xres'], pageinfo['yres'])) + dpi = round(max(pageinfo['xres'], pageinfo['yres'], options.oversample)) hocrtransform = HocrTransform(hocr, dpi) hocrtransform.to_pdf(output_file, imageFileName=None, diff --git a/pipeline.svg b/pipeline.svg index 57d3c88c..65f3fe16 100644 --- a/pipeline.svg +++ b/pipeline.svg @@ -4,209 +4,227 @@ - + Pipeline: - + clustertasks - -Pipeline: + +Pipeline: t0 - - - - -repair_pdf + + + + +repair_pdf t1 - - -split_pages + + +split_pages t0->t1 - - + + - -t9 - - - - -generate_postscript_stub + +t10 + + + + +generate_postscript_stub - -t0->t9 - - + +t0->t10 + + t2 - - - - -rasterize_with_ghostscript + + + + +rasterize_with_ghostscript t1->t2 - - + + - -t10 - - - - -skip_page + +t11 + + + + +skip_page - -t1->t10 - - + +t1->t11 + + t3 - - - - -preprocess_deskew + + + + +preprocess_deskew t2->t3 - - + + t6 - - - - -select_image_for_pdf + + + + +select_image_for_pdf t2->t6 - - + + t4 - - - - -preprocess_clean + + + + +preprocess_clean t3->t4 - - + + t3->t6 - - + + t4->t6 - - + + t5 - - - - -ocr_tesseract + + + + +ocr_tesseract_hocr t4->t5 - - + + + + +t9 + + + + +tesseract_ocr_and_render_pdf + + +t4->t9 + + t7 - - - - -render_page + + + + +render_hocr_page t6->t7 - - + + t8 - - - - -render_debug_page + + + + +render_hocr_debug_page t6->t8 - - + + t5->t7 - - + + t5->t8 - - - - -t11 - - -merge_pages - - -t7->t11 - - - - -t8->t11 - - - - -t10->t11 - - - - -t9->t11 - - + + t12 - - - - -validate_pdfa + + +merge_pages + + +t7->t12 + + + + +t8->t12 + + -t11->t12 - - +t11->t12 + + + + +t9->t12 + + + + +t10->t12 + + + + +t13 + + + + +validate_pdfa + + +t12->t13 + + diff --git a/tests/test_main.py b/tests/test_main.py index 2d893bb0..fba68e02 100644 --- a/tests/test_main.py +++ b/tests/test_main.py @@ -114,3 +114,15 @@ def test_metadata(): assert pdfinfo['Author'] == '孔子' assert pdfinfo['Subject'] == 'U+1030C is: 𐌌' assert pdfinfo.get('Keywords', '') == '' + + +def test_oversample(): + oversampled_pdf = run_ocrmypdf( + 'skew.pdf', 'test-oversample.pdf', '--oversample', '300') + + from ocrmypdf.pageinfo import pdf_get_all_pageinfo + + pdfinfo = pdf_get_all_pageinfo(oversampled_pdf) + + print(pdfinfo[0]['xres']) + assert abs(pdfinfo[0]['xres'] - 300) < 1