From fa48205bb8fa74484f542ea1f9ba6ef02f5c2135 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 18 Oct 2018 21:46:08 -0700 Subject: [PATCH] Add feature to remove vector graphics objects --- src/ocrmypdf/__main__.py | 5 +++++ src/ocrmypdf/_pipeline.py | 3 ++- src/ocrmypdf/exec/ghostscript.py | 5 ++++- 3 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index 3e69effd..175118d4 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -244,6 +244,11 @@ preprocessing.add_argument( '--oversample', metavar='DPI', type=numeric(int, 0, 5000), default=0, help="Oversample images to at least the specified DPI, to improve OCR " "results slightly") +preprocessing.add_argument( + '--remove-vectors', action='store_true', + help="EXPERIMENTAL. Remove any vector graphics objects from the PDF, " + "including text rendered as curves. Useful when these objects " + "interfere with OCR.") ocrsettings = parser.add_argument_group( "OCR options", diff --git a/src/ocrmypdf/_pipeline.py b/src/ocrmypdf/_pipeline.py index e114d6b6..5bb057d6 100644 --- a/src/ocrmypdf/_pipeline.py +++ b/src/ocrmypdf/_pipeline.py @@ -493,7 +493,8 @@ def rasterize_with_ghostscript( ghostscript.rasterize_pdf( input_file, output_file, xres=canvas_dpi, yres=canvas_dpi, raster_device=device, log=log, page_dpi=(page_dpi, page_dpi), - pageno=page_number(input_file), rotation=correction) + pageno=page_number(input_file), rotation=correction, + filter_vector=options.remove_vectors) def preprocess_remove_background( diff --git a/src/ocrmypdf/exec/ghostscript.py b/src/ocrmypdf/exec/ghostscript.py index 60a0f3c6..d1abfe96 100644 --- a/src/ocrmypdf/exec/ghostscript.py +++ b/src/ocrmypdf/exec/ghostscript.py @@ -101,7 +101,7 @@ def extract_text(input_file, pageno=1): def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log, - pageno=1, page_dpi=None, rotation=None): + pageno=1, page_dpi=None, rotation=None, filter_vector=False): """Rasterize one page of a PDF at resolution (xres, yres) in canvas units. The image is sized to match the integer pixels dimensions implied by @@ -116,6 +116,8 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log, :param log: :param pageno: page number to rasterize (beginning at page 1) :param page_dpi: resolution tuple (x, y) overriding output image DPI + :param rotation: 0, 90, 180, 270: clockwise angle to rotate page + :param filter_vector: if True, remove vector graphics objects :return: """ res = xres, yres @@ -134,6 +136,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log, '-dFirstPage=%i' % pageno, '-dLastPage=%i' % pageno, '-r{0}x{1}'.format(str(int_res[0]), str(int_res[1])), + ] + (['-dFILTERVECTOR'] if filter_vector else []) + [ '-o', tmp.name, '-dAutoRotatePages=/None', # Probably has no effect on raster '-f',