Add feature to remove vector graphics objects

This commit is contained in:
James R. Barlow
2018-10-18 21:46:08 -07:00
parent f7dbf94071
commit fa48205bb8
3 changed files with 11 additions and 2 deletions

View File

@@ -244,6 +244,11 @@ preprocessing.add_argument(
'--oversample', metavar='DPI', type=numeric(int, 0, 5000), default=0,
help="Oversample images to at least the specified DPI, to improve OCR "
"results slightly")
# Opt-in switch: action='store_true' keeps the value False unless the flag is
# passed on the command line. The help text marks the feature EXPERIMENTAL.
preprocessing.add_argument(
'--remove-vectors', action='store_true',
help="EXPERIMENTAL. Remove any vector graphics objects from the PDF, "
"including text rendered as curves. Useful when these objects "
"interfere with OCR.")
ocrsettings = parser.add_argument_group(
"OCR options",

View File

@@ -493,7 +493,8 @@ def rasterize_with_ghostscript(
ghostscript.rasterize_pdf(
input_file, output_file, xres=canvas_dpi, yres=canvas_dpi,
raster_device=device, log=log, page_dpi=(page_dpi, page_dpi),
pageno=page_number(input_file), rotation=correction)
pageno=page_number(input_file), rotation=correction,
filter_vector=options.remove_vectors)
def preprocess_remove_background(

View File

@@ -101,7 +101,7 @@ def extract_text(input_file, pageno=1):
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
pageno=1, page_dpi=None, rotation=None):
pageno=1, page_dpi=None, rotation=None, filter_vector=False):
"""Rasterize one page of a PDF at resolution (xres, yres) in canvas units.
The image is sized to match the integer pixels dimensions implied by
@@ -116,6 +116,8 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
:param log:
:param pageno: page number to rasterize (beginning at page 1)
:param page_dpi: resolution tuple (x, y) overriding output image DPI
:param rotation: 0, 90, 180, 270: clockwise angle to rotate page
:param filter_vector: if True, remove vector graphics objects
:return:
"""
res = xres, yres
@@ -134,6 +136,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
'-dFirstPage=%i' % pageno,
'-dLastPage=%i' % pageno,
'-r{0}x{1}'.format(str(int_res[0]), str(int_res[1])),
] + (['-dFILTERVECTOR'] if filter_vector else []) + [
'-o', tmp.name,
'-dAutoRotatePages=/None', # Probably has no effect on raster
'-f',