mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-05-06 13:47:41 -04:00
Add feature to remove vector graphics objects
This commit is contained in:
@@ -244,6 +244,11 @@ preprocessing.add_argument(
|
||||
'--oversample', metavar='DPI', type=numeric(int, 0, 5000), default=0,
|
||||
help="Oversample images to at least the specified DPI, to improve OCR "
|
||||
"results slightly")
|
||||
preprocessing.add_argument(
|
||||
'--remove-vectors', action='store_true',
|
||||
help="EXPERIMENTAL. Remove any vector graphics objects from the PDF, "
|
||||
"including text rendered as curves. Useful when these objects "
|
||||
"interfere with OCR.")
|
||||
|
||||
ocrsettings = parser.add_argument_group(
|
||||
"OCR options",
|
||||
|
||||
@@ -493,7 +493,8 @@ def rasterize_with_ghostscript(
|
||||
ghostscript.rasterize_pdf(
|
||||
input_file, output_file, xres=canvas_dpi, yres=canvas_dpi,
|
||||
raster_device=device, log=log, page_dpi=(page_dpi, page_dpi),
|
||||
pageno=page_number(input_file), rotation=correction)
|
||||
pageno=page_number(input_file), rotation=correction,
|
||||
filter_vector=options.remove_vectors)
|
||||
|
||||
|
||||
def preprocess_remove_background(
|
||||
|
||||
@@ -101,7 +101,7 @@ def extract_text(input_file, pageno=1):
|
||||
|
||||
|
||||
def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
pageno=1, page_dpi=None, rotation=None):
|
||||
pageno=1, page_dpi=None, rotation=None, filter_vector=False):
|
||||
"""Rasterize one page of a PDF at resolution (xres, yres) in canvas units.
|
||||
|
||||
The image is sized to match the integer pixel dimensions implied by
|
||||
@@ -116,6 +116,8 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
:param log:
|
||||
:param pageno: page number to rasterize (beginning at page 1)
|
||||
:param page_dpi: resolution tuple (x, y) overriding output image DPI
|
||||
:param rotation: 0, 90, 180, 270: clockwise angle to rotate page
|
||||
:param filter_vector: if True, remove vector graphics objects
|
||||
:return:
|
||||
"""
|
||||
res = xres, yres
|
||||
@@ -134,6 +136,7 @@ def rasterize_pdf(input_file, output_file, xres, yres, raster_device, log,
|
||||
'-dFirstPage=%i' % pageno,
|
||||
'-dLastPage=%i' % pageno,
|
||||
'-r{0}x{1}'.format(str(int_res[0]), str(int_res[1])),
|
||||
] + (['-dFILTERVECTOR'] if filter_vector else []) + [
|
||||
'-o', tmp.name,
|
||||
'-dAutoRotatePages=/None', # Probably has no effect on raster
|
||||
'-f',
|
||||
|
||||
Reference in New Issue
Block a user