From 700abbb8a51790bb1fba62c6c13fbc8f933aa92a Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Sat, 10 Nov 2018 15:48:41 -0800 Subject: [PATCH] Documentation for OCR quality features --- .gitignore | 1 + docs/release_notes.rst | 8 +++++++- src/ocrmypdf/__main__.py | 4 ++-- 3 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.gitignore b/.gitignore index 610fe588..e545c422 100644 --- a/.gitignore +++ b/.gitignore @@ -32,6 +32,7 @@ htmlcov/ *.profile /*.pdf /*.qdf +/*.png /scratch.py IDEAS log/ diff --git a/docs/release_notes.rst b/docs/release_notes.rst index b6be039f..d36cf4d4 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -29,7 +29,13 @@ v7.3.0 - OCRmyPDF now warns when a PDF that contains Adobe AcroForms, since such files probably do not need OCR. It can work with these files. -- Added a new feature ``--mask-barcodes`` to detect and suppress barcodes in files. We have observed that barcodes can interfere with OCR. +- Added three new **experimental** features. The name, syntax and behavior of these arguments are subject to change. They may also be incompatible with some other features. + + - ``--remove-vectors`` which strips out vector graphics. This can improve OCR quality since OCR will not search artwork for readable text; however, it currently removes "text as curves" as well. + + - ``--mask-barcodes`` to detect and suppress barcodes in files. We have observed that barcodes can interfere with OCR. + + - ``--threshold`` which uses a more sophisticated thresholding algorithm than is currently in use in Tesseract OCR. This works around a `known issue in Tesseract `_ with text on bright backgrounds. - Fixed an issue where an error message was not reported when the installed Ghostscript was very old. 
diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index 2caf45a4..52ae8629 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -250,12 +250,12 @@ preprocessing.add_argument( "will not be included in OCR. This can eliminate false characters.") preprocessing.add_argument( '--mask-barcodes', action='store_true', - help="Mask out any barcodes that appear in the PDF so they are not " + help="EXPERIMENTAL. Mask out any barcodes that appear in the PDF so they are not " "considered during OCR. Barcodes can introduce false characters into " "OCR.") preprocessing.add_argument( '--threshold', action='store_true', - help="Threshold image to 1bpp before sending it to Tesseract for OCR. Can " + help="EXPERIMENTAL. Threshold image to 1bpp before sending it to Tesseract for OCR. Can " "improve OCR quality compared to Tesseract's thresholder.") ocrsettings = parser.add_argument_group(