From 7f462c618b579d192235e191a38b5b6864f78cb9 Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Tue, 3 Apr 2018 00:11:20 -0700 Subject: [PATCH 1/3] v6.1.3 notes --- docs/release_notes.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/docs/release_notes.rst b/docs/release_notes.rst index 087d2832..017fae17 100644 --- a/docs/release_notes.rst +++ b/docs/release_notes.rst @@ -5,6 +5,12 @@ OCRmyPDF uses `semantic versioning `_ for its command line i The OCRmyPDF package itself does not contain a public API, although it is fairly stable and breaking changes are usually timed with a major release. A future release will clearly define the stable public API. +v6.1.3 +------ + +- Fix issue #247, ``/CreationDate`` metadata not copied from input to output. + + v6.1.2 ------ From 1dbb6f1746c31c74cc53404b2646067b19f7da8c Mon Sep 17 00:00:00 2001 From: "James R. Barlow" Date: Thu, 5 Apr 2018 02:15:01 -0700 Subject: [PATCH 2/3] Notes on relevant envvars, repology --- docs/advanced.rst | 18 +++++++++++++++--- docs/installation.rst | 6 ++++++ 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/docs/advanced.rst b/docs/advanced.rst index da4a4ab5..349a953e 100644 --- a/docs/advanced.rst +++ b/docs/advanced.rst @@ -19,7 +19,7 @@ If ``--force-ocr`` is issued, then all pages will be rasterized to images, disca Time and image size limits """""""""""""""""""""""""" -By default, OCRmyPDF permits tesseract to run for only three minutes (180 seconds) per page. This is usually more than enough time to find all text on a reasonably sized page with modern hardware. +By default, OCRmyPDF permits tesseract to run for three minutes (180 seconds) per page. This is usually more than enough time to find all text on a reasonably sized page with modern hardware. If a page is skipped, it will be inserted without OCR. If preprocessing was requested, the preprocessed image layer will be inserted. 
@@ -33,11 +33,17 @@ If you want to adjust the amount of time spent on OCR, change ``--tesseract-time Overriding default tesseract """""""""""""""""""""""""""" -OCRmyPDF checks the system ``PATH`` for the ``tesseract`` binary. +OCRmyPDF checks the system ``PATH`` for the ``tesseract`` binary. + +Some relevant environment variables that influence Tesseract's behavior include: .. envvar:: TESSDATA_PREFIX - A Tesseract environment variable that overrides the path to Tesseract's data files. + Overrides the path to Tesseract's data files. This can allow simultaneous installation of the "best" and "fast" training data sets. OCRmyPDF does not manage this environment variable. + +.. envvar:: OMP_THREAD_LIMIT + + Controls the number of threads Tesseract will use. OCRmyPDF will manage this environment variable if it is not already set. (Currently, it will set it to 1 because this gives the best results in testing.) For example, if you are testing tesseract 4.00 and don't wish to use an existing tesseract 3.04 installation, you can launch OCRmyPDF as follows: @@ -106,6 +112,8 @@ The ``sandwich`` renderer The ``sandwich`` renderer uses Tesseract's new text-only PDF feature, which produces a PDF page that lays out the OCR in invisible text. This page is then "sandwiched" onto the original PDF page, allowing lossless application of OCR even to PDF pages that contain other vector objects. +Currently this is the best renderer for most uses; however, it is implemented in Tesseract, so OCRmyPDF cannot influence it. Currently some problematic PDF viewers like Mozilla PDF.js and macOS Preview have problems with segmenting its text output, and mightrunseveralwordstogether. + When image preprocessing features like ``--deskew`` are used, the original PDF will be rendered as a full page and the OCR layer will be placed on top. This renderer requires Tesseract 3.05.01 or newer. @@ -115,6 +123,10 @@ The ``hocr`` renderer The ``hocr`` renderer works with older versions of Tesseract. 
The image layer is copied from the original PDF page if possible, avoiding potentially lossy transcoding or loss of other PDF information. If preprocessing is specified, then the image layer is a new PDF. +Unlike ``sandwich``, this renderer is implemented within OCRmyPDF; anyone looking to customize how OCR is presented should look here. A major disadvantage of this renderer is that it is not capable of correctly handling text outside the Latin alphabet. Pull requests to improve the situation are welcome. + +Currently, this renderer has the best compatibility with Mozilla's PDF.js viewer. + This works in all versions of Tesseract. The ``tesseract`` renderer diff --git a/docs/installation.rst b/docs/installation.rst index 6bb3c066..0443acae 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -386,3 +386,9 @@ If not yet installed, the script will notify you about dependencies that need to be installed. The script requires specific versions of the dependencies. Older version than the ones mentioned in the release notes are likely not to be compatible to OCRmyPDF. + + +Other Linux packages +-------------------- + +See the `Repology `_ page. From be41ff6d5436c2433360b894e56766ecab49391f Mon Sep 17 00:00:00 2001 From: "James R. 
Barlow" Date: Thu, 5 Apr 2018 21:23:44 -0700 Subject: [PATCH 3/3] Update flowchart [ci skip] --- docs/pipeline.svg | 875 +++++++++++++++++++++++---------------- src/ocrmypdf/__main__.py | 1 + 2 files changed, 519 insertions(+), 357 deletions(-) diff --git a/docs/pipeline.svg b/docs/pipeline.svg index 341d5d76..a8ac187e 100644 --- a/docs/pipeline.svg +++ b/docs/pipeline.svg @@ -1,392 +1,553 @@ - - - + + Pipeline: - -clustertasks - -Pipeline: + + +clustertasks + +Pipeline: -t0 - - - - -ocrmypdf.pipeline.triage + +t0 + + + + +ocrmypdf.pipeline.triage -t1 - - - - -ocrmypdf.pipeline.repair_pdf + +t1 + + + + +ocrmypdf.pipeline.repair_and_parse_pdf -t0->t1 - - + +t0->t1 + + -t2 - - -ocrmypdf.pipeline.split_pages + +t2 + + +ocrmypdf.pipeline.pre_split_pages -t1->t2 - - - - -t18 - - - - -ocrmypdf.pipeline.generate_postscript_stub - - -t1->t18 - - - - -t21 - - -ocrmypdf.pipeline.merge_pages_qpdf - - -t1->t21 - - - - -t3 - - - - -ocrmypdf.pipeline.rasterize_preview - - -t2->t3 - - - - -t4 - - - - -ocrmypdf.pipeline.orient_page - - -t2->t4 - - - - -t3->t4 - - - - -t5 - - - - -ocrmypdf.pipeline.rasterize_with_ghostscript - - -t4->t5 - - - - -t15 - - - - -ocrmypdf.pipeline.ocr_tesseract_textonly_pdf - - -t4->t15 - - - - -t12 - -ocrmypdf.pipeline.select_image_layer - - -t4->t12 - - - - -t19 - - - - -ocrmypdf.pipeline.skip_page - - -t4->t19 - - - - -t17 - - - - -ocrmypdf.pipeline.ocr_tesseract_and_render_pdf - - -t4->t17 - - - - -t6 - - - - -ocrmypdf.pipeline.preprocess_remove_background - - -t5->t6 - - - - -t11 - -ocrmypdf.pipeline.select_visible_page_image - - -t5->t11 - - - - -t7 - - - - -ocrmypdf.pipeline.preprocess_deskew - - -t6->t7 - - - - -t6->t11 - - - - -t8 - - - - -ocrmypdf.pipeline.preprocess_clean - - -t7->t8 - - - - -t7->t11 - - - - -t9 - - - - -ocrmypdf.pipeline.select_ocr_image - - -t8->t9 - - - - -t8->t11 - - - - -t10 - - - - -ocrmypdf.pipeline.ocr_tesseract_hocr - - -t9->t10 - - - - -t9->t15 - - - - -t13 - - - - -ocrmypdf.pipeline.render_hocr_page - - -t10->t13 - 
- - - -t14 - - - - -ocrmypdf.pipeline.render_hocr_debug_page - - -t10->t14 - - - - -t16 - - - - -ocrmypdf.pipeline.combine_layers - - -t13->t16 - - - - -t15->t16 - - - - -t11->t12 - - - - -t11->t14 - - - - -t11->t17 - - - - -t12->t16 - - + +t1->t2 + + -t20 - - -ocrmypdf.pipeline.merge_pages_ghostscript + +t20 + + + + +ocrmypdf.pipeline.generate_postscript_stub - -t16->t20 - - + + +t1->t20 + + - -t16->t21 - - + + +t24 + + +ocrmypdf.pipeline.merge_pages_mupdf - -t14->t20 - - + + +t1->t24 + + - -t14->t21 - - + + +t23 + + +ocrmypdf.pipeline.merge_pages_qpdf - -t19->t20 - - + + +t1->t23 + + - -t19->t21 - - + + +t3 + + + + +ocrmypdf.pipeline.split_page - -t17->t20 - - + + +t2->t3 + + - -t17->t21 - - + + +t4 + + +ocrmypdf.pipeline.ocr_or_skip - -t18->t20 - - + + +t3->t4 + + + + + +t5 + + + + +ocrmypdf.pipeline.rasterize_preview + + + +t4->t5 + + + + + +t6 + + + + +ocrmypdf.pipeline.orient_page + + + +t4->t6 + + + + + +t5->t6 + + + + + +t7 + + + + +ocrmypdf.pipeline.rasterize_with_ghostscript + + + +t6->t7 + + + + + +t17 + + + + +ocrmypdf.pipeline.ocr_tesseract_textonly_pdf + + + +t6->t17 + + + + + +t14 + +ocrmypdf.pipeline.select_image_layer + + + +t6->t14 + + + + + +t21 + + + + +ocrmypdf.pipeline.skip_page + + + +t6->t21 + + + + + +t19 + + + + +ocrmypdf.pipeline.ocr_tesseract_and_render_pdf + + + +t6->t19 + + + + + +t8 + + + + +ocrmypdf.pipeline.preprocess_remove_background + + + +t7->t8 + + + + + +t13 + +ocrmypdf.pipeline.select_visible_page_image + + + +t7->t13 + + + + + +t9 + + + + +ocrmypdf.pipeline.preprocess_deskew + + + +t8->t9 + + + + + +t8->t13 + + + + + +t10 + + + + +ocrmypdf.pipeline.preprocess_clean + + + +t9->t10 + + + + + +t9->t13 + + + + + +t11 + + + + +ocrmypdf.pipeline.select_ocr_image + + + +t10->t11 + + + + + +t10->t13 + + + + + +t12 + + + + +ocrmypdf.pipeline.ocr_tesseract_hocr + + + +t11->t12 + + + + + +t11->t17 + + + + + +t15 + + + + +ocrmypdf.pipeline.render_hocr_page + + + +t12->t15 + + + + + +t16 + + + + +ocrmypdf.pipeline.render_hocr_debug_page 
+ + + +t12->t16 + + + + + +t25 + + +ocrmypdf.pipeline.merge_sidecars + + + +t12->t25 + + + + + +t18 + + + + +ocrmypdf.pipeline.combine_layers + + + +t15->t18 + + + + + +t17->t18 + + + + + +t17->t25 + + + + + +t13->t14 + + + + + +t13->t16 + + + + + +t13->t19 + + + + + +t14->t18 + + -t22 - - -ocrmypdf.pipeline.copy_final + +t22 + + +ocrmypdf.pipeline.merge_pages_ghostscript - -t20->t22 - - + + +t18->t22 + + + + + +t18->t24 + + + + + +t18->t23 + + + + + +t16->t22 + + + + + +t16->t24 + + + + + +t16->t23 + + -t21->t22 - - + +t21->t22 + + + + + +t21->t24 + + + + + +t21->t23 + + + + + +t19->t22 + + + + + +t19->t24 + + + + + +t19->t23 + + + + + +t19->t25 + + + + + +t20->t22 + + + + + +t26 + + +ocrmypdf.pipeline.copy_final + + + +t22->t26 + + + + + +t24->t26 + + + + + +t23->t26 + + diff --git a/src/ocrmypdf/__main__.py b/src/ocrmypdf/__main__.py index b6a9f02c..2d0a0a0c 100755 --- a/src/ocrmypdf/__main__.py +++ b/src/ocrmypdf/__main__.py @@ -879,6 +879,7 @@ def run_pipeline(): if options.flowchart: _log.info("Flowchart saved to {}".format(options.flowchart)) + return ExitCode.ok elif options.output_file == '-': _log.info("Output sent to stdout") elif os.path.samefile(options.output_file, os.devnull):