Watched folder bug fixes, new flags, and docs updates.

2026-02-16 01:02:26 -05:00 · 2020-01-19 19:11:54 -08:00
parent a6567f2ae4
commit b7f38e976b
2 changed files with 29 additions and 4 deletions
--- a/docs/batch.rst
+++ b/docs/batch.rst
@@ -210,6 +210,9 @@ be launched as follows:
        -v <path to files to convert>:/input \
        -v <path to store results>:/output \
        -e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
+        -e OCR_ON_SUCCESS_DELETE=1 \
+        -e OCR_DESKEW=1 \
+        -e PYTHONUNBUFFERED=1 \
        -it --entrypoint python3 \
        jbarlow83/ocrmypdf \
        watcher.py
@@ -224,6 +227,9 @@ convert it to a OCRed PDF in ``/output/``. The parameters to this image are:
    "``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
    "``-v <path to store results>:/output``", "This is where OCRed files will be stored"
    "``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}"
+    "``-e OCR_ON_SUCCESS_DELETE=1``", "This will delete the input file if the exit code is 0 (OK)"
+    "``-e OCR_DESKEW=1``", "This will enable deskew for crooked PDFs"
+    "``-e PYTHONUNBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs"

 This service relies on polling to check for changes to the filesystem. It
 may not be suitable for some environments, such as filesystems shared on a
--- a/misc/watcher.py
+++ b/misc/watcher.py
@@ -25,12 +25,15 @@ import ocrmypdf

 INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
 OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
+ON_SUCCESS_DELETE = bool(os.getenv('OCR_ON_SUCCESS_DELETE', False))
+DESKEW = bool(os.getenv('OCR_DESKEW', False))
 OUTPUT_DIRECTORY_YEAR_MONTH = bool(os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', False))
 PATTERNS = ['*.pdf']


 def execute_ocrmypdf(file_path):
-    filename = Path(file_path).name
+    new_file = Path(file_path)
+    filename = new_file.name
    if OUTPUT_DIRECTORY_YEAR_MONTH:
        today = datetime.today()
        output_directory_year_month = Path(
@@ -41,13 +44,29 @@ def execute_ocrmypdf(file_path):
        output_path = Path(output_directory_year_month) / filename
    else:
        output_path = Path(OUTPUT_DIRECTORY) / filename
-    print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
-    ocrmypdf.ocr(file_path, output_path)
+    print(f'New file: {file_path}. Waiting until fully loaded...')
+    # This loop waits to make sure that the file is completely loaded on
+    # disk before attempting to read. Docker sometimes will publish the
+    # watchdog event before the file is actually fully on disk, causing
+    # pikepdf to fail.
+    current_size = None
+    while current_size != new_file.stat().st_size:
+        current_size = new_file.stat().st_size
+        time.sleep(1)
+    print(f'Attempting to OCRmyPDF to: {output_path}')
+    exit_code = ocrmypdf.ocr(
+        input_file=file_path, output_file=output_path, deskew=DESKEW
+    )
+    if exit_code == 0 and ON_SUCCESS_DELETE:
+        print(f'Done. Deleting: {file_path}')
+        new_file.unlink()
+    else:
+        print('Done')


 class HandleObserverEvent(PatternMatchingEventHandler):
    def on_any_event(self, event):
-        if event.event_type in ['created', 'modified']:
+        if event.event_type in ['created']:
            execute_ocrmypdf(event.src_path)