Watched folder bug fixes, new flags, and docs updates.

This commit is contained in:
Ian Alexander
2020-01-19 19:11:54 -08:00
parent a6567f2ae4
commit b7f38e976b
2 changed files with 29 additions and 4 deletions

View File

@@ -210,6 +210,9 @@ be launched as follows:
-v <path to files to convert>:/input \
-v <path to store results>:/output \
-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
-e OCR_ON_SUCCESS_DELETE=1 \
-e OCR_DESKEW=1 \
-e PYTHONUNBUFFERED=1 \
-it --entrypoint python3 \
jbarlow83/ocrmypdf \
watcher.py
@@ -224,6 +227,9 @@ convert it to an OCRed PDF in ``/output/``. The parameters to this image are:
"``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
"``-v <path to store results>:/output``", "This is where OCRed files will be stored"
"``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}"
"``-e OCR_ON_SUCCESS_DELETE=1``", "This will delete the input file if the exit code is 0 (OK)"
"``-e OCR_DESKEW=1``", "This will enable deskew for crooked PDFs"
"``-e PYTHONUNBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs"
This service relies on polling to check for changes to the filesystem. It
may not be suitable for some environments, such as filesystems shared on a

View File

@@ -25,12 +25,15 @@ import ocrmypdf
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
ON_SUCCESS_DELETE = bool(os.getenv('OCR_ON_SUCCESS_DELETE', False))
DESKEW = bool(os.getenv('OCR_DESKEW', False))
OUTPUT_DIRECTORY_YEAR_MONTH = bool(os.getenv('OCR_OUTPUT_DIRECTORY_YEAR_MONTH', False))
PATTERNS = ['*.pdf']
def execute_ocrmypdf(file_path):
filename = Path(file_path).name
new_file = Path(file_path)
filename = new_file.name
if OUTPUT_DIRECTORY_YEAR_MONTH:
today = datetime.today()
output_directory_year_month = Path(
@@ -41,13 +44,29 @@ def execute_ocrmypdf(file_path):
output_path = Path(output_directory_year_month) / filename
else:
output_path = Path(OUTPUT_DIRECTORY) / filename
print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
ocrmypdf.ocr(file_path, output_path)
print(f'New file: {file_path}. Waiting until fully loaded...')
# This loop waits to make sure that the file is completely loaded on
# disk before attempting to read. Docker sometimes will publish the
# watchdog event before the file is actually fully on disk, causing
# pikepdf to fail.
current_size = None
while current_size != new_file.stat().st_size:
current_size = new_file.stat().st_size
time.sleep(1)
print(f'Attempting to OCRmyPDF to: {output_path}')
exit_code = ocrmypdf.ocr(
input_file=file_path, output_file=output_path, deskew=DESKEW
)
if exit_code == 0 and ON_SUCCESS_DELETE:
print(f'Done. Deleting: {file_path}')
new_file.unlink()
else:
print('Done')
class HandleObserverEvent(PatternMatchingEventHandler):
def on_any_event(self, event):
if event.event_type in ['created', 'modified']:
if event.event_type in ['created']:
execute_ocrmypdf(event.src_path)