mirror of
https://github.com/ocrmypdf/OCRmyPDF.git
synced 2026-02-16 01:02:26 -05:00
Watched folder bug fixes, new flags, and docs updates.
This commit is contained in:
@@ -210,6 +210,9 @@ be launched as follows:
|
||||
-v <path to files to convert>:/input \
|
||||
-v <path to store results>:/output \
|
||||
-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1 \
|
||||
-e OCR_ON_SUCCESS_DELETE=1 \
|
||||
-e OCR_DESKEW=1 \
|
||||
-e PYTHONUNBUFFERED=1 \
|
||||
-it --entrypoint python3 \
|
||||
jbarlow83/ocrmypdf \
|
||||
watcher.py
|
||||
@@ -224,6 +227,9 @@ convert it to a OCRed PDF in ``/output/``. The parameters to this image are:
|
||||
"``-v <path to files to convert>:/input``", "Files placed in this location will be OCRed"
|
||||
"``-v <path to store results>:/output``", "This is where OCRed files will be stored"
|
||||
"``-e OCR_OUTPUT_DIRECTORY_YEAR_MONTH=1``", "This will place files in the output in {output}/{year}/{month}/{filename}"
|
||||
"``-e OCR_ON_SUCCESS_DELETE=1``", "This will delete the input file if the exit code is 0 (OK)"
|
||||
"``-e OCR_DESKEW=1``", "This will enable deskew for crooked PDFs"
|
||||
"``-e PYTHONBUFFERED=1``", "This will force STDOUT to be unbuffered and allow you to see messages in docker logs"
|
||||
|
||||
This service relies on polling to check for changes to the filesystem. It
|
||||
may not be suitable for some environments, such as filesystems shared on a
|
||||
|
||||
@@ -25,12 +25,15 @@ import ocrmypdf
|
||||
|
||||
# Values (compared case-insensitively, after stripping whitespace) that switch
# a boolean environment flag off.
_FALSY_ENV_VALUES = frozenset({'', '0', 'false', 'f', 'no', 'n', 'off'})


def getenv_bool(name, default=False):
    """Read environment variable *name* as a boolean flag.

    ``bool(os.getenv(name, False))`` is true for *any* non-empty string, so
    ``OCR_DESKEW=0`` or ``OCR_ON_SUCCESS_DELETE=false`` would silently enable
    the feature. Here an explicit "off" spelling (``0``, ``false``, ``f``,
    ``no``, ``n``, ``off`` or the empty string) yields ``False``; every other
    value yields ``True``, so the documented ``=1`` usage keeps working.

    Args:
        name: Name of the environment variable to read.
        default: Value returned when the variable is not set at all.

    Returns:
        The flag as a ``bool`` (or *default* when the variable is unset).
    """
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() not in _FALSY_ENV_VALUES


# Directory that is watched for incoming PDFs.
INPUT_DIRECTORY = os.getenv('OCR_INPUT_DIRECTORY', '/input')
# Directory that receives the OCRed results.
OUTPUT_DIRECTORY = os.getenv('OCR_OUTPUT_DIRECTORY', '/output')
# Delete the input file once OCR finished with exit code 0.
ON_SUCCESS_DELETE = getenv_bool('OCR_ON_SUCCESS_DELETE')
# Passed to ocrmypdf.ocr() as its ``deskew`` argument.
DESKEW = getenv_bool('OCR_DESKEW')
# Store results under {output}/{year}/{month}/{filename}.
OUTPUT_DIRECTORY_YEAR_MONTH = getenv_bool('OCR_OUTPUT_DIRECTORY_YEAR_MONTH')
# Glob patterns of the files the watcher reacts to.
PATTERNS = ['*.pdf']
|
||||
|
||||
|
||||
def execute_ocrmypdf(file_path):
|
||||
filename = Path(file_path).name
|
||||
new_file = Path(file_path)
|
||||
filename = new_file.name
|
||||
if OUTPUT_DIRECTORY_YEAR_MONTH:
|
||||
today = datetime.today()
|
||||
output_directory_year_month = Path(
|
||||
@@ -41,13 +44,29 @@ def execute_ocrmypdf(file_path):
|
||||
output_path = Path(output_directory_year_month) / filename
|
||||
else:
|
||||
output_path = Path(OUTPUT_DIRECTORY) / filename
|
||||
print(f'New file: {file_path}.\nAttempting to OCRmyPDF to: {output_path}')
|
||||
ocrmypdf.ocr(file_path, output_path)
|
||||
print(f'New file: {file_path}. Waiting until fully loaded...')
|
||||
# This loop waits to make sure that the file is completely loaded on
|
||||
# disk before attempting to read. Docker sometimes will publish the
|
||||
# watchdog event before the file is actually fully on disk, causing
|
||||
# pikepdf to fail.
|
||||
current_size = None
|
||||
while current_size != new_file.stat().st_size:
|
||||
current_size = new_file.stat().st_size
|
||||
time.sleep(1)
|
||||
print(f'Attempting to OCRmyPDF to: {output_path}')
|
||||
exit_code = ocrmypdf.ocr(
|
||||
input_file=file_path, output_file=output_path, deskew=DESKEW
|
||||
)
|
||||
if exit_code == 0 and ON_SUCCESS_DELETE:
|
||||
print(f'Done. Deleting: {file_path}')
|
||||
new_file.unlink()
|
||||
else:
|
||||
print('Done')
|
||||
|
||||
|
||||
class HandleObserverEvent(PatternMatchingEventHandler):
|
||||
def on_any_event(self, event):
|
||||
if event.event_type in ['created', 'modified']:
|
||||
if event.event_type in ['created']:
|
||||
execute_ocrmypdf(event.src_path)
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user