diff --git a/src/anthias_server/celery_tasks.py b/src/anthias_server/celery_tasks.py index 0daa3be6..c7f5712c 100755 --- a/src/anthias_server/celery_tasks.py +++ b/src/anthias_server/celery_tasks.py @@ -9,6 +9,7 @@ from typing import Any import django import sh from celery import Celery, Task +from PIL import UnidentifiedImageError from tenacity import Retrying, stop_after_attempt, wait_fixed django.setup() @@ -785,32 +786,38 @@ NORMALIZE_VIDEO_TIME_LIMIT_S = processing.NORMALIZE_VIDEO_TIME_LIMIT_S base=processing._NormalizeAssetTask, time_limit=300, autoretry_for=(OSError,), - # ``FileNotFoundError`` is-a ``OSError`` so the autoretry_for - # filter would catch it — but a missing source file isn't a - # transient condition; retrying just keeps the row in - # ``is_processing=True`` longer before the inevitable - # ``on_failure`` lands. Excluding it here lets the on_failure - # path write ``metadata.error_message`` immediately. - dont_autoretry_for=(FileNotFoundError,), + # Two OSError subclasses are permanent and must NOT trigger a + # retry — they'd just keep the row in ``is_processing=True`` + # longer before the inevitable on_failure lands: + # * ``FileNotFoundError`` — source file gone between row + # creation and pickup (cleanup raced operator, disk pressure). + # * ``UnidentifiedImageError`` — Pillow decoded the header far + # enough to refuse the file (corrupt HEIC, truncated TIFF, + # mis-typed extension). It inherits from OSError so the + # autoretry_for filter would otherwise sweep it up; listing + # it explicitly here makes the failure surface immediately. + dont_autoretry_for=(FileNotFoundError, UnidentifiedImageError), retry_backoff=10, retry_backoff_max=300, retry_jitter=True, max_retries=2, ) def normalize_image_asset(asset_id: str) -> None: - """Convert HEIC / HEIF / TIFF uploads to lossless WebP. + """Convert every extension in + ``processing.NORMALIZE_IMAGE_EXTS`` (HEIC / HEIF / TIFF / BMP / + ICO / TGA / JPEG 2000 family / AVIF) to lossless WebP. No-ops if the row is missing or has already been finalised (duplicate task fire / operator-edited row). Permanent decode failures (corrupt HEIC) raise so ``_NormalizeAssetTask.on_failure`` writes ``metadata.error_message`` and clears ``is_processing``. - Retries on OSError covers transient disk pressure / a temporary - libheif read hiccup; Pillow's UnidentifiedImageError is permanent - and lands directly on on_failure. ``FileNotFoundError`` (source - file gone between row creation and pickup) is excluded from - autoretry — see the ``dont_autoretry_for`` rationale on the - decorator. + Retries on OSError cover transient disk pressure / a temporary + libheif read hiccup. Two OSError subclasses are excluded from + autoretry via ``dont_autoretry_for`` because they're permanent: + ``FileNotFoundError`` (source file gone) and Pillow's + ``UnidentifiedImageError`` (corrupt input). See the decorator's + inline comment for the rationale. """ asset = processing._row_or_none(asset_id) if asset is None: diff --git a/tests/test_processing.py b/tests/test_processing.py index 283e5306..312a3988 100644 --- a/tests/test_processing.py +++ b/tests/test_processing.py @@ -1412,15 +1412,21 @@ def test_normalize_image_asset_celery_no_op_when_row_missing() -> None: run.assert_not_called() -def test_normalize_tasks_exclude_filenotfounderror_from_autoretry() -> None: +def test_normalize_tasks_exclude_permanent_oserrors_from_autoretry() -> None: """Both normalisation tasks have ``autoretry_for=(OSError,)`` so a - transient disk hiccup is retried automatically. ``FileNotFoundError`` - is-a ``OSError`` so the autoretry filter would otherwise catch it - too — but a missing source file is permanent, and retrying just - delays the on_failure that writes ``metadata.error_message``. - Confirm both tasks were registered with - ``dont_autoretry_for=(FileNotFoundError,)`` so the exclusion is - in effect at celery-config time. + transient disk hiccup is retried automatically. Several OSError + subclasses are permanent and must be excluded so they land on + on_failure immediately: + + * ``FileNotFoundError`` — source file disappeared between row + creation and pickup. + * ``UnidentifiedImageError`` (image task only) — Pillow refused + to decode the file. Inherits from OSError, so without the + explicit exclusion the autoretry filter would sweep it up. + + Confirms the exclusions are in effect at celery-config time so a + future change to the decorators can't silently regress the + immediate-fail contract. """ from anthias_server.celery_tasks import ( normalize_image_asset, @@ -1444,6 +1450,16 @@ def test_normalize_tasks_exclude_filenotfounderror_from_autoretry() -> None: f'FileNotFoundError so missing-source raises immediately' ) + # Image task additionally excludes UnidentifiedImageError (Pillow + # only — video has no Pillow path). + assert UnidentifiedImageError in tuple( + getattr(normalize_image_asset, 'dont_autoretry_for', ()) + ), ( + 'normalize_image_asset expected dont_autoretry_for to include ' + 'UnidentifiedImageError so corrupt-image raises immediately ' + '(it inherits from OSError)' + ) + @pytest.mark.django_db def test_normalize_on_failure_writes_error_metadata(