fix(processing): exclude UnidentifiedImageError from autoretry

Pillow's UnidentifiedImageError inherits from OSError, so it was
getting caught by normalize_image_asset's autoretry_for=(OSError,)
filter — a corrupt-image upload would retry up to 3 times with
exponential backoff before metadata.error_message landed.

Add it to dont_autoretry_for alongside FileNotFoundError so the
permanent-failure contract surfaces immediately. Test extended to
assert both exclusions are in place at celery-config time.
This commit is contained in:
Viktor Petersson
2026-05-07 09:37:27 +00:00
parent 58fe62d0c7
commit eb785f768d
2 changed files with 45 additions and 22 deletions

View File

@@ -9,6 +9,7 @@ from typing import Any
import django
import sh
from celery import Celery, Task
from PIL import UnidentifiedImageError
from tenacity import Retrying, stop_after_attempt, wait_fixed
django.setup()
@@ -785,32 +786,38 @@ NORMALIZE_VIDEO_TIME_LIMIT_S = processing.NORMALIZE_VIDEO_TIME_LIMIT_S
base=processing._NormalizeAssetTask,
time_limit=300,
autoretry_for=(OSError,),
# ``FileNotFoundError`` is-a ``OSError`` so the autoretry_for
# filter would catch it — but a missing source file isn't a
# transient condition; retrying just keeps the row in
# ``is_processing=True`` longer before the inevitable
# ``on_failure`` lands. Excluding it here lets the on_failure
# path write ``metadata.error_message`` immediately.
dont_autoretry_for=(FileNotFoundError,),
# Two OSError subclasses are permanent and must NOT trigger a
# retry — they'd just keep the row in ``is_processing=True``
# longer before the inevitable on_failure lands:
# * ``FileNotFoundError`` — source file gone between row
# creation and pickup (cleanup raced operator, disk pressure).
# * ``UnidentifiedImageError`` — Pillow decoded the header far
# enough to refuse the file (corrupt HEIC, truncated TIFF,
# mis-typed extension). It inherits from OSError so the
# autoretry_for filter would otherwise sweep it up; listing
# it explicitly here makes the failure surface immediately.
dont_autoretry_for=(FileNotFoundError, UnidentifiedImageError),
retry_backoff=10,
retry_backoff_max=300,
retry_jitter=True,
max_retries=2,
)
def normalize_image_asset(asset_id: str) -> None:
"""Convert HEIC / HEIF / TIFF uploads to lossless WebP.
"""Convert every extension in
``processing.NORMALIZE_IMAGE_EXTS`` (HEIC / HEIF / TIFF / BMP /
ICO / TGA / JPEG 2000 family / AVIF) to lossless WebP.
No-ops if the row is missing or has already been finalised
(duplicate task fire / operator-edited row). Permanent decode
failures (corrupt HEIC) raise so ``_NormalizeAssetTask.on_failure``
writes ``metadata.error_message`` and clears ``is_processing``.
Retries on OSError covers transient disk pressure / a temporary
libheif read hiccup; Pillow's UnidentifiedImageError is permanent
and lands directly on on_failure. ``FileNotFoundError`` (source
file gone between row creation and pickup) is excluded from
autoretry — see the ``dont_autoretry_for`` rationale on the
decorator.
Retries on OSError cover transient disk pressure / a temporary
libheif read hiccup. Two OSError subclasses are excluded from
autoretry via ``dont_autoretry_for`` because they're permanent:
``FileNotFoundError`` (source file gone) and Pillow's
``UnidentifiedImageError`` (corrupt input). See the decorator's
inline comment for the rationale.
"""
asset = processing._row_or_none(asset_id)
if asset is None:

View File

@@ -1412,15 +1412,21 @@ def test_normalize_image_asset_celery_no_op_when_row_missing() -> None:
run.assert_not_called()
def test_normalize_tasks_exclude_filenotfounderror_from_autoretry() -> None:
def test_normalize_tasks_exclude_permanent_oserrors_from_autoretry() -> None:
"""Both normalisation tasks have ``autoretry_for=(OSError,)`` so a
transient disk hiccup is retried automatically. ``FileNotFoundError``
is-a ``OSError`` so the autoretry filter would otherwise catch it
too — but a missing source file is permanent, and retrying just
delays the on_failure that writes ``metadata.error_message``.
Confirm both tasks were registered with
``dont_autoretry_for=(FileNotFoundError,)`` so the exclusion is
in effect at celery-config time.
transient disk hiccup is retried automatically. Several OSError
subclasses are permanent and must be excluded so they land on
on_failure immediately:
* ``FileNotFoundError`` — source file disappeared between row
creation and pickup.
* ``UnidentifiedImageError`` (image task only) — Pillow refused
to decode the file. Inherits from OSError, so without the
explicit exclusion the autoretry filter would sweep it up.
Confirms the exclusions are in effect at celery-config time so a
future change to the decorators can't silently regress the
immediate-fail contract.
"""
from anthias_server.celery_tasks import (
normalize_image_asset,
@@ -1444,6 +1450,16 @@ def test_normalize_tasks_exclude_filenotfounderror_from_autoretry() -> None:
f'FileNotFoundError so missing-source raises immediately'
)
# Image task additionally excludes UnidentifiedImageError (Pillow
# only — video has no Pillow path).
assert UnidentifiedImageError in tuple(
getattr(normalize_image_asset, 'dont_autoretry_for', ())
), (
'normalize_image_asset expected dont_autoretry_for to include '
'UnidentifiedImageError so corrupt-image raises immediately '
'(it inherits from OSError)'
)
@pytest.mark.django_db
def test_normalize_on_failure_writes_error_metadata(