mirror of
https://github.com/wizarrrr/wizarr.git
synced 2025-12-23 15:51:07 -05:00
This fixes a critical issue where Gunicorn workers would fail to start after upgrading to v2025.11.0, causing containers to show as unhealthy with only the uv wrapper process running and no actual workers.

Root Cause:
-----------
In v2025.11.0, library scanning and session recovery were added to the create_app() function, which runs during EVERY app creation including:
1. During 'flask db upgrade' (migrations)
2. During Gunicorn master when_ready() hook
3. During each Gunicorn worker spawn

The migration 20251103_properly_fix_foreign_keys recreates 4 database tables with CASCADE foreign keys using raw SQL. This holds exclusive database locks during table recreation. When library scanning and session recovery try to query these tables during migration, they hit database locks, creating a race condition that causes workers to timeout and crash during startup.

Fix:
----
- Skip library scanning during migrations (FLASK_SKIP_SCHEDULER=true)
- Skip activity monitoring/session recovery during migrations
- Make Gunicorn log level configurable (GUNICORN_LOG_LEVEL env var)
- Add worker lifecycle hooks for better crash debugging
- Increase healthcheck start period from 10s to 60s
- Increase Gunicorn worker timeout from 30s to 120s

Testing:
--------
- Verified app starts successfully with FLASK_SKIP_SCHEDULER=true
- Verified library scanning runs normally without the flag
- Confirmed 0.38s startup during migrations vs 1.61s normal startup

Closes #976
147 lines
5.0 KiB
Python
147 lines
5.0 KiB
Python
# Gunicorn configuration with clean logging
|
|
import logging
|
|
import os
|
|
import sys
|
|
|
|
# Add immediate debug to check if config is loaded
print("DEBUG: gunicorn.conf.py loaded!")


def _env_int(name, default):
    """Return the integer value of environment variable *name*.

    Falls back to *default* when the variable is unset, empty, or blank.
    A value that is not a valid integer is also replaced by *default*, with
    a warning written to stderr, so that a malformed setting does not abort
    loading of this config file with a bare ``ValueError``.
    """
    raw = os.getenv(name)
    if raw is None or not raw.strip():
        return default
    try:
        return int(raw)
    except ValueError:
        print(
            f"WARNING: Ignoring invalid {name}={raw!r}; using default {default}",
            file=sys.stderr,
        )
        return default


# Make log level configurable for debugging production issues
# Set GUNICORN_LOG_LEVEL=info or debug for verbose output
# (an unset or empty variable falls back to "warning")
loglevel = (os.getenv("GUNICORN_LOG_LEVEL", "").strip() or "warning").lower()
accesslog = None  # Disable access logs for clean output
errorlog = "-"  # Only errors to stderr

# Make workers configurable (default 4, but allow override for resource-constrained systems)
workers = _env_int("GUNICORN_WORKERS", 4)
worker_class = "sync"

# Worker timeout - kill workers that don't respond within this time
# Increase from default 30s to 120s to account for slow library scans
timeout = _env_int("GUNICORN_TIMEOUT", 120)

print(
    f"DEBUG: Gunicorn config - workers={workers}, loglevel={loglevel}, timeout={timeout}s"
)
|
|
|
|
|
|
def on_starting(server):  # noqa: ARG001
    """Gunicorn hook invoked just before the master process is initialized.

    Only writes a debug marker to stdout; ``server`` is not used.
    """
    marker = "DEBUG: on_starting() hook called!"
    print(marker)
|
|
|
|
|
|
def when_ready(server):  # noqa: ARG001
    """Gunicorn hook invoked after the server is started.

    Sets ``SERVER_SOFTWARE`` to ``"gunicorn"``. Unless ``WIZARR_MIGRATIONS_DONE``
    is already set, it then sets that flag, runs the master-only migrations
    inside the Flask app context and reports the Flask-APScheduler state.

    Any exception raised along the way is printed together with its traceback
    and swallowed, so this hook itself does not raise. ``server`` is not used.
    """
    try:
        print("DEBUG: when_ready() hook called!")

        # Tag the process environment as running under Gunicorn.
        os.environ["SERVER_SOFTWARE"] = "gunicorn"

        # Guard clause: skip everything below once the flag has been set.
        if os.getenv("WIZARR_MIGRATIONS_DONE"):
            print("DEBUG: Migrations already done, skipping")
            return

        print("DEBUG: Setting WIZARR_MIGRATIONS_DONE flag")
        os.environ["WIZARR_MIGRATIONS_DONE"] = "1"

        # Deferred imports: importing these at module level would be circular.
        from app.logging_helpers import AppLogger
        from app.scripts.migrate_libraries import (
            run_library_migration,
            update_server_verified,
        )
        from app.scripts.migrate_media_server import migrate_single_to_multi

        master_log = AppLogger("wizarr.master")

        # Reuse the Flask app object that run.py has already created.
        import run

        flask_app = run.app

        # (log label, log detail, migration callable), executed in this order.
        migration_steps = (
            ("server verification", "verifying media servers", update_server_verified),
            ("library migration", "updating library structure", run_library_migration),
            ("media server migration", "single to multi-server", migrate_single_to_multi),
        )

        with flask_app.app_context():
            for label, detail, migrate in migration_steps:
                master_log.database_migration(label, detail)
                migrate(flask_app)

            # Report the Flask-APScheduler state.
            from app.extensions import scheduler

            print("DEBUG: Checking Flask-APScheduler status...")

            scheduler_present = (
                scheduler and hasattr(scheduler, "scheduler") and scheduler.scheduler
            )
            if not scheduler_present:
                print("DEBUG: Flask-APScheduler not available")
                master_log.info("Scheduler disabled or not initialized")
            elif not scheduler.running:
                print("DEBUG: Flask-APScheduler exists but not running")
                master_log.warning("Scheduler initialized but not running")
            else:
                print("DEBUG: Flask-APScheduler is running")
                scheduler_flag = os.getenv("WIZARR_ENABLE_SCHEDULER", "false").lower()
                master_log.scheduler_status(
                    enabled=True,
                    dev_mode=scheduler_flag in ("true", "1", "yes"),
                )

            # Finish the startup sequence.
            master_log.complete()

    except Exception as exc:
        print(f"DEBUG: Error in when_ready(): {exc}")
        import traceback

        traceback.print_exc()
|
|
|
|
|
|
def post_worker_init(worker):
    """Gunicorn hook for per-worker setup; runs once per worker after spawn.

    Stores the worker's PID in ``GUNICORN_WORKER_PID`` and sets the
    ``werkzeug`` and ``app`` loggers to ERROR. If anything fails, the error
    and its traceback are printed and the exception is re-raised.
    """
    try:
        # Expose this worker's PID through the environment.
        os.environ["GUNICORN_WORKER_PID"] = str(worker.pid)
        print(f"DEBUG: Worker {worker.pid} initialized successfully")

        # Quieten Flask app-creation logging inside workers.
        for logger_name in ("werkzeug", "app"):
            logging.getLogger(logger_name).setLevel(logging.ERROR)
    except Exception as exc:
        print(
            f"ERROR: Worker {worker.pid} failed to initialize: {exc}",
            file=sys.stderr,
        )
        import traceback

        traceback.print_exc()
        raise
|
|
|
|
|
|
def worker_exit(server, worker):  # noqa: ARG001
    """Gunicorn hook: log that a worker exited (crash or graceful shutdown).

    Args:
        server: The Gunicorn arbiter (unused).
        worker: The exiting worker; its ``pid`` and ``age`` are logged.
    """
    # NOTE: Gunicorn's ``worker.age`` is the arbiter's spawn counter (it is
    # incremented for every worker spawned), not a lifetime in seconds, so it
    # is reported as a spawn number instead of with a misleading "s" suffix.
    print(f"WARNING: Worker {worker.pid} exited (spawn #{worker.age})")
|
|
|
|
|
|
def worker_abort(worker):
    """Gunicorn hook invoked when a worker receives SIGABRT (usually a timeout).

    Writes an error line naming the worker's PID to stderr, followed by the
    current call stack.
    """
    import traceback

    message = f"ERROR: Worker {worker.pid} aborted (timeout or critical error)"
    print(message, file=sys.stderr)
    traceback.print_stack()
|
|
|
|
|
|
def on_exit(server):  # noqa: ARG001
    """Gunicorn hook invoked just before the master process exits.

    Only writes a shutdown notice to stdout; ``server`` is not used.
    """
    notice = "INFO: Gunicorn master process shutting down"
    print(notice)
|