feat: add health check endpoint and related schemas with tests

2026-02-20 08:08:05 -05:00 · 2026-02-17 23:01:49 +00:00
parent 9ac8f6fe34
commit 264cae3338
6 changed files with 451 additions and 1 deletions
--- a/.github/skills/code-standards/SKILL.md
+++ b/.github/skills/code-standards/SKILL.md
@@ -5,6 +5,14 @@ description: NetAlertX coding standards and conventions. Use this when writing c

 # Code Standards

+- Ask me to review before moving on to each next step (state progress as "step n of x").
+- Before starting, prepare an implementation plan.
+- Ask me to review the plan, and ask any clarifying questions, before implementing.
+- Add test creation as the last step; follow the repo's architecture patterns and do not place tests in the root of /test.
+- Code has to be maintainable, with no duplicate code.
+- Follow the DRY principle.
+- Code files should be under 500 lines of code (LOC) for better maintainability.
+
 ## File Length

 Keep code files under 500 lines. Split larger files into modules.
--- a/server/api_server/api_server_start.py
+++ b/server/api_server/api_server_start.py
@@ -41,6 +41,7 @@ from .nettools_endpoint import (  # noqa: E402 [flake8 lint suppression]
 from .dbquery_endpoint import read_query, write_query, update_query, delete_query  # noqa: E402 [flake8 lint suppression]
 from .sync_endpoint import handle_sync_post, handle_sync_get  # noqa: E402 [flake8 lint suppression]
 from .logs_endpoint import clean_log  # noqa: E402 [flake8 lint suppression]
+from .health_endpoint import get_health_status  # noqa: E402 [flake8 lint suppression]
 from models.user_events_queue_instance import UserEventsQueueInstance  # noqa: E402 [flake8 lint suppression]

 from models.event_instance import EventInstance  # noqa: E402 [flake8 lint suppression]
@@ -86,6 +87,7 @@ from .openapi.schemas import (  # noqa: E402 [flake8 lint suppression]
    RecentEventsResponse, LastEventsResponse,
    NetworkTopologyResponse,
    InternetInfoResponse, NetworkInterfacesResponse,
+    HealthCheckResponse,
    CreateEventRequest, CreateSessionRequest,
    DeleteSessionRequest, CreateNotificationRequest,
    SyncPushRequest, SyncPullResponse,
@@ -1930,6 +1932,33 @@ def check_auth(payload=None):
    if request.method == "GET":
        return jsonify({"success": True, "message": "Authentication check successful"}), 200

+
+# --------------------------
+# Health endpoint
+# --------------------------
+@app.route("/health", methods=["GET"])
+@validate_request(
+    operation_id="check_health",
+    summary="System Health Check",
+    description="Retrieve system vitality metrics including database size, memory pressure, system load, disk usage, and CPU temperature.",
+    response_model=HealthCheckResponse,
+    tags=["system", "health"],
+    auth_callable=is_authorized
+)
+def check_health(payload=None):
+    """Return the aggregated health metrics as a JSON response.
+
+    On success the reply is HTTP 200 with ``success: True`` merged with the
+    dict produced by ``get_health_status()``. Any exception is logged and
+    reported as HTTP 500 with ``success: False``, a fixed ``error`` text and
+    the exception text in ``message``. ``payload`` is accepted but not used.
+    """
+    try:
+        body = {"success": True}
+        # Metric keys are merged after "success", exactly like a ** spread.
+        body.update(get_health_status())
+        return jsonify(body), 200
+    except Exception as exc:
+        mylog("none", [f"[health] Error retrieving health status: {exc}"])
+        failure = {
+            "success": False,
+            "error": "Failed to retrieve health status",
+            "message": str(exc),
+        }
+        return jsonify(failure), 500
+
+
 # --------------------------
 # Background Server Start
 # --------------------------
--- a/server/api_server/health_endpoint.py
+++ b/server/api_server/health_endpoint.py
@@ -0,0 +1,147 @@
+"""Health check endpoint for NetAlertX system vitality monitoring."""
+
+import os
+import psutil
+from pathlib import Path
+
+from const import dbPath, dataPath
+from logger import mylog
+
+
+# ===============================================================================
+# Database Vitality
+# ===============================================================================
+
+def get_db_size_mb():
+    """
+    Calculate total database size in MB (database file plus its "-wal" file).
+
+    Each file is stat'ed directly instead of being probed with exists() first.
+    With the check-then-stat approach, a file that disappears between the two
+    calls raises inside the outer handler and the whole measurement collapses
+    to 0. Here a missing file simply contributes 0 bytes.
+
+    Returns:
+        float: Combined size in MB rounded to 2 decimals; 0.0 if neither file
+        exists or an unexpected error occurs.
+    """
+    try:
+        size_bytes = 0
+        for candidate in (Path(dbPath), Path(f"{dbPath}-wal")):
+            try:
+                size_bytes += candidate.stat().st_size
+            except (FileNotFoundError, NotADirectoryError):
+                # Absent file: counts as 0 bytes, the other file still counts.
+                continue
+
+        return round(size_bytes / (1024 * 1024), 2)
+    except Exception as e:
+        mylog("verbose", [f"[health] Error calculating DB size: {e}"])
+        return 0.0
+
+
+# ===============================================================================
+# Memory Pressure
+# ===============================================================================
+
+def get_mem_usage_pct():
+    """
+    Report memory pressure as the share of total memory currently in use.
+
+    Returns:
+        int: used / total as a whole-number percentage clamped to 0-100,
+        or -1 on error.
+    """
+    try:
+        mem = psutil.virtual_memory()
+        ratio = mem.used / mem.total
+        whole_pct = int(ratio * 100)
+        # Clamp to the 0-100 range
+        if whole_pct < 0:
+            return 0
+        if whole_pct > 100:
+            return 100
+        return whole_pct
+    except Exception as err:
+        mylog("verbose", [f"[health] Error calculating memory usage: {err}"])
+        return -1
+
+
+# ===============================================================================
+# System Stress
+# ===============================================================================
+
+def get_load_avg_1m():
+    """
+    Read the 1-minute system load average.
+
+    Returns:
+        float: 1-minute load average rounded to 2 decimals, or -1.0 on error.
+    """
+    try:
+        # os.getloadavg() yields the (1, 5, 15)-minute averages; only the first is used.
+        one_minute, _five_minute, _fifteen_minute = os.getloadavg()
+        rounded = round(one_minute, 2)
+        return rounded
+    except Exception as err:
+        mylog("verbose", [f"[health] Error getting load average: {err}"])
+        return -1.0
+
+
+# ===============================================================================
+# Disk Headroom
+# ===============================================================================
+
+def get_storage_pct():
+    """
+    Report how full the filesystem holding ``dataPath`` is.
+
+    Usage is computed from statvfs as (total blocks - free blocks) / total blocks.
+
+    Returns:
+        int: Disk usage as a whole-number percentage clamped to 0-100
+        (0 when the filesystem reports no blocks), or -1 on error.
+    """
+    try:
+        fs = os.statvfs(dataPath)
+        total_bytes = fs.f_blocks * fs.f_frsize
+        used_bytes = (fs.f_blocks - fs.f_bfree) * fs.f_frsize
+        if total_bytes > 0:
+            whole_pct = int((used_bytes / total_bytes) * 100)
+        else:
+            whole_pct = 0
+        return min(100, max(0, whole_pct))  # Clamp to 0-100
+    except Exception as err:
+        mylog("verbose", [f"[health] Error calculating storage usage: {err}"])
+        return -1
+
+
+# ===============================================================================
+# Thermal Health
+# ===============================================================================
+
+def get_cpu_temp():
+    """
+    Get CPU temperature from hardware sensors if available.
+
+    The first reading of the 'coretemp' sensor group is preferred; otherwise
+    the first reading of the first group that has any data is used. The value
+    is truncated to a whole number with int().
+
+    Returns:
+        int: CPU temperature in Celsius, or None if unavailable.
+    """
+    try:
+        # psutil does not provide sensors_temperatures() on every platform;
+        # treat its absence as "unavailable" instead of logging an error on
+        # every health check.
+        read_sensors = getattr(psutil, "sensors_temperatures", None)
+        if read_sensors is None:
+            return None
+
+        temps = read_sensors()
+        if not temps:
+            return None
+
+        # Prefer 'coretemp' (Intel), fallback to first available
+        preferred = temps.get("coretemp")
+        if preferred:
+            return int(preferred[0].current)
+
+        # Fallback to first sensor group with data (group names are not needed)
+        for readings in temps.values():
+            if readings:
+                return int(readings[0].current)
+
+        return None
+    except Exception as e:
+        mylog("verbose", [f"[health] Error reading CPU temperature: {e}"])
+        return None
+
+
+# ===============================================================================
+# Aggregator
+# ===============================================================================
+
+def get_health_status():
+    """
+    Run every metric collector and gather the results under their response keys.
+
+    Returns:
+        dict: Keys "db_size_mb", "mem_usage_pct", "load_1m", "storage_pct"
+        and "cpu_temp", each mapped to the value returned by its collector.
+    """
+    # (response key, collector) pairs, evaluated in this order.
+    collectors = (
+        ("db_size_mb", get_db_size_mb),
+        ("mem_usage_pct", get_mem_usage_pct),
+        ("load_1m", get_load_avg_1m),
+        ("storage_pct", get_storage_pct),
+        ("cpu_temp", get_cpu_temp),
+    )
+    return {key: collect() for key, collect in collectors}
--- a/server/api_server/openapi/schemas.py
+++ b/server/api_server/openapi/schemas.py
@@ -651,6 +651,34 @@ class NetworkInterfacesResponse(BaseResponse):
    interfaces: Dict[str, Any] = Field(..., description="Details about network interfaces.")


+# =============================================================================
+# HEALTH CHECK SCHEMAS
+# =============================================================================
+
+
+class HealthCheckResponse(BaseResponse):
+    """System health check with vitality metrics.
+
+    The percentage and load fields may carry -1 as an "could not be measured"
+    sentinel, so their lower bound is -1 rather than 0. Extra keys are allowed.
+    """
+    model_config = ConfigDict(
+        extra="allow",
+        json_schema_extra={
+            "examples": [{
+                "success": True,
+                "db_size_mb": 125.45,
+                "mem_usage_pct": 65,
+                "load_1m": 2.15,
+                "storage_pct": 42,
+                "cpu_temp": 58
+            }]
+        }
+    )
+
+    db_size_mb: float = Field(..., description="Database size in MB (app.db + app.db-wal)")
+    # ge=-1 (not 0): the collectors report -1 when the metric cannot be read.
+    mem_usage_pct: int = Field(..., ge=-1, le=100, description="Memory usage percentage (0-100), or -1 if it could not be determined")
+    load_1m: float = Field(..., description="1-minute load average, or -1 if it could not be determined")
+    storage_pct: int = Field(..., ge=-1, le=100, description="Disk usage percentage of /data mount (0-100), or -1 if it could not be determined")
+    cpu_temp: Optional[int] = Field(None, description="CPU temperature in Celsius (nullable if unavailable)")
+
+
 # =============================================================================
 # EVENTS SCHEMAS
 # =============================================================================
--- a/test/api_endpoints/test_devices_endpoints.py
+++ b/test/api_endpoints/test_devices_endpoints.py
@@ -8,6 +8,7 @@ import pytest

 from helper import get_setting_value
 from api_server.api_server_start import app
+from db.db_helper import get_device_conditions


@pytest.fixture(scope="session")
@@ -158,7 +159,7 @@ def test_devices_totals(client, api_token, test_mac):
    # 3. Ensure the response is a JSON list
    data = resp.json
    assert isinstance(data, list)
-    assert len(data) == 6  # devices, connected, favorites, new, down, archived
+    assert len(data) ==  len(get_device_conditions())  # devices, connected, favorites, new, down, archived

    # 4. Check that at least 1 device exists
    assert data[0] >= 1  # 'devices' count includes the dummy device
--- a/test/api_endpoints/test_health_endpoints.py
+++ b/test/api_endpoints/test_health_endpoints.py
@@ -0,0 +1,237 @@
+"""Tests for health check endpoint."""
+
+import sys
+import os
+import pytest
+from unittest.mock import patch
+
+INSTALL_PATH = os.getenv("NETALERTX_APP", "/app")
+sys.path.extend([f"{INSTALL_PATH}/front/plugins", f"{INSTALL_PATH}/server"])
+
+from helper import get_setting_value  # noqa: E402
+from api_server.api_server_start import app  # noqa: E402
+
+
+@pytest.fixture(scope="session")
+def api_token():
+    """Session-scoped fixture providing the value of the "API_TOKEN" setting."""
+    token = get_setting_value("API_TOKEN")
+    return token
+
+
+@pytest.fixture
+def client():
+    """Yield a Flask test client for ``app``; the context closes after the test."""
+    with app.test_client() as test_client:
+        yield test_client
+
+
+def auth_headers(token):
+    """Build the headers dict carrying ``token`` as a Bearer Authorization value."""
+    bearer_value = f"Bearer {token}"
+    return {"Authorization": bearer_value}
+
+
+# ========================================================================
+# AUTHENTICATION TESTS
+# ========================================================================
+
+def test_health_unauthorized(client):
+    """A request without any token is rejected with 403 and success=False."""
+    response = client.get("/health")
+    assert response.status_code == 403
+
+    body = response.get_json()
+    assert body is not None
+    assert body.get("success") is False
+
+
+def test_health_invalid_token(client):
+    """A request with a wrong bearer token is rejected with 403 and success=False."""
+    bad_headers = auth_headers("INVALID-TOKEN")
+    response = client.get("/health", headers=bad_headers)
+    assert response.status_code == 403
+
+    body = response.get_json()
+    assert body is not None
+    assert body.get("success") is False
+
+
+def test_health_valid_token(client, api_token):
+    """A request with the configured token gets 200 and success=True."""
+    good_headers = auth_headers(api_token)
+    response = client.get("/health", headers=good_headers)
+    assert response.status_code == 200
+
+    body = response.get_json()
+    assert body is not None
+    assert body.get("success") is True
+
+
+# ========================================================================
+# RESPONSE STRUCTURE TESTS
+# ========================================================================
+
+def test_health_response_structure(client, api_token):
+    """A successful response exposes every health metric key."""
+    response = client.get("/health", headers=auth_headers(api_token))
+    assert response.status_code == 200
+
+    body = response.get_json()
+    assert body.get("success") is True
+
+    # Every metric key must be present in the payload
+    for metric_key in ("db_size_mb", "mem_usage_pct", "load_1m", "storage_pct", "cpu_temp"):
+        assert metric_key in body
+
+
+def test_health_db_size_type(client, api_token):
+    """db_size_mb should be numeric (int or float) and non-negative."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    size_mb = body["db_size_mb"]
+
+    assert isinstance(size_mb, (int, float))
+    assert size_mb >= 0
+
+
+def test_health_mem_usage_type(client, api_token):
+    """mem_usage_pct is an int that is either -1 (error) or within [0, 100]."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    mem_pct = body["mem_usage_pct"]
+
+    assert isinstance(mem_pct, int)
+    assert mem_pct == -1 or 0 <= mem_pct <= 100  # -1 on error
+
+
+def test_health_load_avg_type(client, api_token):
+    """load_1m should be numeric: a non-negative load, or exactly -1 on error."""
+    resp = client.get("/health", headers=auth_headers(api_token))
+    data = resp.get_json()
+
+    load = data["load_1m"]
+    assert isinstance(load, (int, float))
+    # A plain "load >= -1" would also accept values such as -0.5, which are
+    # neither a valid load average nor the -1 error sentinel.
+    assert load >= 0 or load == -1  # -1 on error
+
+
+def test_health_storage_pct_type(client, api_token):
+    """storage_pct is an int that is either -1 (error) or within [0, 100]."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    disk_pct = body["storage_pct"]
+
+    assert isinstance(disk_pct, int)
+    assert disk_pct == -1 or 0 <= disk_pct <= 100  # -1 on error
+
+
+def test_health_cpu_temp_optional(client, api_token):
+    """cpu_temp is either null or an int above the -100 sanity floor."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    temperature = body["cpu_temp"]
+
+    assert temperature is None or isinstance(temperature, int)
+    if not isinstance(temperature, int):
+        return
+    assert temperature > -100  # Lower sanity bound only
+
+
+# ========================================================================
+# METRIC CALCULATION TESTS
+# ========================================================================
+
+def test_health_db_size_realistic(client, api_token):
+    """Database size should fall in a plausible range (0 MB up to, but excluding, 5000 MB)."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    size_mb = body["db_size_mb"]
+
+    # 0 is allowed for minimal installations without data
+    assert size_mb >= 0
+    # Upper sanity bound: reported size must stay below 5000 MB
+    assert size_mb < 5000
+
+
+def test_health_mem_usage_reasonable(client, api_token):
+    """Unless the -1 error sentinel is reported, memory usage lies within 0-100%."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    mem_pct = body["mem_usage_pct"]
+
+    # Either the error sentinel or a real percentage
+    assert mem_pct == -1 or 0 <= mem_pct <= 100
+
+
+def test_health_storage_pct_reasonable(client, api_token):
+    """Unless the -1 error sentinel is reported, storage usage lies within 0-100%."""
+    body = client.get("/health", headers=auth_headers(api_token)).get_json()
+    disk_pct = body["storage_pct"]
+
+    # Either the error sentinel or a real percentage
+    assert disk_pct == -1 or 0 <= disk_pct <= 100
+
+
+# ========================================================================
+# ERROR HANDLING TESTS
+# ========================================================================
+
+@patch('api_server.api_server_start.get_health_status')
+def test_health_exception_handling(mock_health, client, api_token):
+    """When metric collection raises, the endpoint answers 500 with an error payload."""
+    # Make the collector used by the route blow up
+    mock_health.side_effect = Exception("Test error")
+
+    response = client.get("/health", headers=auth_headers(api_token))
+    assert response.status_code == 500
+
+    body = response.get_json()
+    assert body.get("success") is False
+    assert "error" in body
+
+
+# ========================================================================
+# RESPONSE CONSISTENCY TESTS
+# ========================================================================
+
+def test_health_multiple_calls_consistency(client, api_token):
+    """Three consecutive requests each succeed and expose every metric key."""
+    expected_keys = ("db_size_mb", "mem_usage_pct", "load_1m", "storage_pct", "cpu_temp")
+
+    for _attempt in range(3):
+        response = client.get("/health", headers=auth_headers(api_token))
+        assert response.status_code == 200
+
+        body = response.get_json()
+        assert body.get("success") is True
+        for metric_key in expected_keys:
+            assert metric_key in body
+
+
+# ========================================================================
+# HTTP METHOD TESTS
+# ========================================================================
+
+def test_health_post_not_allowed(client, api_token):
+    """POST to /health is rejected with 404 or 405."""
+    response = client.post("/health", headers=auth_headers(api_token))
+    # Both Method Not Allowed (405) and Not Found (404) count as a rejection
+    rejected = response.status_code == 404 or response.status_code == 405
+    assert rejected
+
+
+def test_health_delete_not_allowed(client, api_token):
+    """DELETE to /health is rejected with 404 or 405."""
+    response = client.delete("/health", headers=auth_headers(api_token))
+    # Both Method Not Allowed (405) and Not Found (404) count as a rejection
+    rejected = response.status_code == 404 or response.status_code == 405
+    assert rejected
+
+
+# ========================================================================
+# QUERY TOKEN AUTHENTICATION TEST
+# ========================================================================
+
+def test_health_query_token_auth(client, api_token):
+    """Passing the token as a ``token`` query parameter also authenticates."""
+    url = f"/health?token={api_token}"
+    response = client.get(url)
+    assert response.status_code == 200
+
+    body = response.get_json()
+    assert body.get("success") is True