From 264cae33386a7a3c2c81fe986a9767e40c6750dd Mon Sep 17 00:00:00 2001 From: "Jokob @NetAlertX" <96159884+jokob-sk@users.noreply.github.com> Date: Tue, 17 Feb 2026 23:01:49 +0000 Subject: [PATCH 1/2] feat: add health check endpoint and related schemas with tests --- .github/skills/code-standards/SKILL.md | 8 + server/api_server/api_server_start.py | 29 +++ server/api_server/health_endpoint.py | 147 ++++++++++++ server/api_server/openapi/schemas.py | 28 +++ test/api_endpoints/test_devices_endpoints.py | 3 +- test/api_endpoints/test_health_endpoints.py | 237 +++++++++++++++++++ 6 files changed, 451 insertions(+), 1 deletion(-) create mode 100644 server/api_server/health_endpoint.py create mode 100644 test/api_endpoints/test_health_endpoints.py diff --git a/.github/skills/code-standards/SKILL.md b/.github/skills/code-standards/SKILL.md index e398323a..8b9e6ad6 100644 --- a/.github/skills/code-standards/SKILL.md +++ b/.github/skills/code-standards/SKILL.md @@ -5,6 +5,14 @@ description: NetAlertX coding standards and conventions. Use this when writing c # Code Standards +- ask me to review before going to each next step (mention n step out of x) +- before starting, prepare implementation plan +- ask me to review it and ask any clarifying questions first +- add test creation as last step - follow repo architecture patterns - do not place in the root of /test +- code has to be maintainable, no duplicate code +- follow DRY principle +- code files should be less than 500 LOC for better maintainability + ## File Length Keep code files under 500 lines. Split larger files into modules. diff --git a/server/api_server/api_server_start.py b/server/api_server/api_server_start.py index 8a0eb1ef..f39734bb 100755 --- a/server/api_server/api_server_start.py +++ b/server/api_server/api_server_start.py @@ -41,6 +41,7 @@ from .nettools_endpoint import ( # noqa: E402 [flake8 lint suppression] from .dbquery_endpoint import read_query, write_query, update_query, delete_query # noqa: E402 [flake8 lint suppression] from .sync_endpoint import handle_sync_post, handle_sync_get # noqa: E402 [flake8 lint suppression] from .logs_endpoint import clean_log # noqa: E402 [flake8 lint suppression] +from .health_endpoint import get_health_status # noqa: E402 [flake8 lint suppression] from models.user_events_queue_instance import UserEventsQueueInstance # noqa: E402 [flake8 lint suppression] from models.event_instance import EventInstance # noqa: E402 [flake8 lint suppression] @@ -86,6 +87,7 @@ from .openapi.schemas import ( # noqa: E402 [flake8 lint suppression] RecentEventsResponse, LastEventsResponse, NetworkTopologyResponse, InternetInfoResponse, NetworkInterfacesResponse, + HealthCheckResponse, CreateEventRequest, CreateSessionRequest, DeleteSessionRequest, CreateNotificationRequest, SyncPushRequest, SyncPullResponse, @@ -1930,6 +1932,33 @@ def check_auth(payload=None): if request.method == "GET": return jsonify({"success": True, "message": "Authentication check successful"}), 200 + +# -------------------------- +# Health endpoint +# -------------------------- +@app.route("/health", methods=["GET"]) +@validate_request( + operation_id="check_health", + summary="System Health Check", + description="Retrieve system vitality metrics including database size, memory pressure, system load, disk usage, and CPU temperature.", + response_model=HealthCheckResponse, + tags=["system", "health"], + auth_callable=is_authorized +) +def check_health(payload=None): + """Get system health metrics for monitoring and diagnostics.""" + try: + health_data = get_health_status() + return jsonify({"success": True, **health_data}), 200 + except Exception as e: + mylog("none", [f"[health] Error retrieving health status: {e}"]) + return jsonify({ + "success": False, + "error": "Failed to retrieve health status", + "message": str(e) + }), 500 + + # -------------------------- # Background Server Start # -------------------------- diff --git a/server/api_server/health_endpoint.py b/server/api_server/health_endpoint.py new file mode 100644 index 00000000..e9b1c56f --- /dev/null +++ b/server/api_server/health_endpoint.py @@ -0,0 +1,147 @@ +"""Health check endpoint for NetAlertX system vitality monitoring.""" + +import os +import psutil +from pathlib import Path + +from const import dbPath, dataPath +from logger import mylog + + +# =============================================================================== +# Database Vitality +# =============================================================================== + +def get_db_size_mb(): + """ + Calculate total database size in MB (app.db + app.db-wal). + + Returns: + float: Size in MB, or 0 if database files don't exist. + """ + try: + db_file = Path(dbPath) + wal_file = Path(f"{dbPath}-wal") + + size_bytes = 0 + if db_file.exists(): + size_bytes += db_file.stat().st_size + if wal_file.exists(): + size_bytes += wal_file.stat().st_size + + return round(size_bytes / (1024 * 1024), 2) + except Exception as e: + mylog("verbose", [f"[health] Error calculating DB size: {e}"]) + return 0.0 + + +# =============================================================================== +# Memory Pressure +# =============================================================================== + +def get_mem_usage_pct(): + """ + Calculate memory usage percentage (used / total * 100). + + Returns: + int: Memory usage as integer percentage (0-100), or -1 on error. + """ + try: + vm = psutil.virtual_memory() + pct = int((vm.used / vm.total) * 100) + return max(0, min(100, pct)) # Clamp to 0-100 + except Exception as e: + mylog("verbose", [f"[health] Error calculating memory usage: {e}"]) + return -1 + + +# =============================================================================== +# System Stress +# =============================================================================== + +def get_load_avg_1m(): + """ + Get 1-minute load average. + + Returns: + float: 1-minute load average, or -1 on error. + """ + try: + load_1m, _, _ = os.getloadavg() + return round(load_1m, 2) + except Exception as e: + mylog("verbose", [f"[health] Error getting load average: {e}"]) + return -1.0 + + +# =============================================================================== +# Disk Headroom +# =============================================================================== + +def get_storage_pct(): + """ + Calculate disk usage percentage of /data mount. + + Returns: + int: Disk usage as integer percentage (0-100), or -1 on error. + """ + try: + stat = os.statvfs(dataPath) + total = stat.f_blocks * stat.f_frsize + used = (stat.f_blocks - stat.f_bfree) * stat.f_frsize + pct = int((used / total) * 100) if total > 0 else 0 + return max(0, min(100, pct)) # Clamp to 0-100 + except Exception as e: + mylog("verbose", [f"[health] Error calculating storage usage: {e}"]) + return -1 + + +# =============================================================================== +# Thermal Health +# =============================================================================== + +def get_cpu_temp(): + """ + Get CPU temperature from hardware sensors if available. + + Returns: + int: CPU temperature in Celsius, or None if unavailable. + """ + try: + temps = psutil.sensors_temperatures() + if not temps: + return None + + # Prefer 'coretemp' (Intel), fallback to first available + if "coretemp" in temps and temps["coretemp"]: + return int(temps["coretemp"][0].current) + + # Fallback to first sensor with data + for sensor_type, readings in temps.items(): + if readings: + return int(readings[0].current) + + return None + except Exception as e: + mylog("verbose", [f"[health] Error reading CPU temperature: {e}"]) + return None + + +# =============================================================================== +# Aggregator +# =============================================================================== + +def get_health_status(): + """ + Collect all health metrics into a single dict. + + Returns: + dict: Dictionary with all health metrics. + """ + return { + "db_size_mb": get_db_size_mb(), + "mem_usage_pct": get_mem_usage_pct(), + "load_1m": get_load_avg_1m(), + "storage_pct": get_storage_pct(), + "cpu_temp": get_cpu_temp(), + } diff --git a/server/api_server/openapi/schemas.py b/server/api_server/openapi/schemas.py index 84561a7c..040ab037 100644 --- a/server/api_server/openapi/schemas.py +++ b/server/api_server/openapi/schemas.py @@ -651,6 +651,34 @@ class NetworkInterfacesResponse(BaseResponse): interfaces: Dict[str, Any] = Field(..., description="Details about network interfaces.") +# ============================================================================= +# HEALTH CHECK SCHEMAS +# ============================================================================= + + +class HealthCheckResponse(BaseResponse): + """System health check with vitality metrics.""" + model_config = ConfigDict( + extra="allow", + json_schema_extra={ + "examples": [{ + "success": True, + "db_size_mb": 125.45, + "mem_usage_pct": 65, + "load_1m": 2.15, + "storage_pct": 42, + "cpu_temp": 58 + }] + } + ) + + db_size_mb: float = Field(..., description="Database size in MB (app.db + app.db-wal)") + mem_usage_pct: int = Field(..., ge=0, le=100, description="Memory usage percentage (0-100)") + load_1m: float = Field(..., description="1-minute load average") + storage_pct: int = Field(..., ge=0, le=100, description="Disk usage percentage of /data mount (0-100)") + cpu_temp: Optional[int] = Field(None, description="CPU temperature in Celsius (nullable if unavailable)") + + # ============================================================================= # EVENTS SCHEMAS # ============================================================================= diff --git a/test/api_endpoints/test_devices_endpoints.py b/test/api_endpoints/test_devices_endpoints.py index ce498227..179fce7e 100644 --- a/test/api_endpoints/test_devices_endpoints.py +++ b/test/api_endpoints/test_devices_endpoints.py @@ -8,6 +8,7 @@ import pytest from helper import get_setting_value from api_server.api_server_start import app +from db.db_helper import get_device_conditions @pytest.fixture(scope="session") @@ -158,7 +159,7 @@ def test_devices_totals(client, api_token, test_mac): # 3. Ensure the response is a JSON list data = resp.json assert isinstance(data, list) - assert len(data) == 6 # devices, connected, favorites, new, down, archived + assert len(data) == len(get_device_conditions()) # devices, connected, favorites, new, down, archived # 4. Check that at least 1 device exists assert data[0] >= 1 # 'devices' count includes the dummy device diff --git a/test/api_endpoints/test_health_endpoints.py b/test/api_endpoints/test_health_endpoints.py new file mode 100644 index 00000000..f34c89d9 --- /dev/null +++ b/test/api_endpoints/test_health_endpoints.py @@ -0,0 +1,237 @@ +"""Tests for health check endpoint.""" + +import sys +import os +import pytest +from unittest.mock import patch + +INSTALL_PATH = os.getenv("NETALERTX_APP", "/app") +sys.path.extend([f"{INSTALL_PATH}/front/plugins", f"{INSTALL_PATH}/server"]) + +from helper import get_setting_value # noqa: E402 +from api_server.api_server_start import app # noqa: E402 + + +@pytest.fixture(scope="session") +def api_token(): + """Load API token from system settings.""" + return get_setting_value("API_TOKEN") + + +@pytest.fixture +def client(): + """Flask test client.""" + with app.test_client() as client: + yield client + + +def auth_headers(token): + """Helper to construct Authorization header.""" + return {"Authorization": f"Bearer {token}"} + + +# ======================================================================== +# AUTHENTICATION TESTS +# ======================================================================== + +def test_health_unauthorized(client): + """Missing token should be forbidden.""" + resp = client.get("/health") + assert resp.status_code == 403 + + data = resp.get_json() + assert data is not None + assert data.get("success") is False + + +def test_health_invalid_token(client): + """Invalid bearer token should be forbidden.""" + resp = client.get("/health", headers=auth_headers("INVALID-TOKEN")) + assert resp.status_code == 403 + + data = resp.get_json() + assert data is not None + assert data.get("success") is False + + +def test_health_valid_token(client, api_token): + """Valid token should allow access.""" + resp = client.get("/health", headers=auth_headers(api_token)) + assert resp.status_code == 200 + + data = resp.get_json() + assert data is not None + assert data.get("success") is True + + +# ======================================================================== +# RESPONSE STRUCTURE TESTS +# ======================================================================== + +def test_health_response_structure(client, api_token): + """Response should contain all required health metrics.""" + resp = client.get("/health", headers=auth_headers(api_token)) + assert resp.status_code == 200 + + data = resp.get_json() + assert data.get("success") is True + + # Check all required fields are present + assert "db_size_mb" in data + assert "mem_usage_pct" in data + assert "load_1m" in data + assert "storage_pct" in data + assert "cpu_temp" in data + + +def test_health_db_size_type(client, api_token): + """db_size_mb should be a float.""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + assert isinstance(data["db_size_mb"], (int, float)) + assert data["db_size_mb"] >= 0 + + +def test_health_mem_usage_type(client, api_token): + """mem_usage_pct should be an integer in range [0, 100].""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + mem = data["mem_usage_pct"] + assert isinstance(mem, int) + assert 0 <= mem <= 100 or mem == -1 # -1 on error + + +def test_health_load_avg_type(client, api_token): + """load_1m should be a float.""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + load = data["load_1m"] + assert isinstance(load, (int, float)) + assert load >= -1 # -1 on error + + +def test_health_storage_pct_type(client, api_token): + """storage_pct should be an integer in range [0, 100].""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + storage = data["storage_pct"] + assert isinstance(storage, int) + assert 0 <= storage <= 100 or storage == -1 # -1 on error + + +def test_health_cpu_temp_optional(client, api_token): + """cpu_temp should be optional (int or null).""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + cpu_temp = data["cpu_temp"] + assert cpu_temp is None or isinstance(cpu_temp, int) + if isinstance(cpu_temp, int): + assert cpu_temp > -100 # Reasonable temperature bounds + + +# ======================================================================== +# METRIC CALCULATION TESTS +# ======================================================================== + +def test_health_db_size_realistic(client, api_token): + """Database size should be reasonable (>0 MB in active system).""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + # In a real system with data, DB should be > 1 MB + # Allow 0 for minimal installations without data + assert data["db_size_mb"] >= 0 + # Sanity check: file shouldn't exceed 5GB + assert data["db_size_mb"] < 5000 + + +def test_health_mem_usage_reasonable(client, api_token): + """Memory usage should be reasonable for normal operation.""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + # Sanity check: should be between 0% and 100% + if data["mem_usage_pct"] != -1: + assert 0 <= data["mem_usage_pct"] <= 100 + + +def test_health_storage_pct_reasonable(client, api_token): + """Storage percentage should be reasonable.""" + resp = client.get("/health", headers=auth_headers(api_token)) + data = resp.get_json() + + # Sanity check: should be between 0% and 100% + if data["storage_pct"] != -1: + assert 0 <= data["storage_pct"] <= 100 + + +# ======================================================================== +# ERROR HANDLING TESTS +# ======================================================================== + +@patch('api_server.api_server_start.get_health_status') +def test_health_exception_handling(mock_health, client, api_token): + """Health endpoint should handle exceptions gracefully.""" + mock_health.side_effect = Exception("Test error") + + resp = client.get("/health", headers=auth_headers(api_token)) + assert resp.status_code == 500 + + data = resp.get_json() + assert data.get("success") is False + assert "error" in data + + +# ======================================================================== +# METRIC INDEPENDENCE TESTS +# ======================================================================== + +def test_health_multiple_calls_consistency(client, api_token): + """Multiple calls should return consistent structure.""" + for _ in range(3): + resp = client.get("/health", headers=auth_headers(api_token)) + assert resp.status_code == 200 + + data = resp.get_json() + assert data.get("success") is True + assert "db_size_mb" in data + assert "mem_usage_pct" in data + assert "load_1m" in data + assert "storage_pct" in data + assert "cpu_temp" in data + + +# ======================================================================== +# HTTP METHOD TESTS +# ======================================================================== + +def test_health_post_not_allowed(client, api_token): + """POST to /health should not be allowed.""" + resp = client.post("/health", headers=auth_headers(api_token)) + # Either 405 Method Not Allowed or 404 Not Found is acceptable + assert resp.status_code in (404, 405) + + +def test_health_delete_not_allowed(client, api_token): + """DELETE to /health should not be allowed.""" + resp = client.delete("/health", headers=auth_headers(api_token)) + # Either 405 Method Not Allowed or 404 Not Found is acceptable + assert resp.status_code in (404, 405) + + +# ======================================================================== +# QUERY TOKEN AUTHENTICATION TEST +# ======================================================================== + +def test_health_query_token_auth(client, api_token): + """Query token should also work for authentication.""" + resp = client.get(f"/health?token={api_token}") + assert resp.status_code == 200 + + data = resp.get_json() + assert data.get("success") is True From bc97a80375b78b3901bc112d159076e91e44e484 Mon Sep 17 00:00:00 2001 From: "Jokob @NetAlertX" <96159884+jokob-sk@users.noreply.github.com> Date: Tue, 17 Feb 2026 23:16:21 +0000 Subject: [PATCH 2/2] fix: update health check response and schema to handle nullable memory and storage usage --- server/api_server/api_server_start.py | 2 +- server/api_server/health_endpoint.py | 18 ++++-------------- server/api_server/openapi/schemas.py | 4 ++-- 3 files changed, 7 insertions(+), 17 deletions(-) diff --git a/server/api_server/api_server_start.py b/server/api_server/api_server_start.py index f39734bb..197ee26d 100755 --- a/server/api_server/api_server_start.py +++ b/server/api_server/api_server_start.py @@ -1955,7 +1955,7 @@ def check_health(payload=None): return jsonify({ "success": False, "error": "Failed to retrieve health status", - "message": str(e) + "message": "Internal server error" }), 500 diff --git a/server/api_server/health_endpoint.py b/server/api_server/health_endpoint.py index e9b1c56f..b4ca8730 100644 --- a/server/api_server/health_endpoint.py +++ b/server/api_server/health_endpoint.py @@ -44,7 +44,7 @@ def get_mem_usage_pct(): Calculate memory usage percentage (used / total * 100). Returns: - int: Memory usage as integer percentage (0-100), or -1 on error. + int: Memory usage as integer percentage (0-100), or None on error. """ try: vm = psutil.virtual_memory() @@ -52,12 +52,7 @@ def get_mem_usage_pct(): return max(0, min(100, pct)) # Clamp to 0-100 except Exception as e: mylog("verbose", [f"[health] Error calculating memory usage: {e}"]) - return -1 - - -# =============================================================================== -# System Stress -# =============================================================================== + return None def get_load_avg_1m(): """ @@ -83,7 +78,7 @@ def get_storage_pct(): Calculate disk usage percentage of /data mount. Returns: - int: Disk usage as integer percentage (0-100), or -1 on error. + int: Disk usage as integer percentage (0-100), or None on error. """ try: stat = os.statvfs(dataPath) @@ -93,12 +88,7 @@ def get_storage_pct(): return max(0, min(100, pct)) # Clamp to 0-100 except Exception as e: mylog("verbose", [f"[health] Error calculating storage usage: {e}"]) - return -1 - - -# =============================================================================== -# Thermal Health -# =============================================================================== + return None def get_cpu_temp(): """ diff --git a/server/api_server/openapi/schemas.py b/server/api_server/openapi/schemas.py index 040ab037..cb4e1dc1 100644 --- a/server/api_server/openapi/schemas.py +++ b/server/api_server/openapi/schemas.py @@ -673,9 +673,9 @@ class HealthCheckResponse(BaseResponse): ) db_size_mb: float = Field(..., description="Database size in MB (app.db + app.db-wal)") - mem_usage_pct: int = Field(..., ge=0, le=100, description="Memory usage percentage (0-100)") + mem_usage_pct: Optional[int] = Field(None, ge=0, le=100, description="Memory usage percentage (0-100, nullable if unavailable)") load_1m: float = Field(..., description="1-minute load average") - storage_pct: int = Field(..., ge=0, le=100, description="Disk usage percentage of /data mount (0-100)") + storage_pct: Optional[int] = Field(None, ge=0, le=100, description="Disk usage percentage of /data mount (0-100, nullable if unavailable)") cpu_temp: Optional[int] = Field(None, description="CPU temperature in Celsius (nullable if unavailable)")