feat: add health check endpoint and related schemas with tests

This commit is contained in:
Jokob @NetAlertX
2026-02-17 23:01:49 +00:00
parent 9ac8f6fe34
commit 264cae3338
6 changed files with 451 additions and 1 deletions

View File

@@ -5,6 +5,14 @@ description: NetAlertX coding standards and conventions. Use this when writing c
# Code Standards
- ask me to review before moving on to each next step (state progress as "step n of x")
- before starting, prepare implementation plan
- ask me to review it and ask any clarifying questions first
- add test creation as the last step; follow the repo's architecture patterns and do not place tests in the root of /test
- code has to be maintainable, no duplicate code
- follow DRY principle
- code files should be less than 500 LOC for better maintainability
## File Length
Keep code files under 500 lines. Split larger files into modules.

View File

@@ -41,6 +41,7 @@ from .nettools_endpoint import ( # noqa: E402 [flake8 lint suppression]
from .dbquery_endpoint import read_query, write_query, update_query, delete_query # noqa: E402 [flake8 lint suppression]
from .sync_endpoint import handle_sync_post, handle_sync_get # noqa: E402 [flake8 lint suppression]
from .logs_endpoint import clean_log # noqa: E402 [flake8 lint suppression]
from .health_endpoint import get_health_status # noqa: E402 [flake8 lint suppression]
from models.user_events_queue_instance import UserEventsQueueInstance # noqa: E402 [flake8 lint suppression]
from models.event_instance import EventInstance # noqa: E402 [flake8 lint suppression]
@@ -86,6 +87,7 @@ from .openapi.schemas import ( # noqa: E402 [flake8 lint suppression]
RecentEventsResponse, LastEventsResponse,
NetworkTopologyResponse,
InternetInfoResponse, NetworkInterfacesResponse,
HealthCheckResponse,
CreateEventRequest, CreateSessionRequest,
DeleteSessionRequest, CreateNotificationRequest,
SyncPushRequest, SyncPullResponse,
@@ -1930,6 +1932,33 @@ def check_auth(payload=None):
if request.method == "GET":
return jsonify({"success": True, "message": "Authentication check successful"}), 200
# --------------------------
# Health endpoint
# --------------------------
@app.route("/health", methods=["GET"])
@validate_request(
    operation_id="check_health",
    summary="System Health Check",
    description="Retrieve system vitality metrics including database size, memory pressure, system load, disk usage, and CPU temperature.",
    response_model=HealthCheckResponse,
    tags=["system", "health"],
    auth_callable=is_authorized
)
def check_health(payload=None):
    """Return the current health metrics as a JSON response.

    On success the metrics from ``get_health_status()`` are merged into a
    ``{"success": True, ...}`` body and returned with HTTP 200. Any exception
    raised while collecting or serializing them is logged and reported as an
    HTTP 500 with ``success`` set to ``False``.
    """
    try:
        metrics = get_health_status()
        ok_body = {"success": True, **metrics}
        return jsonify(ok_body), 200
    except Exception as exc:
        mylog("none", [f"[health] Error retrieving health status: {exc}"])
        error_body = {
            "success": False,
            "error": "Failed to retrieve health status",
            "message": str(exc),
        }
        return jsonify(error_body), 500
# --------------------------
# Background Server Start
# --------------------------

View File

@@ -0,0 +1,147 @@
"""Health check endpoint for NetAlertX system vitality monitoring."""
import os
import psutil
from pathlib import Path
from const import dbPath, dataPath
from logger import mylog
# ===============================================================================
# Database Vitality
# ===============================================================================
def get_db_size_mb():
    """
    Calculate total database size in MB (app.db + app.db-wal).

    Each file is stat'ed directly rather than being checked with ``exists()``
    first. A file that disappears between an ``exists()`` check and the
    ``stat()`` call (presumably possible for the ``-wal`` file when it is
    removed by the database engine) would otherwise abort the whole
    measurement and report 0 for a database that is actually present.

    Returns:
        float: Size in MB. Missing files contribute 0 bytes; 0.0 is returned
        if an unexpected error occurs.
    """
    try:
        size_bytes = 0
        for db_part in (Path(dbPath), Path(f"{dbPath}-wal")):
            try:
                size_bytes += db_part.stat().st_size
            except (FileNotFoundError, NotADirectoryError):
                # Absent file or absent parent path component: same outcome
                # as the former exists() check returning False.
                continue
        return round(size_bytes / (1024 * 1024), 2)
    except Exception as e:
        # Best-effort metric: never let a sizing failure break the endpoint.
        mylog("verbose", [f"[health] Error calculating DB size: {e}"])
        return 0.0
# ===============================================================================
# Memory Pressure
# ===============================================================================
def get_mem_usage_pct():
    """
    Report how much memory is in use as a whole-number percentage.

    The value is ``used / total * 100`` from ``psutil.virtual_memory()``,
    truncated to an int and clamped into the 0-100 range.

    Returns:
        int: Percentage between 0 and 100, or -1 if the reading failed.
    """
    try:
        mem = psutil.virtual_memory()
        raw_pct = int((mem.used / mem.total) * 100)
        # Keep the result inside the documented 0-100 range.
        if raw_pct < 0:
            return 0
        if raw_pct > 100:
            return 100
        return raw_pct
    except Exception as exc:
        mylog("verbose", [f"[health] Error calculating memory usage: {exc}"])
        return -1
# ===============================================================================
# System Stress
# ===============================================================================
def get_load_avg_1m():
    """
    Read the system's 1-minute load average.

    Returns:
        float: Load average rounded to two decimals, or -1.0 if it cannot
        be obtained.
    """
    try:
        one_minute, _five, _fifteen = os.getloadavg()
        rounded = round(one_minute, 2)
        return rounded
    except Exception as exc:
        mylog("verbose", [f"[health] Error getting load average: {exc}"])
        return -1.0
# ===============================================================================
# Disk Headroom
# ===============================================================================
def get_storage_pct():
    """
    Report disk usage of the filesystem holding ``dataPath`` as a percentage.

    Usage is derived from ``os.statvfs``: blocks that are not free, divided by
    total blocks, truncated to an int and clamped into the 0-100 range. A
    filesystem reporting zero total size yields 0.

    Returns:
        int: Percentage between 0 and 100, or -1 if the reading failed.
    """
    try:
        fs = os.statvfs(dataPath)
        total_bytes = fs.f_blocks * fs.f_frsize
        used_bytes = (fs.f_blocks - fs.f_bfree) * fs.f_frsize
        if total_bytes > 0:
            raw_pct = int((used_bytes / total_bytes) * 100)
        else:
            raw_pct = 0
        # Keep the result inside the documented 0-100 range.
        return min(100, max(0, raw_pct))
    except Exception as exc:
        mylog("verbose", [f"[health] Error calculating storage usage: {exc}"])
        return -1
# ===============================================================================
# Thermal Health
# ===============================================================================
def get_cpu_temp():
    """
    Read a temperature from the hardware sensors exposed by psutil.

    The first reading of the ``coretemp`` sensor group is used when present
    and non-empty; otherwise the first reading of the first non-empty sensor
    group is used.

    Returns:
        int: Temperature truncated to an integer, or None when no sensor
        data is available or the lookup fails.
    """
    try:
        sensors = psutil.sensors_temperatures()
        if not sensors:
            return None

        # Preferred source: the 'coretemp' group (Intel).
        if "coretemp" in sensors and sensors["coretemp"]:
            return int(sensors["coretemp"][0].current)

        # Otherwise take whichever sensor group first reports any reading.
        for readings in sensors.values():
            if readings:
                return int(readings[0].current)

        return None
    except Exception as exc:
        mylog("verbose", [f"[health] Error reading CPU temperature: {exc}"])
        return None
# ===============================================================================
# Aggregator
# ===============================================================================
def get_health_status():
    """
    Gather every health metric and return them together.

    Returns:
        dict: Keys ``db_size_mb``, ``mem_usage_pct``, ``load_1m``,
        ``storage_pct`` and ``cpu_temp``, each holding the value produced
        by the matching collector function.
    """
    status = {}
    status["db_size_mb"] = get_db_size_mb()
    status["mem_usage_pct"] = get_mem_usage_pct()
    status["load_1m"] = get_load_avg_1m()
    status["storage_pct"] = get_storage_pct()
    status["cpu_temp"] = get_cpu_temp()
    return status

View File

@@ -651,6 +651,34 @@ class NetworkInterfacesResponse(BaseResponse):
interfaces: Dict[str, Any] = Field(..., description="Details about network interfaces.")
# =============================================================================
# HEALTH CHECK SCHEMAS
# =============================================================================
class HealthCheckResponse(BaseResponse):
    """System health check with vitality metrics.

    The metric collectors report ``-1`` (``-1.0`` for the load average) when a
    value cannot be determined, so the lower bound of the percentage fields is
    ``-1`` rather than ``0``; otherwise a legitimate "unavailable" response
    would not conform to this schema.
    """
    model_config = ConfigDict(
        extra="allow",
        json_schema_extra={
            "examples": [{
                "success": True,
                "db_size_mb": 125.45,
                "mem_usage_pct": 65,
                "load_1m": 2.15,
                "storage_pct": 42,
                "cpu_temp": 58
            }]
        }
    )
    db_size_mb: float = Field(..., description="Database size in MB (app.db + app.db-wal)")
    # ge=-1 admits the collector's error sentinel alongside the 0-100 range.
    mem_usage_pct: int = Field(..., ge=-1, le=100, description="Memory usage percentage (0-100), or -1 if unavailable")
    load_1m: float = Field(..., description="1-minute load average, or -1 if unavailable")
    # ge=-1 admits the collector's error sentinel alongside the 0-100 range.
    storage_pct: int = Field(..., ge=-1, le=100, description="Disk usage percentage of /data mount (0-100), or -1 if unavailable")
    cpu_temp: Optional[int] = Field(None, description="CPU temperature in Celsius (nullable if unavailable)")
# =============================================================================
# EVENTS SCHEMAS
# =============================================================================

View File

@@ -8,6 +8,7 @@ import pytest
from helper import get_setting_value
from api_server.api_server_start import app
from db.db_helper import get_device_conditions
@pytest.fixture(scope="session")
@@ -158,7 +159,7 @@ def test_devices_totals(client, api_token, test_mac):
# 3. Ensure the response is a JSON list
data = resp.json
assert isinstance(data, list)
assert len(data) == 6 # devices, connected, favorites, new, down, archived
assert len(data) == len(get_device_conditions()) # devices, connected, favorites, new, down, archived
# 4. Check that at least 1 device exists
assert data[0] >= 1 # 'devices' count includes the dummy device

View File

@@ -0,0 +1,237 @@
"""Tests for health check endpoint."""
import sys
import os
import pytest
from unittest.mock import patch
INSTALL_PATH = os.getenv("NETALERTX_APP", "/app")
sys.path.extend([f"{INSTALL_PATH}/front/plugins", f"{INSTALL_PATH}/server"])
from helper import get_setting_value # noqa: E402
from api_server.api_server_start import app # noqa: E402
@pytest.fixture(scope="session")
def api_token():
    """Provide the value of the ``API_TOKEN`` setting for the test session."""
    token = get_setting_value("API_TOKEN")
    return token
@pytest.fixture
def client():
    """Yield a Flask test client bound to the application under test."""
    with app.test_client() as test_client:
        yield test_client
def auth_headers(token):
    """Build the request headers carrying ``token`` as a Bearer credential."""
    bearer_value = "Bearer {}".format(token)
    return {"Authorization": bearer_value}
# ========================================================================
# AUTHENTICATION TESTS
# ========================================================================
def test_health_unauthorized(client):
    """A request without any token is rejected with 403 and success=False."""
    response = client.get("/health")
    assert response.status_code == 403
    body = response.get_json()
    assert body is not None
    assert body.get("success") is False
def test_health_invalid_token(client):
    """A request with a bogus bearer token is rejected with 403."""
    bad_headers = auth_headers("INVALID-TOKEN")
    response = client.get("/health", headers=bad_headers)
    assert response.status_code == 403
    body = response.get_json()
    assert body is not None
    assert body.get("success") is False
def test_health_valid_token(client, api_token):
    """A request with the configured token succeeds with 200."""
    good_headers = auth_headers(api_token)
    response = client.get("/health", headers=good_headers)
    assert response.status_code == 200
    body = response.get_json()
    assert body is not None
    assert body.get("success") is True
# ========================================================================
# RESPONSE STRUCTURE TESTS
# ========================================================================
def test_health_response_structure(client, api_token):
    """A successful response carries every health metric key."""
    response = client.get("/health", headers=auth_headers(api_token))
    assert response.status_code == 200
    body = response.get_json()
    assert body.get("success") is True
    # Every metric must be present, in the order the endpoint documents them.
    expected_keys = ("db_size_mb", "mem_usage_pct", "load_1m", "storage_pct", "cpu_temp")
    for key in expected_keys:
        assert key in body
def test_health_db_size_type(client, api_token):
    """db_size_mb is numeric and never negative."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    db_size = body["db_size_mb"]
    assert isinstance(db_size, (int, float))
    assert body["db_size_mb"] >= 0
def test_health_mem_usage_type(client, api_token):
    """mem_usage_pct is an int within [0, 100], or the -1 error sentinel."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    mem_pct = body["mem_usage_pct"]
    assert isinstance(mem_pct, int)
    # -1 is what the collector reports when the reading failed.
    assert mem_pct == -1 or 0 <= mem_pct <= 100
def test_health_load_avg_type(client, api_token):
    """load_1m is numeric and not below the -1 error sentinel."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    load_value = body["load_1m"]
    assert isinstance(load_value, (int, float))
    # -1 is what the collector reports when the reading failed.
    assert load_value >= -1
def test_health_storage_pct_type(client, api_token):
    """storage_pct is an int within [0, 100], or the -1 error sentinel."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    storage_pct = body["storage_pct"]
    assert isinstance(storage_pct, int)
    # -1 is what the collector reports when the reading failed.
    assert storage_pct == -1 or 0 <= storage_pct <= 100
def test_health_cpu_temp_optional(client, api_token):
    """cpu_temp is either null or an int above a sanity floor."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    reading = body["cpu_temp"]
    if reading is None:
        # No sensor data available: nothing further to verify.
        return
    assert isinstance(reading, int)
    assert reading > -100  # Reasonable temperature bounds
# ========================================================================
# METRIC CALCULATION TESTS
# ========================================================================
def test_health_db_size_realistic(client, api_token):
    """db_size_mb stays within a plausible range for a test system."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    # 0 is allowed for minimal installations without data.
    assert body["db_size_mb"] >= 0
    # Upper sanity bound: the database should stay below roughly 5 GB.
    assert body["db_size_mb"] < 5000
def test_health_mem_usage_reasonable(client, api_token):
    """Unless flagged as an error (-1), mem_usage_pct lies within 0-100."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    if body["mem_usage_pct"] == -1:
        # Error sentinel: range check does not apply.
        return
    assert 0 <= body["mem_usage_pct"] <= 100
def test_health_storage_pct_reasonable(client, api_token):
    """Unless flagged as an error (-1), storage_pct lies within 0-100."""
    body = client.get("/health", headers=auth_headers(api_token)).get_json()
    if body["storage_pct"] == -1:
        # Error sentinel: range check does not apply.
        return
    assert 0 <= body["storage_pct"] <= 100
# ========================================================================
# ERROR HANDLING TESTS
# ========================================================================
@patch('api_server.api_server_start.get_health_status')
def test_health_exception_handling(mock_health, client, api_token):
    """A failure while collecting metrics yields a 500 error payload."""
    # Force the collector used by the route to raise.
    mock_health.side_effect = Exception("Test error")
    response = client.get("/health", headers=auth_headers(api_token))
    assert response.status_code == 500
    body = response.get_json()
    assert body.get("success") is False
    assert "error" in body
# ========================================================================
# METRIC INDEPENDENCE TESTS
# ========================================================================
def test_health_multiple_calls_consistency(client, api_token):
    """Three consecutive calls each return the full set of metric keys."""
    expected_keys = ("db_size_mb", "mem_usage_pct", "load_1m", "storage_pct", "cpu_temp")
    attempt = 0
    while attempt < 3:
        response = client.get("/health", headers=auth_headers(api_token))
        assert response.status_code == 200
        body = response.get_json()
        assert body.get("success") is True
        for key in expected_keys:
            assert key in body
        attempt += 1
# ========================================================================
# HTTP METHOD TESTS
# ========================================================================
def test_health_post_not_allowed(client, api_token):
    """POST on /health is rejected."""
    response = client.post("/health", headers=auth_headers(api_token))
    # Both 405 Method Not Allowed and 404 Not Found count as a rejection.
    rejected_codes = (404, 405)
    assert response.status_code in rejected_codes
def test_health_delete_not_allowed(client, api_token):
    """DELETE on /health is rejected."""
    response = client.delete("/health", headers=auth_headers(api_token))
    # Both 405 Method Not Allowed and 404 Not Found count as a rejection.
    rejected_codes = (404, 405)
    assert response.status_code in rejected_codes
# ========================================================================
# QUERY TOKEN AUTHENTICATION TEST
# ========================================================================
def test_health_query_token_auth(client, api_token):
    """Passing the token as a ``token`` query parameter also authenticates."""
    url = "/health?token={}".format(api_token)
    response = client.get(url)
    assert response.status_code == 200
    body = response.get_json()
    assert body.get("success") is True