add resource monitor

Co-authored-by: Gelu Vrabie <gelu@exolabs.net>
2025-12-23 14:17:58 -05:00 · 2025-07-25 13:10:53 +01:00
parent a241c92dd1
commit 9be08ec7dd
14 changed files with 470 additions and 2 deletions
--- a/.gitattributes
+++ b/.gitattributes
@@ -0,0 +1 @@
+worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text
--- a/.githooks/post-checkout
+++ b/.githooks/post-checkout
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-checkout "$@"
--- a/.githooks/post-commit
+++ b/.githooks/post-commit
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-commit "$@"
--- a/.githooks/post-merge
+++ b/.githooks/post-merge
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-merge "$@"
--- a/.githooks/pre-push
+++ b/.githooks/pre-push
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs pre-push "$@"
--- a/worker/init.py
+++ b/worker/init.py
@@ -1 +0,0 @@
- 
--- a/worker/main.py
+++ b/worker/main.py
@@ -15,10 +15,12 @@ from shared.types.events import (
    ChunkGenerated,
    Event,
    InstanceId,
+    NodePerformanceMeasured,
    RunnerDeleted,
    RunnerStatusUpdated,
    TaskStateUpdated,
 )
+from shared.types.profiling import NodePerformanceProfile
 from shared.types.state import State
 from shared.types.tasks import TaskStatus
 from shared.types.worker.common import RunnerId
@@ -52,6 +54,7 @@ from shared.types.worker.runners import (
 from shared.types.worker.shards import ShardMetadata
 from worker.download.download_utils import build_model_path
 from worker.runner.runner_supervisor import RunnerSupervisor
+from worker.utils.profile import start_polling_node_metrics


 def get_node_id() -> NodeId:
@@ -482,7 +485,6 @@ class Worker:

            await asyncio.sleep(0.01)

-    # TODO: Handle resource monitoring (write-only)

 async def main():
    node_id: NodeId = get_node_id()
@@ -491,6 +493,13 @@ async def main():
    event_log_manager = EventLogManager(EventLogConfig(), logger)
    await event_log_manager.initialize()
    
+    # TODO: add profiling etc to resource monitor
+    async def resource_monitor_callback(node_performance_profile: NodePerformanceProfile) -> None:
+        await event_log_manager.worker_events.append_events(
+            [NodePerformanceMeasured(node_id=node_id, node_profile=node_performance_profile)], origin=node_id
+        )
+    asyncio.create_task(start_polling_node_metrics(callback=resource_monitor_callback))
+
    worker = Worker(node_id, logger, event_log_manager.worker_events)

    await worker.run()
--- a/worker/utils/init.py
+++ b/worker/utils/init.py
@@ -0,0 +1,3 @@
+from .profile import start_polling_node_metrics
+
+__all__ = ["start_polling_node_metrics"]
--- a/worker/utils/macmon/.DS_Store
+++ b/worker/utils/macmon/.DS_Store
--- a/worker/utils/macmon/init.py
+++ b/worker/utils/macmon/init.py
@@ -0,0 +1,3 @@
+from .macmon import MacMonError, get_metrics, get_metrics_async
+
+__all__ = ['get_metrics', 'get_metrics_async', 'MacMonError']
--- a/worker/utils/macmon/bin/LICENSE.txt
+++ b/worker/utils/macmon/bin/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 vladkens
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
--- a/worker/utils/macmon/bin/readme.md
+++ b/worker/utils/macmon/bin/readme.md
@@ -0,0 +1,154 @@
+# `macmon` – Mac Monitor
+
+<div align="center">
+
+Sudoless performance monitoring CLI tool for Apple Silicon processors.
+
+[<img src="https://badges.ws/github/assets-dl/vladkens/macmon" />](https://github.com/vladkens/macmon/releases)
+[<img src="https://badges.ws/github/release/vladkens/macmon" />](https://github.com/vladkens/macmon/releases)
+[<img src="https://badges.ws/github/license/vladkens/macmon" />](https://github.com/vladkens/macmon/blob/main/LICENSE)
+[<img src="https://badges.ws/badge/-/buy%20me%20a%20coffee/ff813f?icon=buymeacoffee&label" alt="donate" />](https://buymeacoffee.com/vladkens)
+
+</div>
+
+<div align="center">
+  <img src="https://github.com/vladkens/macmon/blob/assets/macmon.png?raw=true" alt="preview" />
+</div>
+
+## Motivation
+
+Apple Silicon processors don't provide an easy way to see live power consumption. I was interested in this information while testing local LLM models. `asitop` is a nice and simple TUI to quickly see current metrics, but it reads data from `powermetrics` and requires root privileges. `macmon` uses a private macOS API to gather metrics (essentially the same as `powermetrics`) but runs without sudo. 🎉
+
+## 🌟 Features
+
+- 🚫 Works without sudo
+- ⚡ Real-time CPU / GPU / ANE power usage
+- 📊 CPU utilization per cluster
+- 💾 RAM / Swap usage
+- 📈 Historical charts + avg / max values
+- 🌡️ Average CPU / GPU temperature
+- 🎨 Switchable colors (6 variants)
+- 🪟 Can be rendered in a small window
+- 🦀 Written in Rust
+
+## 🍺 Install via Homebrew
+
+You can install [`macmon`](https://formulae.brew.sh/formula/macmon) using [brew](https://brew.sh/):
+
+```sh
+$ brew install macmon
+```
+
+## 🖥️ Install via MacPorts
+
+You can also install [`macmon`](https://ports.macports.org/port/macmon/) using [MacPorts](https://macports.org/):
+
+```sh
+$ sudo port install macmon
+```
+
+## 📦 Install from source
+
+1. Install [Rust toolchain](https://www.rust-lang.org/tools/install)
+
+2. Clone the repo:
+
+```sh
+git clone https://github.com/vladkens/macmon.git && cd macmon
+```
+
+3. Build and run:
+
+```sh
+cargo run -r
+```
+
+4. (Optionally) Binary can be moved to bin folder:
+
+```sh
+sudo cp target/release/macmon /usr/local/bin
+```
+
+## 🚀 Usage
+
+```sh
+Usage: macmon [OPTIONS] [COMMAND]
+
+Commands:
+  pipe   Output metrics in JSON format
+  debug  Print debug information
+  help   Print this message or the help of the given subcommand(s)
+
+Options:
+  -i, --interval <INTERVAL>  Update interval in milliseconds [default: 1000]
+  -h, --help                 Print help
+  -V, --version              Print version
+
+Controls:
+  c - change color
+  v - switch charts view: gauge / sparkline
+  q - quit
+```
+
+## 🚰 Piping
+
+You can use the pipe subcommand to output metrics in JSON format, which is suitable for piping into other tools or scripts. For example:
+
+```sh
+macmon pipe | jq
+```
+
+This command runs `macmon` in "pipe" mode and navigate output to `jq` for pretty-printing.
+
+You can also specify the number of samples to run using `-s` or `--samples` parameter (default: `0`, which runs indefinitely), and set update interval in milliseconds using the `-i` or `--interval` parameter (default: `1000` ms). For example:
+
+```sh
+macmon pipe -s 10 -i 500 | jq
+```
+
+This will collect 10 samples with an update interval of 500 milliseconds.
+
+### Output
+
+```jsonc
+{
+  "timestamp": "2025-02-24T20:38:15.427569+00:00",
+  "temp": {
+    "cpu_temp_avg": 43.73614,         // Celsius
+    "gpu_temp_avg": 36.95167          // Celsius
+  },
+  "memory": {
+    "ram_total": 25769803776,         // Bytes
+    "ram_usage": 20985479168,         // Bytes
+    "swap_total": 4294967296,         // Bytes
+    "swap_usage": 2602434560          // Bytes
+  },
+  "ecpu_usage": [1181, 0.082656614],  // (Frequency MHz, Usage %)
+  "pcpu_usage": [1974, 0.015181795],  // (Frequency MHz, Usage %)
+  "gpu_usage": [461, 0.021497859],    // (Frequency MHz, Usage %)
+  "cpu_power": 0.20486385,            // Watts
+  "gpu_power": 0.017451683,           // Watts
+  "ane_power": 0.0,                   // Watts
+  "all_power": 0.22231553,            // Watts
+  "sys_power": 5.876533,              // Watts
+  "ram_power": 0.11635789,            // Watts
+  "gpu_ram_power": 0.0009615385       // Watts (not sure what it means)
+}
+```
+
+## 🤝 Contributing
+We love contributions! Whether you have ideas, suggestions, or bug reports, feel free to open an issue or submit a pull request. Your input is essential in helping us improve `macmon` 💪
+
+## 📝 License
+`macmon` is distributed under the MIT License. For more details, check out the LICENSE.
+
+## 🔍 See also
+- [tlkh/asitop](https://github.com/tlkh/asitop) – Original tool. Python, requires sudo.
+- [dehydratedpotato/socpowerbud](https://github.com/dehydratedpotato/socpowerbud) – ObjectiveC, sudoless, no TUI.
+- [op06072/NeoAsitop](https://github.com/op06072/NeoAsitop) – Swift, sudoless.
+- [graelo/pumas](https://github.com/graelo/pumas) – Rust, requires sudo.
+- [context-labs/mactop](https://github.com/context-labs/mactop) – Go, requires sudo.
+
+---
+
+*PS: One More Thing... Remember, monitoring your Mac's performance with `macmon` is like having a personal trainer for your processor — keeping those cores in shape! 💪*
--- a/worker/utils/macmon/macmon.py
+++ b/worker/utils/macmon/macmon.py
@@ -0,0 +1,174 @@
+import asyncio
+import os
+import platform
+import subprocess
+from pathlib import Path
+from typing import Optional, Tuple
+
+from pydantic import BaseModel, ConfigDict, ValidationError
+
+
+class MacMonError(Exception):
+    """Exception raised for errors in the MacMon functions."""
+
+
+def _get_binary_path(binary_path: Optional[str] = None) -> str:
+    """
+    Get the path to the macmon binary.
+
+    Args:
+        binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+    Returns:
+        The path to the macmon binary.
+
+    Raises:
+        MacMonError: If the binary doesn't exist or can't be made executable.
+    """
+    # Check for macOS with ARM chip
+    system = platform.system().lower()
+    machine = platform.machine().lower()
+
+    if system != "darwin" or not (
+        "arm" in machine or "m1" in machine or "m2" in machine
+    ):
+        raise MacMonError("MacMon only supports macOS with Apple Silicon (ARM) chips")
+
+    if binary_path:
+        path = binary_path
+    else:
+        # Get the directory where this module is located
+        module_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+        path = str(module_dir / "bin" / "macmon")
+
+    # Ensure the binary exists and is executable
+    if not os.path.isfile(path):
+        raise MacMonError(f"Binary not found at: {path}")
+
+    # Make the binary executable if it's not already
+    if not os.access(path, os.X_OK):
+        try:
+            os.chmod(path, 0o755)  # rwx r-x r-x
+        except OSError as e:
+            raise MacMonError(f"Failed to make binary executable: {e}") from e
+
+    return path
+
+
+# ---------------------------------------------------------------------------
+# Pydantic metric structures
+# ---------------------------------------------------------------------------
+
+
+class MemoryMetrics(BaseModel):
+    """Memory-related metrics returned by macmon."""
+
+    ram_total: Optional[int] = None
+    ram_usage: Optional[int] = None
+    swap_total: Optional[int] = None
+    swap_usage: Optional[int] = None
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class TempMetrics(BaseModel):
+    """Temperature-related metrics returned by macmon."""
+
+    cpu_temp_avg: Optional[float] = None
+    gpu_temp_avg: Optional[float] = None
+
+    model_config = ConfigDict(extra="ignore")
+
+
+class Metrics(BaseModel):
+    """Complete set of metrics returned by *macmon* binary.
+
+    All fields are optional to allow for partial output from the binary.
+    Unknown fields are ignored for forward-compatibility.
+    """
+
+    all_power: Optional[float] = None
+    ane_power: Optional[float] = None
+    cpu_power: Optional[float] = None
+    ecpu_usage: Optional[Tuple[int, float]] = None
+    gpu_power: Optional[float] = None
+    gpu_ram_power: Optional[float] = None
+    gpu_usage: Optional[Tuple[int, float]] = None
+    memory: Optional[MemoryMetrics] = None
+    pcpu_usage: Optional[Tuple[int, float]] = None
+    ram_power: Optional[float] = None
+    sys_power: Optional[float] = None
+    temp: Optional[TempMetrics] = None
+    timestamp: Optional[str] = None
+
+    model_config = ConfigDict(extra="ignore")
+
+
+# ---------------------------------------------------------------------------
+# Synchronous helper
+# ---------------------------------------------------------------------------
+
+
+def get_metrics(binary_path: Optional[str] = None) -> Metrics:
+    """
+    Run the binary and return the metrics as a Python dictionary.
+
+    Args:
+        binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+    Returns:
+        A mapping containing system metrics.
+
+    Raises:
+        MacMonError: If there's an error running the binary.
+    """
+    path = _get_binary_path(binary_path)
+
+    try:
+        # Run the binary with the argument -s 1 and capture its output
+        result = subprocess.run(
+            [path, "pipe", "-s", "1"], capture_output=True, text=True, check=True
+        )
+
+        return Metrics.model_validate_json(result.stdout)
+
+    except subprocess.CalledProcessError as e:
+        raise MacMonError(f"Error running binary: {e.stderr}") from e  # type: ignore
+    except ValidationError as e:
+        raise MacMonError(f"Error parsing JSON output: {e}") from e
+
+
+async def get_metrics_async(binary_path: Optional[str] = None) -> Metrics:
+    """
+    Asynchronously run the binary and return the metrics as a Python dictionary.
+
+    Args:
+        binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+    Returns:
+        A mapping containing system metrics.
+
+    Raises:
+        MacMonError: If there's an error running the binary.
+    """
+    path = _get_binary_path(binary_path)
+
+    try:
+        proc = await asyncio.create_subprocess_exec(
+            path,
+            "pipe",
+            "-s",
+            "1",
+            stdout=asyncio.subprocess.PIPE,
+            stderr=asyncio.subprocess.PIPE,
+        )
+
+        stdout, stderr = await proc.communicate()
+
+        if proc.returncode != 0:
+            raise MacMonError(f"Error running binary: {stderr.decode().strip()}")
+
+        return Metrics.model_validate_json(stdout.decode().strip())
+
+    except ValidationError as e:
+        raise MacMonError(f"Error parsing JSON output: {e}") from e
--- a/worker/utils/profile.py
+++ b/worker/utils/profile.py
@@ -0,0 +1,92 @@
+import asyncio
+import platform
+from typing import Any, Callable, Coroutine
+
+from shared.types.profiling import (
+    MemoryPerformanceProfile,
+    NodePerformanceProfile,
+    SystemPerformanceProfile,
+)
+from worker.utils.macmon.macmon import (
+    Metrics,
+)
+from worker.utils.macmon.macmon import (
+    get_metrics_async as macmon_get_metrics_async,
+)
+
+# from exo.infra.event_log import EventLog
+# from exo.app.config import ResourceMonitorConfig
+# from exo.utils.mlx.mlx_utils import profile_flops_fp16
+
+
+async def get_metrics_async() -> Metrics:
+    """Return detailed Metrics on macOS or a minimal fallback elsewhere.
+
+    The *Metrics* schema comes from ``utils.macmon.macmon``; on non-macOS systems we
+    fill only the ``memory`` sub-structure so downstream code can still access
+    ``metrics.memory.ram_total`` & ``ram_usage``.
+    """
+
+    if platform.system().lower() == "darwin":
+        return await macmon_get_metrics_async()
+    return Metrics()
+
+
+async def start_polling_node_metrics(
+    callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]],
+):
+    poll_interval_s = 1.0
+    while True:
+        try:
+            # Gather metrics & system info with a timeout on each call
+            metrics = await get_metrics_async()
+
+            # Extract memory totals from metrics
+            total_mem = (
+                metrics.memory.ram_total
+                if metrics.memory is not None and metrics.memory.ram_total is not None
+                else 0
+            )
+            used_mem = (
+                metrics.memory.ram_usage
+                if metrics.memory is not None and metrics.memory.ram_usage is not None
+                else 0
+            )
+
+            # Run heavy FLOPs profiling only if enough time has elapsed
+
+            await callback(
+                NodePerformanceProfile(
+                    model_id=platform.machine(),
+                    chip_id=platform.processor(),
+                    memory=MemoryPerformanceProfile(
+                        ram_total=total_mem,
+                        ram_available=total_mem - used_mem,
+                        swap_total=metrics.memory.swap_total
+                        if metrics.memory is not None
+                        and metrics.memory.swap_total is not None
+                        else 0,
+                        swap_available=metrics.memory.swap_total
+                        - metrics.memory.swap_usage
+                        if metrics.memory is not None
+                        and metrics.memory.swap_usage is not None
+                        and metrics.memory.swap_total is not None
+                        else 0,
+                    ),
+                    network_interfaces=[],
+                    system=SystemPerformanceProfile(
+                        flops_fp16=0,
+                    ),
+                )
+            )
+
+        except asyncio.TimeoutError:
+            # One of the operations took too long; skip this iteration but keep the loop alive.
+            print(
+                "[resource_monitor] Operation timed out after 30s, skipping this cycle."
+            )
+        except Exception as e:
+            # Catch-all to ensure the monitor keeps running.
+            print(f"[resource_monitor] Encountered error: {e}")
+        finally:
+            await asyncio.sleep(poll_interval_s)
				`@@ -0,0 +1 @@`
				`worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text`