From 9be08ec7dddf3ca6682db760d3d539cbd55270a6 Mon Sep 17 00:00:00 2001 From: Gelu Vrabie Date: Fri, 25 Jul 2025 13:10:53 +0100 Subject: [PATCH] add resource monitor Co-authored-by: Gelu Vrabie --- .gitattributes | 1 + .githooks/post-checkout | 3 + .githooks/post-commit | 3 + .githooks/post-merge | 3 + .githooks/pre-push | 3 + worker/__init__.py | 1 - worker/main.py | 11 +- worker/utils/__init__.py | 3 + worker/utils/macmon/.DS_Store | Bin 0 -> 6148 bytes worker/utils/macmon/__init__.py | 3 + worker/utils/macmon/bin/LICENSE.txt | 21 ++++ worker/utils/macmon/bin/readme.md | 154 ++++++++++++++++++++++++ worker/utils/macmon/macmon.py | 174 ++++++++++++++++++++++++++++ worker/utils/profile.py | 92 +++++++++++++++ 14 files changed, 470 insertions(+), 2 deletions(-) create mode 100644 .gitattributes create mode 100755 .githooks/post-checkout create mode 100755 .githooks/post-commit create mode 100755 .githooks/post-merge create mode 100755 .githooks/pre-push create mode 100644 worker/utils/__init__.py create mode 100644 worker/utils/macmon/.DS_Store create mode 100644 worker/utils/macmon/__init__.py create mode 100644 worker/utils/macmon/bin/LICENSE.txt create mode 100644 worker/utils/macmon/bin/readme.md create mode 100644 worker/utils/macmon/macmon.py create mode 100644 worker/utils/profile.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..c2b5fa9b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text diff --git a/.githooks/post-checkout b/.githooks/post-checkout new file mode 100755 index 00000000..5abf8ed9 --- /dev/null +++ b/.githooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit new file mode 100755 index 00000000..b8b76c2c --- /dev/null +++ b/.githooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge new file mode 100755 index 00000000..726f9098 --- /dev/null +++ b/.githooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 00000000..5f26dc45 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs pre-push "$@" diff --git a/worker/__init__.py b/worker/__init__.py index 0519ecba..e69de29b 100644 --- a/worker/__init__.py +++ b/worker/__init__.py @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/worker/main.py b/worker/main.py index 8a078c6a..3c4a5c45 100644 --- a/worker/main.py +++ b/worker/main.py @@ -15,10 +15,12 @@ from shared.types.events import ( ChunkGenerated, Event, InstanceId, + NodePerformanceMeasured, RunnerDeleted, RunnerStatusUpdated, TaskStateUpdated, ) +from shared.types.profiling import NodePerformanceProfile from shared.types.state import State from shared.types.tasks import TaskStatus from shared.types.worker.common import RunnerId @@ -52,6 +54,7 @@ from shared.types.worker.runners import ( from shared.types.worker.shards import ShardMetadata from worker.download.download_utils import build_model_path from worker.runner.runner_supervisor import RunnerSupervisor +from worker.utils.profile import start_polling_node_metrics def get_node_id() -> NodeId: @@ -482,7 +485,6 @@ class Worker: await asyncio.sleep(0.01) - # TODO: Handle resource monitoring (write-only) async def main(): node_id: NodeId = get_node_id() @@ -490,6 +492,13 @@ async def main(): event_log_manager = EventLogManager(EventLogConfig(), logger) await event_log_manager.initialize() + + # TODO: add profiling etc to resource monitor + async def resource_monitor_callback(node_performance_profile: NodePerformanceProfile) -> None: + await event_log_manager.worker_events.append_events( + [NodePerformanceMeasured(node_id=node_id, node_profile=node_performance_profile)], origin=node_id + ) + asyncio.create_task(start_polling_node_metrics(callback=resource_monitor_callback)) worker = Worker(node_id, logger, event_log_manager.worker_events) diff --git a/worker/utils/__init__.py b/worker/utils/__init__.py new file mode 100644 index 00000000..386a613c --- /dev/null +++ b/worker/utils/__init__.py @@ -0,0 +1,3 @@ +from .profile import start_polling_node_metrics + +__all__ = ["start_polling_node_metrics"] diff --git a/worker/utils/macmon/.DS_Store b/worker/utils/macmon/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..a3585876b6842aef6ec2eca2c673c194487ea3b6 GIT binary patch literal 6148 zcmeH~O=`nH427SXECStl+2w30pQvfVyQljO&;ssLc!1UOG})p;=82 zR;?Ceh}WZ?+UmMqI#RP8R>OzYoz15hnq@nzF`-!xQ4j$USP + +Sudoless performance monitoring CLI tool for Apple Silicon processors. + +[](https://github.com/vladkens/macmon/releases) +[](https://github.com/vladkens/macmon/releases) +[](https://github.com/vladkens/macmon/blob/main/LICENSE) +[donate](https://buymeacoffee.com/vladkens) + + + +
+ preview +
+ +## Motivation + +Apple Silicon processors don't provide an easy way to see live power consumption. I was interested in this information while testing local LLM models. `asitop` is a nice and simple TUI to quickly see current metrics, but it reads data from `powermetrics` and requires root privileges. `macmon` uses a private macOS API to gather metrics (essentially the same as `powermetrics`) but runs without sudo. 🎉 + +## 🌟 Features + +- 🚫 Works without sudo +- ⚡ Real-time CPU / GPU / ANE power usage +- 📊 CPU utilization per cluster +- 💾 RAM / Swap usage +- 📈 Historical charts + avg / max values +- 🌡️ Average CPU / GPU temperature +- 🎨 Switchable colors (6 variants) +- 🪟 Can be rendered in a small window +- 🦀 Written in Rust + +## 🍺 Install via Homebrew + +You can install [`macmon`](https://formulae.brew.sh/formula/macmon) using [brew](https://brew.sh/): + +```sh +$ brew install macmon +``` + +## 🖥️ Install via MacPorts + +You can also install [`macmon`](https://ports.macports.org/port/macmon/) using [MacPorts](https://macports.org/): + +```sh +$ sudo port install macmon +``` + +## 📦 Install from source + +1. Install [Rust toolchain](https://www.rust-lang.org/tools/install) + +2. Clone the repo: + +```sh +git clone https://github.com/vladkens/macmon.git && cd macmon +``` + +3. Build and run: + +```sh +cargo run -r +``` + +4. (Optionally) Binary can be moved to bin folder: + +```sh +sudo cp target/release/macmon /usr/local/bin +``` + +## 🚀 Usage + +```sh +Usage: macmon [OPTIONS] [COMMAND] + +Commands: + pipe Output metrics in JSON format + debug Print debug information + help Print this message or the help of the given subcommand(s) + +Options: + -i, --interval Update interval in milliseconds [default: 1000] + -h, --help Print help + -V, --version Print version + +Controls: + c - change color + v - switch charts view: gauge / sparkline + q - quit +``` + +## 🚰 Piping + +You can use the pipe subcommand to output metrics in JSON format, which is suitable for piping into other tools or scripts. For example: + +```sh +macmon pipe | jq +``` + +This command runs `macmon` in "pipe" mode and navigate output to `jq` for pretty-printing. + +You can also specify the number of samples to run using `-s` or `--samples` parameter (default: `0`, which runs indefinitely), and set update interval in milliseconds using the `-i` or `--interval` parameter (default: `1000` ms). For example: + +```sh +macmon pipe -s 10 -i 500 | jq +``` + +This will collect 10 samples with an update interval of 500 milliseconds. + +### Output + +```jsonc +{ + "timestamp": "2025-02-24T20:38:15.427569+00:00", + "temp": { + "cpu_temp_avg": 43.73614, // Celsius + "gpu_temp_avg": 36.95167 // Celsius + }, + "memory": { + "ram_total": 25769803776, // Bytes + "ram_usage": 20985479168, // Bytes + "swap_total": 4294967296, // Bytes + "swap_usage": 2602434560 // Bytes + }, + "ecpu_usage": [1181, 0.082656614], // (Frequency MHz, Usage %) + "pcpu_usage": [1974, 0.015181795], // (Frequency MHz, Usage %) + "gpu_usage": [461, 0.021497859], // (Frequency MHz, Usage %) + "cpu_power": 0.20486385, // Watts + "gpu_power": 0.017451683, // Watts + "ane_power": 0.0, // Watts + "all_power": 0.22231553, // Watts + "sys_power": 5.876533, // Watts + "ram_power": 0.11635789, // Watts + "gpu_ram_power": 0.0009615385 // Watts (not sure what it means) +} +``` + +## 🤝 Contributing +We love contributions! Whether you have ideas, suggestions, or bug reports, feel free to open an issue or submit a pull request. Your input is essential in helping us improve `macmon` 💪 + +## 📝 License +`macmon` is distributed under the MIT License. For more details, check out the LICENSE. + +## 🔍 See also +- [tlkh/asitop](https://github.com/tlkh/asitop) – Original tool. Python, requires sudo. +- [dehydratedpotato/socpowerbud](https://github.com/dehydratedpotato/socpowerbud) – ObjectiveC, sudoless, no TUI. +- [op06072/NeoAsitop](https://github.com/op06072/NeoAsitop) – Swift, sudoless. +- [graelo/pumas](https://github.com/graelo/pumas) – Rust, requires sudo. +- [context-labs/mactop](https://github.com/context-labs/mactop) – Go, requires sudo. + +--- + +*PS: One More Thing... Remember, monitoring your Mac's performance with `macmon` is like having a personal trainer for your processor — keeping those cores in shape! 💪* diff --git a/worker/utils/macmon/macmon.py b/worker/utils/macmon/macmon.py new file mode 100644 index 00000000..26b18416 --- /dev/null +++ b/worker/utils/macmon/macmon.py @@ -0,0 +1,174 @@ +import asyncio +import os +import platform +import subprocess +from pathlib import Path +from typing import Optional, Tuple + +from pydantic import BaseModel, ConfigDict, ValidationError + + +class MacMonError(Exception): + """Exception raised for errors in the MacMon functions.""" + + +def _get_binary_path(binary_path: Optional[str] = None) -> str: + """ + Get the path to the macmon binary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + The path to the macmon binary. + + Raises: + MacMonError: If the binary doesn't exist or can't be made executable. + """ + # Check for macOS with ARM chip + system = platform.system().lower() + machine = platform.machine().lower() + + if system != "darwin" or not ( + "arm" in machine or "m1" in machine or "m2" in machine + ): + raise MacMonError("MacMon only supports macOS with Apple Silicon (ARM) chips") + + if binary_path: + path = binary_path + else: + # Get the directory where this module is located + module_dir = Path(os.path.dirname(os.path.abspath(__file__))) + path = str(module_dir / "bin" / "macmon") + + # Ensure the binary exists and is executable + if not os.path.isfile(path): + raise MacMonError(f"Binary not found at: {path}") + + # Make the binary executable if it's not already + if not os.access(path, os.X_OK): + try: + os.chmod(path, 0o755) # rwx r-x r-x + except OSError as e: + raise MacMonError(f"Failed to make binary executable: {e}") from e + + return path + + +# --------------------------------------------------------------------------- +# Pydantic metric structures +# --------------------------------------------------------------------------- + + +class MemoryMetrics(BaseModel): + """Memory-related metrics returned by macmon.""" + + ram_total: Optional[int] = None + ram_usage: Optional[int] = None + swap_total: Optional[int] = None + swap_usage: Optional[int] = None + + model_config = ConfigDict(extra="ignore") + + +class TempMetrics(BaseModel): + """Temperature-related metrics returned by macmon.""" + + cpu_temp_avg: Optional[float] = None + gpu_temp_avg: Optional[float] = None + + model_config = ConfigDict(extra="ignore") + + +class Metrics(BaseModel): + """Complete set of metrics returned by *macmon* binary. + + All fields are optional to allow for partial output from the binary. + Unknown fields are ignored for forward-compatibility. + """ + + all_power: Optional[float] = None + ane_power: Optional[float] = None + cpu_power: Optional[float] = None + ecpu_usage: Optional[Tuple[int, float]] = None + gpu_power: Optional[float] = None + gpu_ram_power: Optional[float] = None + gpu_usage: Optional[Tuple[int, float]] = None + memory: Optional[MemoryMetrics] = None + pcpu_usage: Optional[Tuple[int, float]] = None + ram_power: Optional[float] = None + sys_power: Optional[float] = None + temp: Optional[TempMetrics] = None + timestamp: Optional[str] = None + + model_config = ConfigDict(extra="ignore") + + +# --------------------------------------------------------------------------- +# Synchronous helper +# --------------------------------------------------------------------------- + + +def get_metrics(binary_path: Optional[str] = None) -> Metrics: + """ + Run the binary and return the metrics as a Python dictionary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + A mapping containing system metrics. + + Raises: + MacMonError: If there's an error running the binary. + """ + path = _get_binary_path(binary_path) + + try: + # Run the binary with the argument -s 1 and capture its output + result = subprocess.run( + [path, "pipe", "-s", "1"], capture_output=True, text=True, check=True + ) + + return Metrics.model_validate_json(result.stdout) + + except subprocess.CalledProcessError as e: + raise MacMonError(f"Error running binary: {e.stderr}") from e # type: ignore + except ValidationError as e: + raise MacMonError(f"Error parsing JSON output: {e}") from e + + +async def get_metrics_async(binary_path: Optional[str] = None) -> Metrics: + """ + Asynchronously run the binary and return the metrics as a Python dictionary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + A mapping containing system metrics. + + Raises: + MacMonError: If there's an error running the binary. + """ + path = _get_binary_path(binary_path) + + try: + proc = await asyncio.create_subprocess_exec( + path, + "pipe", + "-s", + "1", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise MacMonError(f"Error running binary: {stderr.decode().strip()}") + + return Metrics.model_validate_json(stdout.decode().strip()) + + except ValidationError as e: + raise MacMonError(f"Error parsing JSON output: {e}") from e diff --git a/worker/utils/profile.py b/worker/utils/profile.py new file mode 100644 index 00000000..b152e00c --- /dev/null +++ b/worker/utils/profile.py @@ -0,0 +1,92 @@ +import asyncio +import platform +from typing import Any, Callable, Coroutine + +from shared.types.profiling import ( + MemoryPerformanceProfile, + NodePerformanceProfile, + SystemPerformanceProfile, +) +from worker.utils.macmon.macmon import ( + Metrics, +) +from worker.utils.macmon.macmon import ( + get_metrics_async as macmon_get_metrics_async, +) + +# from exo.infra.event_log import EventLog +# from exo.app.config import ResourceMonitorConfig +# from exo.utils.mlx.mlx_utils import profile_flops_fp16 + + +async def get_metrics_async() -> Metrics: + """Return detailed Metrics on macOS or a minimal fallback elsewhere. + + The *Metrics* schema comes from ``utils.macmon.macmon``; on non-macOS systems we + fill only the ``memory`` sub-structure so downstream code can still access + ``metrics.memory.ram_total`` & ``ram_usage``. + """ + + if platform.system().lower() == "darwin": + return await macmon_get_metrics_async() + return Metrics() + + +async def start_polling_node_metrics( + callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]], +): + poll_interval_s = 1.0 + while True: + try: + # Gather metrics & system info with a timeout on each call + metrics = await get_metrics_async() + + # Extract memory totals from metrics + total_mem = ( + metrics.memory.ram_total + if metrics.memory is not None and metrics.memory.ram_total is not None + else 0 + ) + used_mem = ( + metrics.memory.ram_usage + if metrics.memory is not None and metrics.memory.ram_usage is not None + else 0 + ) + + # Run heavy FLOPs profiling only if enough time has elapsed + + await callback( + NodePerformanceProfile( + model_id=platform.machine(), + chip_id=platform.processor(), + memory=MemoryPerformanceProfile( + ram_total=total_mem, + ram_available=total_mem - used_mem, + swap_total=metrics.memory.swap_total + if metrics.memory is not None + and metrics.memory.swap_total is not None + else 0, + swap_available=metrics.memory.swap_total + - metrics.memory.swap_usage + if metrics.memory is not None + and metrics.memory.swap_usage is not None + and metrics.memory.swap_total is not None + else 0, + ), + network_interfaces=[], + system=SystemPerformanceProfile( + flops_fp16=0, + ), + ) + ) + + except asyncio.TimeoutError: + # One of the operations took too long; skip this iteration but keep the loop alive. + print( + "[resource_monitor] Operation timed out after 30s, skipping this cycle." + ) + except Exception as e: + # Catch-all to ensure the monitor keeps running. + print(f"[resource_monitor] Encountered error: {e}") + finally: + await asyncio.sleep(poll_interval_s)