add resource monitor

Co-authored-by: Gelu Vrabie <gelu@exolabs.net>
This commit is contained in:
Gelu Vrabie
2025-07-25 13:10:53 +01:00
committed by GitHub
parent a241c92dd1
commit 9be08ec7dd
14 changed files with 470 additions and 2 deletions

1
.gitattributes vendored Normal file
View File

@@ -0,0 +1 @@
worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text

3
.githooks/post-checkout Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS post-checkout hook: forwards the hook arguments to `git lfs post-checkout`.
# If git-lfs is not on PATH, explain how to disable the hook on stderr and exit 2.
if ! command -v git-lfs >/dev/null 2>&1; then
    printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."
    exit 2
fi
git lfs post-checkout "$@"

3
.githooks/post-commit Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS post-commit hook: forwards the hook arguments to `git lfs post-commit`.
# If git-lfs is not on PATH, explain how to disable the hook on stderr and exit 2.
if ! command -v git-lfs >/dev/null 2>&1; then
    printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."
    exit 2
fi
git lfs post-commit "$@"

3
.githooks/post-merge Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS post-merge hook: forwards the hook arguments to `git lfs post-merge`.
# If git-lfs is not on PATH, explain how to disable the hook on stderr and exit 2.
if ! command -v git-lfs >/dev/null 2>&1; then
    printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."
    exit 2
fi
git lfs post-merge "$@"

3
.githooks/pre-push Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/sh
# Git LFS pre-push hook: forwards the hook arguments to `git lfs pre-push`.
# If git-lfs is not on PATH, explain how to disable the hook on stderr and exit 2.
if ! command -v git-lfs >/dev/null 2>&1; then
    printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."
    exit 2
fi
git lfs pre-push "$@"

View File

@@ -1 +0,0 @@

View File

@@ -15,10 +15,12 @@ from shared.types.events import (
ChunkGenerated,
Event,
InstanceId,
NodePerformanceMeasured,
RunnerDeleted,
RunnerStatusUpdated,
TaskStateUpdated,
)
from shared.types.profiling import NodePerformanceProfile
from shared.types.state import State
from shared.types.tasks import TaskStatus
from shared.types.worker.common import RunnerId
@@ -52,6 +54,7 @@ from shared.types.worker.runners import (
from shared.types.worker.shards import ShardMetadata
from worker.download.download_utils import build_model_path
from worker.runner.runner_supervisor import RunnerSupervisor
from worker.utils.profile import start_polling_node_metrics
def get_node_id() -> NodeId:
@@ -482,7 +485,6 @@ class Worker:
await asyncio.sleep(0.01)
# TODO: Handle resource monitoring (write-only)
async def main():
node_id: NodeId = get_node_id()
@@ -491,6 +493,13 @@ async def main():
event_log_manager = EventLogManager(EventLogConfig(), logger)
await event_log_manager.initialize()
# TODO: add profiling etc to resource monitor
async def resource_monitor_callback(node_performance_profile: NodePerformanceProfile) -> None:
    """Append one NodePerformanceMeasured event for this node to the worker event log."""
    await event_log_manager.worker_events.append_events(
        [NodePerformanceMeasured(node_id=node_id, node_profile=node_performance_profile)], origin=node_id
    )
# Keep a strong reference to the polling task: the event loop only holds weak
# references to tasks, so a task whose result is discarded can be garbage
# collected before it finishes.
_resource_monitor_task = asyncio.create_task(start_polling_node_metrics(callback=resource_monitor_callback))
worker = Worker(node_id, logger, event_log_manager.worker_events)
await worker.run()

3
worker/utils/__init__.py Normal file
View File

@@ -0,0 +1,3 @@
from .profile import start_polling_node_metrics
__all__ = ["start_polling_node_metrics"]

BIN
worker/utils/macmon/.DS_Store vendored Normal file
View File

Binary file not shown.

View File

@@ -0,0 +1,3 @@
from .macmon import MacMonError, get_metrics, get_metrics_async
__all__ = ['get_metrics', 'get_metrics_async', 'MacMonError']

View File

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2024 vladkens
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@@ -0,0 +1,154 @@
# `macmon` Mac Monitor
<div align="center">
Sudoless performance monitoring CLI tool for Apple Silicon processors.
[<img src="https://badges.ws/github/assets-dl/vladkens/macmon" />](https://github.com/vladkens/macmon/releases)
[<img src="https://badges.ws/github/release/vladkens/macmon" />](https://github.com/vladkens/macmon/releases)
[<img src="https://badges.ws/github/license/vladkens/macmon" />](https://github.com/vladkens/macmon/blob/main/LICENSE)
[<img src="https://badges.ws/badge/-/buy%20me%20a%20coffee/ff813f?icon=buymeacoffee&label" alt="donate" />](https://buymeacoffee.com/vladkens)
</div>
<div align="center">
<img src="https://github.com/vladkens/macmon/blob/assets/macmon.png?raw=true" alt="preview" />
</div>
## Motivation
Apple Silicon processors don't provide an easy way to see live power consumption. I was interested in this information while testing local LLM models. `asitop` is a nice and simple TUI to quickly see current metrics, but it reads data from `powermetrics` and requires root privileges. `macmon` uses a private macOS API to gather metrics (essentially the same as `powermetrics`) but runs without sudo. 🎉
## 🌟 Features
- 🚫 Works without sudo
- ⚡ Real-time CPU / GPU / ANE power usage
- 📊 CPU utilization per cluster
- 💾 RAM / Swap usage
- 📈 Historical charts + avg / max values
- 🌡️ Average CPU / GPU temperature
- 🎨 Switchable colors (6 variants)
- 🪟 Can be rendered in a small window
- 🦀 Written in Rust
## 🍺 Install via Homebrew
You can install [`macmon`](https://formulae.brew.sh/formula/macmon) using [brew](https://brew.sh/):
```sh
$ brew install macmon
```
## 🖥️ Install via MacPorts
You can also install [`macmon`](https://ports.macports.org/port/macmon/) using [MacPorts](https://macports.org/):
```sh
$ sudo port install macmon
```
## 📦 Install from source
1. Install [Rust toolchain](https://www.rust-lang.org/tools/install)
2. Clone the repo:
```sh
git clone https://github.com/vladkens/macmon.git && cd macmon
```
3. Build and run:
```sh
cargo run -r
```
4. (Optional) Copy the binary to `/usr/local/bin`:
```sh
sudo cp target/release/macmon /usr/local/bin
```
## 🚀 Usage
```sh
Usage: macmon [OPTIONS] [COMMAND]
Commands:
pipe Output metrics in JSON format
debug Print debug information
help Print this message or the help of the given subcommand(s)
Options:
-i, --interval <INTERVAL> Update interval in milliseconds [default: 1000]
-h, --help Print help
-V, --version Print version
Controls:
c - change color
v - switch charts view: gauge / sparkline
q - quit
```
## 🚰 Piping
You can use the pipe subcommand to output metrics in JSON format, which is suitable for piping into other tools or scripts. For example:
```sh
macmon pipe | jq
```
This command runs `macmon` in "pipe" mode and pipes its output to `jq` for pretty-printing.
You can also specify the number of samples to run using `-s` or `--samples` parameter (default: `0`, which runs indefinitely), and set update interval in milliseconds using the `-i` or `--interval` parameter (default: `1000` ms). For example:
```sh
macmon pipe -s 10 -i 500 | jq
```
This will collect 10 samples with an update interval of 500 milliseconds.
### Output
```jsonc
{
"timestamp": "2025-02-24T20:38:15.427569+00:00",
"temp": {
"cpu_temp_avg": 43.73614, // Celsius
"gpu_temp_avg": 36.95167 // Celsius
},
"memory": {
"ram_total": 25769803776, // Bytes
"ram_usage": 20985479168, // Bytes
"swap_total": 4294967296, // Bytes
"swap_usage": 2602434560 // Bytes
},
"ecpu_usage": [1181, 0.082656614], // (Frequency MHz, Usage %)
"pcpu_usage": [1974, 0.015181795], // (Frequency MHz, Usage %)
"gpu_usage": [461, 0.021497859], // (Frequency MHz, Usage %)
"cpu_power": 0.20486385, // Watts
"gpu_power": 0.017451683, // Watts
"ane_power": 0.0, // Watts
"all_power": 0.22231553, // Watts
"sys_power": 5.876533, // Watts
"ram_power": 0.11635789, // Watts
"gpu_ram_power": 0.0009615385 // Watts (not sure what it means)
}
```
## 🤝 Contributing
We love contributions! Whether you have ideas, suggestions, or bug reports, feel free to open an issue or submit a pull request. Your input is essential in helping us improve `macmon` 💪
## 📝 License
`macmon` is distributed under the MIT License. For more details, check out the LICENSE.
## 🔍 See also
- [tlkh/asitop](https://github.com/tlkh/asitop) Original tool. Python, requires sudo.
- [dehydratedpotato/socpowerbud](https://github.com/dehydratedpotato/socpowerbud) ObjectiveC, sudoless, no TUI.
- [op06072/NeoAsitop](https://github.com/op06072/NeoAsitop) Swift, sudoless.
- [graelo/pumas](https://github.com/graelo/pumas) Rust, requires sudo.
- [context-labs/mactop](https://github.com/context-labs/mactop) Go, requires sudo.
---
*PS: One More Thing... Remember, monitoring your Mac's performance with `macmon` is like having a personal trainer for your processor — keeping those cores in shape! 💪*

View File

@@ -0,0 +1,174 @@
import asyncio
import os
import platform
import subprocess
from pathlib import Path
from typing import Optional, Tuple
from pydantic import BaseModel, ConfigDict, ValidationError
class MacMonError(Exception):
    """Signals a failure while locating, running, or parsing output of macmon."""
def _get_binary_path(binary_path: Optional[str] = None) -> str:
    """
    Resolve the macmon executable to run and make sure it can be executed.

    Args:
        binary_path: Explicit path to a macmon binary. When omitted (or empty),
            the copy at ``bin/macmon`` next to this module is used.

    Returns:
        The path of the binary as a string.

    Raises:
        MacMonError: If the host is not macOS on an ARM machine, the binary is
            missing, or it cannot be marked executable.
    """
    # Platform gate: macmon only runs on macOS with an Apple Silicon (ARM) chip.
    os_name = platform.system().lower()
    arch = platform.machine().lower()
    is_apple_silicon = os_name == "darwin" and any(
        tag in arch for tag in ("arm", "m1", "m2")
    )
    if not is_apple_silicon:
        raise MacMonError("MacMon only supports macOS with Apple Silicon (ARM) chips")

    if binary_path:
        resolved = binary_path
    else:
        # Fall back to the binary bundled alongside this module.
        resolved = str(Path(os.path.abspath(__file__)).parent / "bin" / "macmon")

    if not os.path.isfile(resolved):
        raise MacMonError(f"Binary not found at: {resolved}")

    if os.access(resolved, os.X_OK):
        return resolved

    # The file exists but lacks the execute bit; try to set it.
    try:
        os.chmod(resolved, 0o755)  # rwx r-x r-x
    except OSError as e:
        raise MacMonError(f"Failed to make binary executable: {e}") from e
    return resolved
# ---------------------------------------------------------------------------
# Pydantic metric structures
# ---------------------------------------------------------------------------
class MemoryMetrics(BaseModel):
    """RAM and swap readings from the ``memory`` object of macmon's JSON output.

    Every field defaults to ``None`` so a partial payload still validates;
    keys that are not declared here are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # Values are byte counts according to the macmon README sample output.
    ram_total: Optional[int] = None
    ram_usage: Optional[int] = None
    swap_total: Optional[int] = None
    swap_usage: Optional[int] = None
class TempMetrics(BaseModel):
    """Average temperature readings from the ``temp`` object of macmon's JSON output.

    Both fields default to ``None``; keys that are not declared here are ignored.
    """

    model_config = ConfigDict(extra="ignore")

    # Values are in degrees Celsius according to the macmon README sample output.
    cpu_temp_avg: Optional[float] = None
    gpu_temp_avg: Optional[float] = None
class Metrics(BaseModel):
    """One sample of the JSON document emitted by ``macmon pipe``.

    Each field defaults to ``None`` so partial output from the binary still
    validates, and keys not declared here are ignored so newer macmon
    versions that add fields keep working.
    """

    model_config = ConfigDict(extra="ignore")

    # Units below follow the macmon README sample output: ``*_power`` values
    # are Watts and ``*_usage`` pairs are (frequency in MHz, usage fraction).
    all_power: Optional[float] = None
    ane_power: Optional[float] = None
    cpu_power: Optional[float] = None
    ecpu_usage: Optional[Tuple[int, float]] = None
    gpu_power: Optional[float] = None
    gpu_ram_power: Optional[float] = None
    gpu_usage: Optional[Tuple[int, float]] = None
    memory: Optional[MemoryMetrics] = None
    pcpu_usage: Optional[Tuple[int, float]] = None
    ram_power: Optional[float] = None
    sys_power: Optional[float] = None
    temp: Optional[TempMetrics] = None
    timestamp: Optional[str] = None
# ---------------------------------------------------------------------------
# Synchronous helper
# ---------------------------------------------------------------------------
def get_metrics(binary_path: Optional[str] = None) -> Metrics:
    """
    Run macmon once, blocking until it exits, and return the parsed sample.

    Args:
        binary_path: Optional path to the binary. If not provided, will use the bundled binary.

    Returns:
        A ``Metrics`` model built from the binary's JSON output.

    Raises:
        MacMonError: If the binary cannot be located or launched, exits with a
            non-zero status, or prints output that does not validate as ``Metrics``.
    """
    path = _get_binary_path(binary_path)
    try:
        # "pipe -s 1" makes macmon print a single JSON sample and exit.
        result = subprocess.run(
            [path, "pipe", "-s", "1"], capture_output=True, text=True, check=True
        )
        return Metrics.model_validate_json(result.stdout)
    except subprocess.CalledProcessError as e:
        raise MacMonError(f"Error running binary: {e.stderr}") from e  # type: ignore
    except OSError as e:
        # The file exists but could not be executed (for example it is not a
        # real executable); surface it as the documented error type.
        raise MacMonError(f"Error running binary: {e}") from e
    except ValidationError as e:
        raise MacMonError(f"Error parsing JSON output: {e}") from e
async def get_metrics_async(binary_path: Optional[str] = None) -> Metrics:
    """
    Run macmon once without blocking the event loop and return the parsed sample.

    Args:
        binary_path: Optional path to the binary. If not provided, will use the bundled binary.

    Returns:
        A ``Metrics`` model built from the binary's JSON output.

    Raises:
        MacMonError: If the binary cannot be located or launched, exits with a
            non-zero status, or prints output that does not validate as ``Metrics``.
    """
    path = _get_binary_path(binary_path)
    try:
        # "pipe -s 1" makes macmon print a single JSON sample and exit.
        proc = await asyncio.create_subprocess_exec(
            path,
            "pipe",
            "-s",
            "1",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
    except OSError as e:
        # The file exists but could not be executed; surface it as the
        # documented error type instead of leaking a raw OSError.
        raise MacMonError(f"Error running binary: {e}") from e

    try:
        stdout, stderr = await proc.communicate()
    finally:
        # If we were cancelled (or timed out by the caller) while waiting,
        # the child is still running; do not leave it behind.
        if proc.returncode is None:
            try:
                proc.kill()
            except ProcessLookupError:
                pass

    if proc.returncode != 0:
        # errors="replace" so undecodable stderr bytes cannot mask the failure.
        raise MacMonError(
            f"Error running binary: {stderr.decode(errors='replace').strip()}"
        )
    try:
        return Metrics.model_validate_json(stdout.decode().strip())
    except ValidationError as e:
        raise MacMonError(f"Error parsing JSON output: {e}") from e

92
worker/utils/profile.py Normal file
View File

@@ -0,0 +1,92 @@
import asyncio
import platform
from typing import Any, Callable, Coroutine
from shared.types.profiling import (
MemoryPerformanceProfile,
NodePerformanceProfile,
SystemPerformanceProfile,
)
from worker.utils.macmon.macmon import (
Metrics,
)
from worker.utils.macmon.macmon import (
get_metrics_async as macmon_get_metrics_async,
)
# from exo.infra.event_log import EventLog
# from exo.app.config import ResourceMonitorConfig
# from exo.utils.mlx.mlx_utils import profile_flops_fp16
async def get_metrics_async() -> Metrics:
    """Return macmon metrics on macOS, or an empty ``Metrics`` elsewhere.

    On any other operating system no measurement is attempted: the returned
    ``Metrics`` is default-constructed, so every field (including ``memory``)
    is ``None`` and callers must handle missing values.
    """
    running_on_macos = platform.system().lower() == "darwin"
    if not running_on_macos:
        return Metrics()
    return await macmon_get_metrics_async()
def _memory_profile_from_metrics(metrics: Metrics) -> MemoryPerformanceProfile:
    """Map the memory section of *metrics* to a MemoryPerformanceProfile, using 0 for missing readings."""
    mem = metrics.memory
    if mem is None:
        return MemoryPerformanceProfile(
            ram_total=0,
            ram_available=0,
            swap_total=0,
            swap_available=0,
        )

    ram_total = mem.ram_total if mem.ram_total is not None else 0
    ram_used = mem.ram_usage if mem.ram_usage is not None else 0
    swap_total = mem.swap_total if mem.swap_total is not None else 0
    # Swap availability is only reported when both total and usage are known.
    if mem.swap_total is not None and mem.swap_usage is not None:
        swap_available = mem.swap_total - mem.swap_usage
    else:
        swap_available = 0

    return MemoryPerformanceProfile(
        ram_total=ram_total,
        ram_available=ram_total - ram_used,
        swap_total=swap_total,
        swap_available=swap_available,
    )


async def start_polling_node_metrics(
    callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]],
    poll_interval_s: float = 1.0,
    metrics_timeout_s: float = 30.0,
) -> None:
    """Sample node metrics forever and hand each sample to *callback*.

    The loop never returns on its own; cancel the task running it to stop
    polling. Errors raised while sampling or inside *callback* are printed
    and the loop continues with the next cycle.

    Args:
        callback: Awaited once per cycle with the freshly built
            ``NodePerformanceProfile``.
        poll_interval_s: Seconds to sleep between cycles.
        metrics_timeout_s: Maximum seconds to wait for one metrics sample
            before the cycle is skipped.
    """
    # Host identifiers are looked up once rather than on every cycle.
    model_id = platform.machine()
    chip_id = platform.processor()

    while True:
        try:
            # Bound the sampling call so a hung metrics collection cannot
            # stall the monitor indefinitely.
            metrics = await asyncio.wait_for(
                get_metrics_async(), timeout=metrics_timeout_s
            )
            await callback(
                NodePerformanceProfile(
                    model_id=model_id,
                    chip_id=chip_id,
                    memory=_memory_profile_from_metrics(metrics),
                    # Network interfaces and FLOPs are not measured here;
                    # placeholder values are reported.
                    network_interfaces=[],
                    system=SystemPerformanceProfile(
                        flops_fp16=0,
                    ),
                )
            )
        except asyncio.TimeoutError:
            # One of the operations took too long; skip this iteration but keep the loop alive.
            print(
                f"[resource_monitor] Operation timed out after {metrics_timeout_s:g}s, skipping this cycle."
            )
        except Exception as e:
            # Catch-all to ensure the monitor keeps running.
            print(f"[resource_monitor] Encountered error: {e}")
        # Sleep outside of any ``finally`` so that task cancellation is not
        # delayed by a full poll interval.
        await asyncio.sleep(poll_interval_s)