diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000..c2b5fa9b --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text diff --git a/.githooks/post-checkout b/.githooks/post-checkout new file mode 100755 index 00000000..5abf8ed9 --- /dev/null +++ b/.githooks/post-checkout @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-checkout "$@" diff --git a/.githooks/post-commit b/.githooks/post-commit new file mode 100755 index 00000000..b8b76c2c --- /dev/null +++ b/.githooks/post-commit @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-commit "$@" diff --git a/.githooks/post-merge b/.githooks/post-merge new file mode 100755 index 00000000..726f9098 --- /dev/null +++ b/.githooks/post-merge @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs post-merge "$@" diff --git a/.githooks/pre-push b/.githooks/pre-push new file mode 100755 index 00000000..5f26dc45 --- /dev/null +++ b/.githooks/pre-push @@ -0,0 +1,3 @@ +#!/bin/sh +command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; } +git lfs pre-push "$@" diff --git a/worker/__init__.py b/worker/__init__.py index 0519ecba..e69de29b 100644 --- a/worker/__init__.py +++ b/worker/__init__.py @@ -1 +0,0 @@ - \ No newline at end of file diff --git a/worker/main.py b/worker/main.py index 8a078c6a..3c4a5c45 100644 --- a/worker/main.py +++ b/worker/main.py @@ -15,10 +15,12 @@ from shared.types.events import ( ChunkGenerated, Event, InstanceId, + NodePerformanceMeasured, RunnerDeleted, RunnerStatusUpdated, TaskStateUpdated, ) +from shared.types.profiling import NodePerformanceProfile from shared.types.state import State from shared.types.tasks import TaskStatus from shared.types.worker.common import RunnerId @@ -52,6 +54,7 @@ from shared.types.worker.runners import ( from shared.types.worker.shards import ShardMetadata from worker.download.download_utils import build_model_path from worker.runner.runner_supervisor import RunnerSupervisor +from worker.utils.profile import start_polling_node_metrics def get_node_id() -> NodeId: @@ -482,7 +485,6 @@ class Worker: await asyncio.sleep(0.01) - # TODO: Handle resource monitoring (write-only) async def main(): node_id: NodeId = get_node_id() @@ -490,6 +492,13 @@ async def main(): event_log_manager = EventLogManager(EventLogConfig(), logger) await event_log_manager.initialize() + + # TODO: add profiling etc to resource monitor + async def resource_monitor_callback(node_performance_profile: NodePerformanceProfile) -> None: + await event_log_manager.worker_events.append_events( + [NodePerformanceMeasured(node_id=node_id, node_profile=node_performance_profile)], origin=node_id + ) + asyncio.create_task(start_polling_node_metrics(callback=resource_monitor_callback)) worker = Worker(node_id, logger, event_log_manager.worker_events) diff --git a/worker/utils/__init__.py b/worker/utils/__init__.py new file mode 100644 index 00000000..386a613c --- /dev/null +++ b/worker/utils/__init__.py @@ -0,0 +1,3 @@ +from .profile import start_polling_node_metrics + +__all__ = ["start_polling_node_metrics"] diff --git a/worker/utils/macmon/.DS_Store b/worker/utils/macmon/.DS_Store new file mode 100644 index 00000000..a3585876 Binary files /dev/null and b/worker/utils/macmon/.DS_Store differ diff --git a/worker/utils/macmon/__init__.py b/worker/utils/macmon/__init__.py new file mode 100644 index 00000000..ad950d89 --- /dev/null +++ b/worker/utils/macmon/__init__.py @@ -0,0 +1,3 @@ +from .macmon import MacMonError, get_metrics, get_metrics_async + +__all__ = ['get_metrics', 'get_metrics_async', 'MacMonError'] \ No newline at end of file diff --git a/worker/utils/macmon/bin/LICENSE.txt b/worker/utils/macmon/bin/LICENSE.txt new file mode 100644 index 00000000..4659b63d --- /dev/null +++ b/worker/utils/macmon/bin/LICENSE.txt @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2024 vladkens + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/worker/utils/macmon/bin/readme.md b/worker/utils/macmon/bin/readme.md new file mode 100644 index 00000000..9c44eed7 --- /dev/null +++ b/worker/utils/macmon/bin/readme.md @@ -0,0 +1,154 @@ +# `macmon` – Mac Monitor + +
+ +Sudoless performance monitoring CLI tool for Apple Silicon processors. + +[](https://github.com/vladkens/macmon/releases) +[](https://github.com/vladkens/macmon/releases) +[](https://github.com/vladkens/macmon/blob/main/LICENSE) +[donate](https://buymeacoffee.com/vladkens) + +
+ +
+ preview +
+ +## Motivation + +Apple Silicon processors don't provide an easy way to see live power consumption. I was interested in this information while testing local LLM models. `asitop` is a nice and simple TUI to quickly see current metrics, but it reads data from `powermetrics` and requires root privileges. `macmon` uses a private macOS API to gather metrics (essentially the same as `powermetrics`) but runs without sudo. 🎉 + +## 🌟 Features + +- 🚫 Works without sudo +- ⚡ Real-time CPU / GPU / ANE power usage +- 📊 CPU utilization per cluster +- 💾 RAM / Swap usage +- 📈 Historical charts + avg / max values +- 🌡️ Average CPU / GPU temperature +- 🎨 Switchable colors (6 variants) +- 🪟 Can be rendered in a small window +- 🦀 Written in Rust + +## 🍺 Install via Homebrew + +You can install [`macmon`](https://formulae.brew.sh/formula/macmon) using [brew](https://brew.sh/): + +```sh +$ brew install macmon +``` + +## 🖥️ Install via MacPorts + +You can also install [`macmon`](https://ports.macports.org/port/macmon/) using [MacPorts](https://macports.org/): + +```sh +$ sudo port install macmon +``` + +## 📦 Install from source + +1. Install [Rust toolchain](https://www.rust-lang.org/tools/install) + +2. Clone the repo: + +```sh +git clone https://github.com/vladkens/macmon.git && cd macmon +``` + +3. Build and run: + +```sh +cargo run -r +``` + +4. (Optionally) Binary can be moved to bin folder: + +```sh +sudo cp target/release/macmon /usr/local/bin +``` + +## 🚀 Usage + +```sh +Usage: macmon [OPTIONS] [COMMAND] + +Commands: + pipe Output metrics in JSON format + debug Print debug information + help Print this message or the help of the given subcommand(s) + +Options: + -i, --interval Update interval in milliseconds [default: 1000] + -h, --help Print help + -V, --version Print version + +Controls: + c - change color + v - switch charts view: gauge / sparkline + q - quit +``` + +## 🚰 Piping + +You can use the pipe subcommand to output metrics in JSON format, which is suitable for piping into other tools or scripts. For example: + +```sh +macmon pipe | jq +``` + +This command runs `macmon` in "pipe" mode and navigate output to `jq` for pretty-printing. + +You can also specify the number of samples to run using `-s` or `--samples` parameter (default: `0`, which runs indefinitely), and set update interval in milliseconds using the `-i` or `--interval` parameter (default: `1000` ms). For example: + +```sh +macmon pipe -s 10 -i 500 | jq +``` + +This will collect 10 samples with an update interval of 500 milliseconds. + +### Output + +```jsonc +{ + "timestamp": "2025-02-24T20:38:15.427569+00:00", + "temp": { + "cpu_temp_avg": 43.73614, // Celsius + "gpu_temp_avg": 36.95167 // Celsius + }, + "memory": { + "ram_total": 25769803776, // Bytes + "ram_usage": 20985479168, // Bytes + "swap_total": 4294967296, // Bytes + "swap_usage": 2602434560 // Bytes + }, + "ecpu_usage": [1181, 0.082656614], // (Frequency MHz, Usage %) + "pcpu_usage": [1974, 0.015181795], // (Frequency MHz, Usage %) + "gpu_usage": [461, 0.021497859], // (Frequency MHz, Usage %) + "cpu_power": 0.20486385, // Watts + "gpu_power": 0.017451683, // Watts + "ane_power": 0.0, // Watts + "all_power": 0.22231553, // Watts + "sys_power": 5.876533, // Watts + "ram_power": 0.11635789, // Watts + "gpu_ram_power": 0.0009615385 // Watts (not sure what it means) +} +``` + +## 🤝 Contributing +We love contributions! Whether you have ideas, suggestions, or bug reports, feel free to open an issue or submit a pull request. Your input is essential in helping us improve `macmon` 💪 + +## 📝 License +`macmon` is distributed under the MIT License. For more details, check out the LICENSE. + +## 🔍 See also +- [tlkh/asitop](https://github.com/tlkh/asitop) – Original tool. Python, requires sudo. +- [dehydratedpotato/socpowerbud](https://github.com/dehydratedpotato/socpowerbud) – ObjectiveC, sudoless, no TUI. +- [op06072/NeoAsitop](https://github.com/op06072/NeoAsitop) – Swift, sudoless. +- [graelo/pumas](https://github.com/graelo/pumas) – Rust, requires sudo. +- [context-labs/mactop](https://github.com/context-labs/mactop) – Go, requires sudo. + +--- + +*PS: One More Thing... Remember, monitoring your Mac's performance with `macmon` is like having a personal trainer for your processor — keeping those cores in shape! 💪* diff --git a/worker/utils/macmon/macmon.py b/worker/utils/macmon/macmon.py new file mode 100644 index 00000000..26b18416 --- /dev/null +++ b/worker/utils/macmon/macmon.py @@ -0,0 +1,174 @@ +import asyncio +import os +import platform +import subprocess +from pathlib import Path +from typing import Optional, Tuple + +from pydantic import BaseModel, ConfigDict, ValidationError + + +class MacMonError(Exception): + """Exception raised for errors in the MacMon functions.""" + + +def _get_binary_path(binary_path: Optional[str] = None) -> str: + """ + Get the path to the macmon binary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + The path to the macmon binary. + + Raises: + MacMonError: If the binary doesn't exist or can't be made executable. + """ + # Check for macOS with ARM chip + system = platform.system().lower() + machine = platform.machine().lower() + + if system != "darwin" or not ( + "arm" in machine or "m1" in machine or "m2" in machine + ): + raise MacMonError("MacMon only supports macOS with Apple Silicon (ARM) chips") + + if binary_path: + path = binary_path + else: + # Get the directory where this module is located + module_dir = Path(os.path.dirname(os.path.abspath(__file__))) + path = str(module_dir / "bin" / "macmon") + + # Ensure the binary exists and is executable + if not os.path.isfile(path): + raise MacMonError(f"Binary not found at: {path}") + + # Make the binary executable if it's not already + if not os.access(path, os.X_OK): + try: + os.chmod(path, 0o755) # rwx r-x r-x + except OSError as e: + raise MacMonError(f"Failed to make binary executable: {e}") from e + + return path + + +# --------------------------------------------------------------------------- +# Pydantic metric structures +# --------------------------------------------------------------------------- + + +class MemoryMetrics(BaseModel): + """Memory-related metrics returned by macmon.""" + + ram_total: Optional[int] = None + ram_usage: Optional[int] = None + swap_total: Optional[int] = None + swap_usage: Optional[int] = None + + model_config = ConfigDict(extra="ignore") + + +class TempMetrics(BaseModel): + """Temperature-related metrics returned by macmon.""" + + cpu_temp_avg: Optional[float] = None + gpu_temp_avg: Optional[float] = None + + model_config = ConfigDict(extra="ignore") + + +class Metrics(BaseModel): + """Complete set of metrics returned by *macmon* binary. + + All fields are optional to allow for partial output from the binary. + Unknown fields are ignored for forward-compatibility. + """ + + all_power: Optional[float] = None + ane_power: Optional[float] = None + cpu_power: Optional[float] = None + ecpu_usage: Optional[Tuple[int, float]] = None + gpu_power: Optional[float] = None + gpu_ram_power: Optional[float] = None + gpu_usage: Optional[Tuple[int, float]] = None + memory: Optional[MemoryMetrics] = None + pcpu_usage: Optional[Tuple[int, float]] = None + ram_power: Optional[float] = None + sys_power: Optional[float] = None + temp: Optional[TempMetrics] = None + timestamp: Optional[str] = None + + model_config = ConfigDict(extra="ignore") + + +# --------------------------------------------------------------------------- +# Synchronous helper +# --------------------------------------------------------------------------- + + +def get_metrics(binary_path: Optional[str] = None) -> Metrics: + """ + Run the binary and return the metrics as a Python dictionary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + A mapping containing system metrics. + + Raises: + MacMonError: If there's an error running the binary. + """ + path = _get_binary_path(binary_path) + + try: + # Run the binary with the argument -s 1 and capture its output + result = subprocess.run( + [path, "pipe", "-s", "1"], capture_output=True, text=True, check=True + ) + + return Metrics.model_validate_json(result.stdout) + + except subprocess.CalledProcessError as e: + raise MacMonError(f"Error running binary: {e.stderr}") from e # type: ignore + except ValidationError as e: + raise MacMonError(f"Error parsing JSON output: {e}") from e + + +async def get_metrics_async(binary_path: Optional[str] = None) -> Metrics: + """ + Asynchronously run the binary and return the metrics as a Python dictionary. + + Args: + binary_path: Optional path to the binary. If not provided, will use the bundled binary. + + Returns: + A mapping containing system metrics. + + Raises: + MacMonError: If there's an error running the binary. + """ + path = _get_binary_path(binary_path) + + try: + proc = await asyncio.create_subprocess_exec( + path, + "pipe", + "-s", + "1", + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + ) + + stdout, stderr = await proc.communicate() + + if proc.returncode != 0: + raise MacMonError(f"Error running binary: {stderr.decode().strip()}") + + return Metrics.model_validate_json(stdout.decode().strip()) + + except ValidationError as e: + raise MacMonError(f"Error parsing JSON output: {e}") from e diff --git a/worker/utils/profile.py b/worker/utils/profile.py new file mode 100644 index 00000000..b152e00c --- /dev/null +++ b/worker/utils/profile.py @@ -0,0 +1,92 @@ +import asyncio +import platform +from typing import Any, Callable, Coroutine + +from shared.types.profiling import ( + MemoryPerformanceProfile, + NodePerformanceProfile, + SystemPerformanceProfile, +) +from worker.utils.macmon.macmon import ( + Metrics, +) +from worker.utils.macmon.macmon import ( + get_metrics_async as macmon_get_metrics_async, +) + +# from exo.infra.event_log import EventLog +# from exo.app.config import ResourceMonitorConfig +# from exo.utils.mlx.mlx_utils import profile_flops_fp16 + + +async def get_metrics_async() -> Metrics: + """Return detailed Metrics on macOS or a minimal fallback elsewhere. + + The *Metrics* schema comes from ``utils.macmon.macmon``; on non-macOS systems we + fill only the ``memory`` sub-structure so downstream code can still access + ``metrics.memory.ram_total`` & ``ram_usage``. + """ + + if platform.system().lower() == "darwin": + return await macmon_get_metrics_async() + return Metrics() + + +async def start_polling_node_metrics( + callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]], +): + poll_interval_s = 1.0 + while True: + try: + # Gather metrics & system info with a timeout on each call + metrics = await get_metrics_async() + + # Extract memory totals from metrics + total_mem = ( + metrics.memory.ram_total + if metrics.memory is not None and metrics.memory.ram_total is not None + else 0 + ) + used_mem = ( + metrics.memory.ram_usage + if metrics.memory is not None and metrics.memory.ram_usage is not None + else 0 + ) + + # Run heavy FLOPs profiling only if enough time has elapsed + + await callback( + NodePerformanceProfile( + model_id=platform.machine(), + chip_id=platform.processor(), + memory=MemoryPerformanceProfile( + ram_total=total_mem, + ram_available=total_mem - used_mem, + swap_total=metrics.memory.swap_total + if metrics.memory is not None + and metrics.memory.swap_total is not None + else 0, + swap_available=metrics.memory.swap_total + - metrics.memory.swap_usage + if metrics.memory is not None + and metrics.memory.swap_usage is not None + and metrics.memory.swap_total is not None + else 0, + ), + network_interfaces=[], + system=SystemPerformanceProfile( + flops_fp16=0, + ), + ) + ) + + except asyncio.TimeoutError: + # One of the operations took too long; skip this iteration but keep the loop alive. + print( + "[resource_monitor] Operation timed out after 30s, skipping this cycle." + ) + except Exception as e: + # Catch-all to ensure the monitor keeps running. + print(f"[resource_monitor] Encountered error: {e}") + finally: + await asyncio.sleep(poll_interval_s)