diff --git a/.gitattributes b/.gitattributes
new file mode 100644
index 00000000..c2b5fa9b
--- /dev/null
+++ b/.gitattributes
@@ -0,0 +1 @@
+worker/utils/macmon/bin/macmon filter=lfs diff=lfs merge=lfs -text
diff --git a/.githooks/post-checkout b/.githooks/post-checkout
new file mode 100755
index 00000000..5abf8ed9
--- /dev/null
+++ b/.githooks/post-checkout
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-checkout' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-checkout "$@"
diff --git a/.githooks/post-commit b/.githooks/post-commit
new file mode 100755
index 00000000..b8b76c2c
--- /dev/null
+++ b/.githooks/post-commit
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-commit' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-commit "$@"
diff --git a/.githooks/post-merge b/.githooks/post-merge
new file mode 100755
index 00000000..726f9098
--- /dev/null
+++ b/.githooks/post-merge
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'post-merge' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs post-merge "$@"
diff --git a/.githooks/pre-push b/.githooks/pre-push
new file mode 100755
index 00000000..5f26dc45
--- /dev/null
+++ b/.githooks/pre-push
@@ -0,0 +1,3 @@
+#!/bin/sh
+command -v git-lfs >/dev/null 2>&1 || { printf >&2 "\n%s\n\n" "This repository is configured for Git LFS but 'git-lfs' was not found on your path. If you no longer wish to use Git LFS, remove this hook by deleting the 'pre-push' file in the hooks directory (set by 'core.hookspath'; usually '.git/hooks')."; exit 2; }
+git lfs pre-push "$@"
diff --git a/worker/__init__.py b/worker/__init__.py
index 0519ecba..e69de29b 100644
--- a/worker/__init__.py
+++ b/worker/__init__.py
@@ -1 +0,0 @@
-
\ No newline at end of file
diff --git a/worker/main.py b/worker/main.py
index 8a078c6a..3c4a5c45 100644
--- a/worker/main.py
+++ b/worker/main.py
@@ -15,10 +15,12 @@ from shared.types.events import (
ChunkGenerated,
Event,
InstanceId,
+ NodePerformanceMeasured,
RunnerDeleted,
RunnerStatusUpdated,
TaskStateUpdated,
)
+from shared.types.profiling import NodePerformanceProfile
from shared.types.state import State
from shared.types.tasks import TaskStatus
from shared.types.worker.common import RunnerId
@@ -52,6 +54,7 @@ from shared.types.worker.runners import (
from shared.types.worker.shards import ShardMetadata
from worker.download.download_utils import build_model_path
from worker.runner.runner_supervisor import RunnerSupervisor
+from worker.utils.profile import start_polling_node_metrics
def get_node_id() -> NodeId:
@@ -482,7 +485,6 @@ class Worker:
await asyncio.sleep(0.01)
- # TODO: Handle resource monitoring (write-only)
async def main():
node_id: NodeId = get_node_id()
@@ -490,6 +492,13 @@ async def main():
event_log_manager = EventLogManager(EventLogConfig(), logger)
await event_log_manager.initialize()
+
+ # TODO: add profiling etc to resource monitor
+ async def resource_monitor_callback(node_performance_profile: NodePerformanceProfile) -> None:
+ await event_log_manager.worker_events.append_events(
+ [NodePerformanceMeasured(node_id=node_id, node_profile=node_performance_profile)], origin=node_id
+ )
+ asyncio.create_task(start_polling_node_metrics(callback=resource_monitor_callback))
worker = Worker(node_id, logger, event_log_manager.worker_events)
diff --git a/worker/utils/__init__.py b/worker/utils/__init__.py
new file mode 100644
index 00000000..386a613c
--- /dev/null
+++ b/worker/utils/__init__.py
@@ -0,0 +1,3 @@
+from .profile import start_polling_node_metrics
+
+__all__ = ["start_polling_node_metrics"]
diff --git a/worker/utils/macmon/.DS_Store b/worker/utils/macmon/.DS_Store
new file mode 100644
index 00000000..a3585876
Binary files /dev/null and b/worker/utils/macmon/.DS_Store differ
diff --git a/worker/utils/macmon/__init__.py b/worker/utils/macmon/__init__.py
new file mode 100644
index 00000000..ad950d89
--- /dev/null
+++ b/worker/utils/macmon/__init__.py
@@ -0,0 +1,3 @@
+from .macmon import MacMonError, get_metrics, get_metrics_async
+
+__all__ = ['get_metrics', 'get_metrics_async', 'MacMonError']
\ No newline at end of file
diff --git a/worker/utils/macmon/bin/LICENSE.txt b/worker/utils/macmon/bin/LICENSE.txt
new file mode 100644
index 00000000..4659b63d
--- /dev/null
+++ b/worker/utils/macmon/bin/LICENSE.txt
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2024 vladkens
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/worker/utils/macmon/bin/readme.md b/worker/utils/macmon/bin/readme.md
new file mode 100644
index 00000000..9c44eed7
--- /dev/null
+++ b/worker/utils/macmon/bin/readme.md
@@ -0,0 +1,154 @@
+# `macmon` – Mac Monitor
+
+
+
+Sudoless performance monitoring CLI tool for Apple Silicon processors.
+
+[

](https://github.com/vladkens/macmon/releases)
+[

](https://github.com/vladkens/macmon/releases)
+[

](https://github.com/vladkens/macmon/blob/main/LICENSE)
+[

](https://buymeacoffee.com/vladkens)
+
+
+
+
+

+
+
+## Motivation
+
+Apple Silicon processors don't provide an easy way to see live power consumption. I was interested in this information while testing local LLM models. `asitop` is a nice and simple TUI to quickly see current metrics, but it reads data from `powermetrics` and requires root privileges. `macmon` uses a private macOS API to gather metrics (essentially the same as `powermetrics`) but runs without sudo. 🎉
+
+## 🌟 Features
+
+- 🚫 Works without sudo
+- ⚡ Real-time CPU / GPU / ANE power usage
+- 📊 CPU utilization per cluster
+- 💾 RAM / Swap usage
+- 📈 Historical charts + avg / max values
+- 🌡️ Average CPU / GPU temperature
+- 🎨 Switchable colors (6 variants)
+- 🪟 Can be rendered in a small window
+- 🦀 Written in Rust
+
+## 🍺 Install via Homebrew
+
+You can install [`macmon`](https://formulae.brew.sh/formula/macmon) using [brew](https://brew.sh/):
+
+```sh
+$ brew install macmon
+```
+
+## 🖥️ Install via MacPorts
+
+You can also install [`macmon`](https://ports.macports.org/port/macmon/) using [MacPorts](https://macports.org/):
+
+```sh
+$ sudo port install macmon
+```
+
+## 📦 Install from source
+
+1. Install [Rust toolchain](https://www.rust-lang.org/tools/install)
+
+2. Clone the repo:
+
+```sh
+git clone https://github.com/vladkens/macmon.git && cd macmon
+```
+
+3. Build and run:
+
+```sh
+cargo run -r
+```
+
+4. (Optionally) Binary can be moved to bin folder:
+
+```sh
+sudo cp target/release/macmon /usr/local/bin
+```
+
+## 🚀 Usage
+
+```sh
+Usage: macmon [OPTIONS] [COMMAND]
+
+Commands:
+ pipe Output metrics in JSON format
+ debug Print debug information
+ help Print this message or the help of the given subcommand(s)
+
+Options:
+ -i, --interval Update interval in milliseconds [default: 1000]
+ -h, --help Print help
+ -V, --version Print version
+
+Controls:
+ c - change color
+ v - switch charts view: gauge / sparkline
+ q - quit
+```
+
+## 🚰 Piping
+
+You can use the pipe subcommand to output metrics in JSON format, which is suitable for piping into other tools or scripts. For example:
+
+```sh
+macmon pipe | jq
+```
+
+This command runs `macmon` in "pipe" mode and navigate output to `jq` for pretty-printing.
+
+You can also specify the number of samples to run using `-s` or `--samples` parameter (default: `0`, which runs indefinitely), and set update interval in milliseconds using the `-i` or `--interval` parameter (default: `1000` ms). For example:
+
+```sh
+macmon pipe -s 10 -i 500 | jq
+```
+
+This will collect 10 samples with an update interval of 500 milliseconds.
+
+### Output
+
+```jsonc
+{
+ "timestamp": "2025-02-24T20:38:15.427569+00:00",
+ "temp": {
+ "cpu_temp_avg": 43.73614, // Celsius
+ "gpu_temp_avg": 36.95167 // Celsius
+ },
+ "memory": {
+ "ram_total": 25769803776, // Bytes
+ "ram_usage": 20985479168, // Bytes
+ "swap_total": 4294967296, // Bytes
+ "swap_usage": 2602434560 // Bytes
+ },
+ "ecpu_usage": [1181, 0.082656614], // (Frequency MHz, Usage %)
+ "pcpu_usage": [1974, 0.015181795], // (Frequency MHz, Usage %)
+ "gpu_usage": [461, 0.021497859], // (Frequency MHz, Usage %)
+ "cpu_power": 0.20486385, // Watts
+ "gpu_power": 0.017451683, // Watts
+ "ane_power": 0.0, // Watts
+ "all_power": 0.22231553, // Watts
+ "sys_power": 5.876533, // Watts
+ "ram_power": 0.11635789, // Watts
+ "gpu_ram_power": 0.0009615385 // Watts (not sure what it means)
+}
+```
+
+## 🤝 Contributing
+We love contributions! Whether you have ideas, suggestions, or bug reports, feel free to open an issue or submit a pull request. Your input is essential in helping us improve `macmon` 💪
+
+## 📝 License
+`macmon` is distributed under the MIT License. For more details, check out the LICENSE.
+
+## 🔍 See also
+- [tlkh/asitop](https://github.com/tlkh/asitop) – Original tool. Python, requires sudo.
+- [dehydratedpotato/socpowerbud](https://github.com/dehydratedpotato/socpowerbud) – ObjectiveC, sudoless, no TUI.
+- [op06072/NeoAsitop](https://github.com/op06072/NeoAsitop) – Swift, sudoless.
+- [graelo/pumas](https://github.com/graelo/pumas) – Rust, requires sudo.
+- [context-labs/mactop](https://github.com/context-labs/mactop) – Go, requires sudo.
+
+---
+
+*PS: One More Thing... Remember, monitoring your Mac's performance with `macmon` is like having a personal trainer for your processor — keeping those cores in shape! 💪*
diff --git a/worker/utils/macmon/macmon.py b/worker/utils/macmon/macmon.py
new file mode 100644
index 00000000..26b18416
--- /dev/null
+++ b/worker/utils/macmon/macmon.py
@@ -0,0 +1,174 @@
+import asyncio
+import os
+import platform
+import subprocess
+from pathlib import Path
+from typing import Optional, Tuple
+
+from pydantic import BaseModel, ConfigDict, ValidationError
+
+
+class MacMonError(Exception):
+ """Exception raised for errors in the MacMon functions."""
+
+
+def _get_binary_path(binary_path: Optional[str] = None) -> str:
+ """
+ Get the path to the macmon binary.
+
+ Args:
+ binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+ Returns:
+ The path to the macmon binary.
+
+ Raises:
+ MacMonError: If the binary doesn't exist or can't be made executable.
+ """
+ # Check for macOS with ARM chip
+ system = platform.system().lower()
+ machine = platform.machine().lower()
+
+ if system != "darwin" or not (
+ "arm" in machine or "m1" in machine or "m2" in machine
+ ):
+ raise MacMonError("MacMon only supports macOS with Apple Silicon (ARM) chips")
+
+ if binary_path:
+ path = binary_path
+ else:
+ # Get the directory where this module is located
+ module_dir = Path(os.path.dirname(os.path.abspath(__file__)))
+ path = str(module_dir / "bin" / "macmon")
+
+ # Ensure the binary exists and is executable
+ if not os.path.isfile(path):
+ raise MacMonError(f"Binary not found at: {path}")
+
+ # Make the binary executable if it's not already
+ if not os.access(path, os.X_OK):
+ try:
+ os.chmod(path, 0o755) # rwx r-x r-x
+ except OSError as e:
+ raise MacMonError(f"Failed to make binary executable: {e}") from e
+
+ return path
+
+
+# ---------------------------------------------------------------------------
+# Pydantic metric structures
+# ---------------------------------------------------------------------------
+
+
+class MemoryMetrics(BaseModel):
+ """Memory-related metrics returned by macmon."""
+
+ ram_total: Optional[int] = None
+ ram_usage: Optional[int] = None
+ swap_total: Optional[int] = None
+ swap_usage: Optional[int] = None
+
+ model_config = ConfigDict(extra="ignore")
+
+
+class TempMetrics(BaseModel):
+ """Temperature-related metrics returned by macmon."""
+
+ cpu_temp_avg: Optional[float] = None
+ gpu_temp_avg: Optional[float] = None
+
+ model_config = ConfigDict(extra="ignore")
+
+
+class Metrics(BaseModel):
+ """Complete set of metrics returned by *macmon* binary.
+
+ All fields are optional to allow for partial output from the binary.
+ Unknown fields are ignored for forward-compatibility.
+ """
+
+ all_power: Optional[float] = None
+ ane_power: Optional[float] = None
+ cpu_power: Optional[float] = None
+ ecpu_usage: Optional[Tuple[int, float]] = None
+ gpu_power: Optional[float] = None
+ gpu_ram_power: Optional[float] = None
+ gpu_usage: Optional[Tuple[int, float]] = None
+ memory: Optional[MemoryMetrics] = None
+ pcpu_usage: Optional[Tuple[int, float]] = None
+ ram_power: Optional[float] = None
+ sys_power: Optional[float] = None
+ temp: Optional[TempMetrics] = None
+ timestamp: Optional[str] = None
+
+ model_config = ConfigDict(extra="ignore")
+
+
+# ---------------------------------------------------------------------------
+# Synchronous helper
+# ---------------------------------------------------------------------------
+
+
+def get_metrics(binary_path: Optional[str] = None) -> Metrics:
+ """
+ Run the binary and return the metrics as a Python dictionary.
+
+ Args:
+ binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+ Returns:
+ A mapping containing system metrics.
+
+ Raises:
+ MacMonError: If there's an error running the binary.
+ """
+ path = _get_binary_path(binary_path)
+
+ try:
+ # Run the binary with the argument -s 1 and capture its output
+ result = subprocess.run(
+ [path, "pipe", "-s", "1"], capture_output=True, text=True, check=True
+ )
+
+ return Metrics.model_validate_json(result.stdout)
+
+ except subprocess.CalledProcessError as e:
+ raise MacMonError(f"Error running binary: {e.stderr}") from e # type: ignore
+ except ValidationError as e:
+ raise MacMonError(f"Error parsing JSON output: {e}") from e
+
+
+async def get_metrics_async(binary_path: Optional[str] = None) -> Metrics:
+ """
+ Asynchronously run the binary and return the metrics as a Python dictionary.
+
+ Args:
+ binary_path: Optional path to the binary. If not provided, will use the bundled binary.
+
+ Returns:
+ A mapping containing system metrics.
+
+ Raises:
+ MacMonError: If there's an error running the binary.
+ """
+ path = _get_binary_path(binary_path)
+
+ try:
+ proc = await asyncio.create_subprocess_exec(
+ path,
+ "pipe",
+ "-s",
+ "1",
+ stdout=asyncio.subprocess.PIPE,
+ stderr=asyncio.subprocess.PIPE,
+ )
+
+ stdout, stderr = await proc.communicate()
+
+ if proc.returncode != 0:
+ raise MacMonError(f"Error running binary: {stderr.decode().strip()}")
+
+ return Metrics.model_validate_json(stdout.decode().strip())
+
+ except ValidationError as e:
+ raise MacMonError(f"Error parsing JSON output: {e}") from e
diff --git a/worker/utils/profile.py b/worker/utils/profile.py
new file mode 100644
index 00000000..b152e00c
--- /dev/null
+++ b/worker/utils/profile.py
@@ -0,0 +1,92 @@
+import asyncio
+import platform
+from typing import Any, Callable, Coroutine
+
+from shared.types.profiling import (
+ MemoryPerformanceProfile,
+ NodePerformanceProfile,
+ SystemPerformanceProfile,
+)
+from worker.utils.macmon.macmon import (
+ Metrics,
+)
+from worker.utils.macmon.macmon import (
+ get_metrics_async as macmon_get_metrics_async,
+)
+
+# from exo.infra.event_log import EventLog
+# from exo.app.config import ResourceMonitorConfig
+# from exo.utils.mlx.mlx_utils import profile_flops_fp16
+
+
+async def get_metrics_async() -> Metrics:
+ """Return detailed Metrics on macOS or a minimal fallback elsewhere.
+
+ The *Metrics* schema comes from ``utils.macmon.macmon``; on non-macOS systems we
+ fill only the ``memory`` sub-structure so downstream code can still access
+ ``metrics.memory.ram_total`` & ``ram_usage``.
+ """
+
+ if platform.system().lower() == "darwin":
+ return await macmon_get_metrics_async()
+ return Metrics()
+
+
+async def start_polling_node_metrics(
+ callback: Callable[[NodePerformanceProfile], Coroutine[Any, Any, None]],
+):
+ poll_interval_s = 1.0
+ while True:
+ try:
+ # Gather metrics & system info with a timeout on each call
+ metrics = await get_metrics_async()
+
+ # Extract memory totals from metrics
+ total_mem = (
+ metrics.memory.ram_total
+ if metrics.memory is not None and metrics.memory.ram_total is not None
+ else 0
+ )
+ used_mem = (
+ metrics.memory.ram_usage
+ if metrics.memory is not None and metrics.memory.ram_usage is not None
+ else 0
+ )
+
+ # Run heavy FLOPs profiling only if enough time has elapsed
+
+ await callback(
+ NodePerformanceProfile(
+ model_id=platform.machine(),
+ chip_id=platform.processor(),
+ memory=MemoryPerformanceProfile(
+ ram_total=total_mem,
+ ram_available=total_mem - used_mem,
+ swap_total=metrics.memory.swap_total
+ if metrics.memory is not None
+ and metrics.memory.swap_total is not None
+ else 0,
+ swap_available=metrics.memory.swap_total
+ - metrics.memory.swap_usage
+ if metrics.memory is not None
+ and metrics.memory.swap_usage is not None
+ and metrics.memory.swap_total is not None
+ else 0,
+ ),
+ network_interfaces=[],
+ system=SystemPerformanceProfile(
+ flops_fp16=0,
+ ),
+ )
+ )
+
+ except asyncio.TimeoutError:
+ # One of the operations took too long; skip this iteration but keep the loop alive.
+ print(
+ "[resource_monitor] Operation timed out after 30s, skipping this cycle."
+ )
+ except Exception as e:
+ # Catch-all to ensure the monitor keeps running.
+ print(f"[resource_monitor] Encountered error: {e}")
+ finally:
+ await asyncio.sleep(poll_interval_s)