From facf2d4d03165de6134162ff27f65eafd40a8798 Mon Sep 17 00:00:00 2001 From: rltakashige Date: Tue, 17 Feb 2026 17:48:43 +0000 Subject: [PATCH] Use custom fork that resolves GPU locks (#1489) ## Motivation On Macs, memory is not updated from L1 cache unless an explicit synchronization is performed. As a result, GPU locks can occur when a spin wait does not see the updated timestamp. ## Changes The fix is implemented in my personal fork of MLX (rltakashige/mlx-jaccl-fix-small-recv), which is now used as the MLX source on macOS. ## Why It Works https://github.com/ARM-software/acle/releases ## Test Plan ### Manual Testing Tested manually that no GPU locks occur (even with multiple simultaneous instances running) and that the performance differential is negligible (267 vs 269 tps on Llama 3.2 1B at approximately 10k context). ------------------------------------------------------ I have seen a GPU lock, specifically when sending a particularly large chat completion while the model was loading. However, I have since been unable to reproduce it, and this may be something I did wrong. Please do create an issue and tag me if any GPU locks do occur. 
--------- Co-authored-by: Jake Hillion Co-authored-by: Claude Opus 4.6 --- README.md | 11 +++++++++-- flake.nix | 2 +- nix/mlx.nix | 10 +++++----- pyproject.toml | 3 ++- python/parts.nix | 32 +++++++++++++++++++++++++++++--- uv.lock | 40 ++++++++++++++++------------------------ 6 files changed, 62 insertions(+), 36 deletions(-) diff --git a/README.md b/README.md index 58d41c379..ff1fe04be 100644 --- a/README.md +++ b/README.md @@ -72,16 +72,23 @@ There are two ways to run exo: ### Run from Source (macOS) +If you have [Nix](https://nixos.org/) installed, you can skip most of the steps below and run exo directly (after accepting the Cachix cache): + +```bash +nix run .#exo +``` + **Prerequisites:** +- [Xcode](https://developer.apple.com/xcode/) (provides the Metal ToolChain required for MLX compilation) - [brew](https://github.com/Homebrew/brew) (for simple package management on macOS) - + ```bash /bin/bash -c "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/HEAD/install.sh)" ``` - [uv](https://github.com/astral-sh/uv) (for Python dependency management) - [macmon](https://github.com/vladkens/macmon) (for hardware monitoring on Apple Silicon) - [node](https://github.com/nodejs/node) (for building the dashboard) - + ```bash brew install uv macmon node ``` diff --git a/flake.nix b/flake.nix index 9c2ca1ef5..e90e0bd25 100644 --- a/flake.nix +++ b/flake.nix @@ -115,7 +115,7 @@ packages = lib.optionalAttrs pkgs.stdenv.hostPlatform.isDarwin ( let uvLock = builtins.fromTOML (builtins.readFile ./uv.lock); - mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx") uvLock.package); + mlxPackage = builtins.head (builtins.filter (p: p.name == "mlx" && p.source ? 
git) uvLock.package); uvLockMlxVersion = mlxPackage.version; in { diff --git a/nix/mlx.nix b/nix/mlx.nix index f29217b85..8a40e11b5 100644 --- a/nix/mlx.nix +++ b/nix/mlx.nix @@ -41,16 +41,16 @@ let mlx = stdenv.mkDerivation rec { pname = "mlx"; - version = let v = "0.30.6"; in + version = let v = "0.30.7.dev20260217+50487b41"; in assert v == uvLockMlxVersion || throw "MLX version mismatch: nix/mlx.nix has ${v} but uv.lock has ${uvLockMlxVersion}. Update both the version and hash in nix/mlx.nix."; v; pyproject = true; src = fetchFromGitHub { - owner = "ml-explore"; - repo = "mlx"; - tag = "v${version}"; - hash = "sha256-avD5EGhwgmPdXLAyQSqTO6AXk/W3ziH+f6AetjK3Sdo="; + owner = "rltakashige"; + repo = "mlx-jaccl-fix-small-recv"; + rev = "50487b4141f3c951122655db3b83df5146c1fbeb"; + hash = "sha256-IL4a9vMX5nocgJU1WG4zE8hArHkHJtnh4sdYh3od5zU="; }; patches = [ diff --git a/pyproject.toml b/pyproject.toml index 5d8d79a50..02aa60714 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -17,7 +17,7 @@ dependencies = [ "loguru>=0.7.3", "exo_pyo3_bindings", # rust bindings "anyio==4.11.0", - "mlx==0.30.6; sys_platform == 'darwin'", + "mlx; sys_platform == 'darwin'", "mlx[cpu]==0.30.6; sys_platform == 'linux'", "mlx-lm==0.30.6", "tiktoken>=0.12.0", # required for kimi k2 tokenizer @@ -64,6 +64,7 @@ members = [ [tool.uv.sources] exo_pyo3_bindings = { workspace = true } +mlx = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git", branch = "address-rdma-gpu-locks", marker = "sys_platform == 'darwin'" } #mlx-lm = { git = "https://github.com/davidmcc73/mlx-lm", branch = "stable" } # Uncomment to use local mlx/mlx-lm development versions: # mlx = { path = "/Users/Shared/mlx", editable=true } diff --git a/python/parts.nix b/python/parts.nix index 46b4abdff..bac8ddab7 100644 --- a/python/parts.nix +++ b/python/parts.nix @@ -58,6 +58,21 @@ lib.optionalAttrs pkgs.stdenv.hostPlatform.isLinux ( (lib.mapAttrs (_: ignoreMissing) nvidiaPackages) // { mlx = ignoreMissing 
prev.mlx; + mlx-cuda-13 = prev.mlx-cuda-13.overrideAttrs (old: { + buildInputs = (old.buildInputs or [ ]) ++ [ + final.nvidia-cublas + final.nvidia-cuda-nvrtc + final.nvidia-cudnn-cu13 + final.nvidia-nccl-cu13 + ]; + preFixup = '' + addAutoPatchelfSearchPath ${final.nvidia-cublas} + addAutoPatchelfSearchPath ${final.nvidia-cuda-nvrtc} + addAutoPatchelfSearchPath ${final.nvidia-cudnn-cu13} + addAutoPatchelfSearchPath ${final.nvidia-nccl-cu13} + ''; + autoPatchelfIgnoreMissingDeps = [ "libcuda.so.1" ]; + }); torch = ignoreMissing prev.torch; triton = ignoreMissing prev.triton; } @@ -74,14 +89,25 @@ linuxOverlay ] ); - exoVenv = pythonSet.mkVirtualEnv "exo-env" workspace.deps.default; + # mlx-cpu and mlx-cuda-13 both ship mlx/ site-packages files; keep first. + # mlx-cpu/mlx-cuda-13 and nvidia-cudnn-cu12/cu13 ship overlapping files. + venvCollisionPaths = lib.optionals pkgs.stdenv.hostPlatform.isLinux [ + "lib/python3.13/site-packages/mlx*" + "lib/python3.13/site-packages/nvidia*" + ]; + + exoVenv = (pythonSet.mkVirtualEnv "exo-env" workspace.deps.default).overrideAttrs { + venvIgnoreCollisions = venvCollisionPaths; + }; # Virtual environment with dev dependencies for testing - testVenv = pythonSet.mkVirtualEnv "exo-test-env" ( + testVenv = (pythonSet.mkVirtualEnv "exo-test-env" ( workspace.deps.default // { exo = [ "dev" ]; # Include pytest, pytest-asyncio, pytest-env } - ); + )).overrideAttrs { + venvIgnoreCollisions = venvCollisionPaths; + }; mkPythonScript = name: path: pkgs.writeShellApplication { inherit name; diff --git a/uv.lock b/uv.lock index 627e69512..74687ad38 100644 --- a/uv.lock +++ b/uv.lock @@ -377,8 +377,8 @@ dependencies = [ { name = "hypercorn", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "loguru", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "mflux", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 
'linux'" }, - { name = "mlx", extra = ["cpu"], marker = "sys_platform == 'linux'" }, + { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cpu"], marker = "sys_platform == 'linux'" }, + { name = "mlx", version = "0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" }, { name = "mlx-lm", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "msgspec", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "openai-harmony", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -416,7 +416,7 @@ requires-dist = [ { name = "hypercorn", specifier = ">=0.18.0" }, { name = "loguru", specifier = ">=0.7.3" }, { name = "mflux", specifier = "==0.15.5" }, - { name = "mlx", marker = "sys_platform == 'darwin'", specifier = "==0.30.6" }, + { name = "mlx", marker = "sys_platform == 'darwin'", git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks" }, { name = "mlx", extras = ["cpu"], marker = "sys_platform == 'linux'", specifier = "==0.30.6" }, { name = "mlx-lm", specifier = "==0.30.6" }, { name = "msgspec", specifier = ">=0.19.0" }, @@ -1020,8 +1020,8 @@ dependencies = [ { name = "fonttools", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "huggingface-hub", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "matplotlib", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "mlx", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "mlx", extra = ["cuda13"], marker = "sys_platform == 'linux'" }, + { name = "mlx", version = "0.30.6", source = { registry = "https://pypi.org/simple" }, extra = ["cuda13"], marker = "sys_platform == 'linux'" }, + { name = "mlx", version = 
"0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" }, { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "opencv-python", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "piexif", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1048,18 +1048,12 @@ wheels = [ name = "mlx" version = "0.30.6" source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mlx-metal", marker = "sys_platform == 'darwin'" }, +resolution-markers = [ + "sys_platform == 'linux'", ] wheels = [ - { url = "https://files.pythonhosted.org/packages/ae/5b/e460e144a34d5529e010056cccf50b538d56ed001473bc6b246018fd58cb/mlx-0.30.6-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:ed86f8bffc174c2f259ca589ea25464c96cf69d1bb457074a2bf2ef53737e54f", size = 573515, upload-time = "2026-02-06T03:45:23.405Z" }, - { url = "https://files.pythonhosted.org/packages/60/25/69833fefb9a3fef30b56792b1bcd022496c4fea83e45411d289b77ef7546/mlx-0.30.6-cp313-cp313-macosx_15_0_arm64.whl", hash = "sha256:c52294958269e20f300639a17c1900ca8fc737d859ddda737f9811e94bd040e5", size = 573516, upload-time = "2026-02-06T03:45:24.618Z" }, - { url = "https://files.pythonhosted.org/packages/9c/6a/7e7fbeebc5cb51b6a5eba96b263a6298707bcbdc059f4b0b73e088bc3dea/mlx-0.30.6-cp313-cp313-macosx_26_0_arm64.whl", hash = "sha256:b5b6636f7c49a4d86d8ec82643b972f45a144a7a9f3a967b27b2e6e22cf71e6a", size = 573592, upload-time = "2026-02-06T03:45:25.928Z" }, { url = "https://files.pythonhosted.org/packages/93/06/280f6f2ba80520a7109730425eda0d966658793aa0d02d8be8d351f75253/mlx-0.30.6-cp313-cp313-manylinux_2_35_aarch64.whl", hash = "sha256:67e6c9e30a9faeacc209917ef5523177cf9b086914b6b5d83ff886e4294b727d", size = 622011, upload-time = "2026-02-06T03:45:28.165Z" }, { url = 
"https://files.pythonhosted.org/packages/fe/35/f872afbee9c079cc69924d9e9c46f5663adb7da58cba3511db082dd307c1/mlx-0.30.6-cp313-cp313-manylinux_2_35_x86_64.whl", hash = "sha256:47db8b16fcb6f6c5a47c0bdb24ed377b41237017ac93aa6cb6aa206c9bdf82e4", size = 663650, upload-time = "2026-02-06T03:45:30.315Z" }, - { url = "https://files.pythonhosted.org/packages/60/23/361dc7a5797634e4d7e9bdd6564c6b28f9b1246672632def2f91bf066b18/mlx-0.30.6-cp314-cp314-macosx_14_0_arm64.whl", hash = "sha256:78804a89dcff4a838f7c2da72392fe87a523e95122a3c840e53df019122aad45", size = 575028, upload-time = "2026-02-06T03:45:31.549Z" }, - { url = "https://files.pythonhosted.org/packages/a8/69/1854484d414171586814dfbe8def95f75c4ea2c7341ba13ba8ee675f7c62/mlx-0.30.6-cp314-cp314-macosx_15_0_arm64.whl", hash = "sha256:ec13584ab069665cc7ad34a05494d9291cd623aef6ae96be48875fc87cfc25d6", size = 575026, upload-time = "2026-02-06T03:45:33.072Z" }, - { url = "https://files.pythonhosted.org/packages/6b/b8/3adbc441924209a7e4c568308b2a0b54bd09aee6a68db5bae85304791e54/mlx-0.30.6-cp314-cp314-macosx_26_0_arm64.whl", hash = "sha256:b2c5e8a090a753ef99a1380a4d059c983083f36198864f6df9faaf1223d083df", size = 575041, upload-time = "2026-02-06T03:45:34.814Z" }, { url = "https://files.pythonhosted.org/packages/3f/54/9d9e06804fb2088202a2cdf60458e00b221f71420bea285720b60f9e82b5/mlx-0.30.6-cp314-cp314-manylinux_2_35_aarch64.whl", hash = "sha256:9ceddede4af0de31d1f6b3099f70e5469d60cd7c546975dedbdbeab3519cab3f", size = 624002, upload-time = "2026-02-06T03:45:36Z" }, { url = "https://files.pythonhosted.org/packages/42/92/3140a15a50cb1f9267a6552171e1dfa577861de53e093124bc43707f2a0e/mlx-0.30.6-cp314-cp314-manylinux_2_35_x86_64.whl", hash = "sha256:4a6ffd2d16728cf95f63a1b555d7c2eaeea686a0e6b73228bd265411cb5d77a4", size = 663569, upload-time = "2026-02-06T03:45:37.242Z" }, ] @@ -1072,6 +1066,14 @@ cuda13 = [ { name = "mlx-cuda-13", marker = "sys_platform == 'linux'" }, ] +[[package]] +name = "mlx" +version = "0.30.7.dev20260217+50487b41" 
+source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" } +resolution-markers = [ + "sys_platform == 'darwin'", +] + [[package]] name = "mlx-cpu" version = "0.30.6" @@ -1102,7 +1104,7 @@ version = "0.30.6" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "jinja2", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, - { name = "mlx", marker = "sys_platform == 'darwin'" }, + { name = "mlx", version = "0.30.7.dev20260217+50487b41", source = { git = "https://github.com/rltakashige/mlx-jaccl-fix-small-recv.git?branch=address-rdma-gpu-locks#50487b4141f3c951122655db3b83df5146c1fbeb" }, marker = "sys_platform == 'darwin'" }, { name = "numpy", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "protobuf", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, { name = "pyyaml", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" }, @@ -1114,16 +1116,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/20/5f/01d281f1fa8a1521d5936659beb4f5ab1f32b463d059263cf9d4cef969d9/mlx_lm-0.30.6-py3-none-any.whl", hash = "sha256:a7405bd581eacc4bf8209d7a6b7f23629585a0d7c6740c2a97e51fee35b3b0e1", size = 379451, upload-time = "2026-02-04T21:27:43.222Z" }, ] -[[package]] -name = "mlx-metal" -version = "0.30.6" -source = { registry = "https://pypi.org/simple" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/f3/85/44406b521f920248fad621334d4dc15e77660a494edf890e7cbee33bf38d/mlx_metal-0.30.6-py3-none-macosx_14_0_arm64.whl", hash = "sha256:ea6d0c973def9a5b4f652cc77036237db3f88c9d0af63701d76b5fddde99b820", size = 38437818, upload-time = "2026-02-06T03:44:56.19Z" }, - { url = "https://files.pythonhosted.org/packages/d0/cb/10a516995f7d0c154b0d7e633c54b51e96977a86a355105b6474cfcbe0d0/mlx_metal-0.30.6-py3-none-macosx_15_0_arm64.whl", hash = 
"sha256:0f8cb94634d07e06a372d6ad9a090f38a18bab1ff19a140aede60eacf707bb94", size = 38433701, upload-time = "2026-02-06T03:44:59.678Z" }, - { url = "https://files.pythonhosted.org/packages/4c/7d/70cb272f7373c334709f210ed8420511fc9d64d05a7a646c0b3b94c29c04/mlx_metal-0.30.6-py3-none-macosx_26_0_arm64.whl", hash = "sha256:d761ae26304f2c4b454eeea7f612a56919d9e5e57dbb1dc0788f8e34aa6f41c2", size = 47718448, upload-time = "2026-02-06T03:45:03.133Z" }, -] - [[package]] name = "more-itertools" version = "10.8.0"