Compare commits


1 Commit

Author SHA1 Message Date
Evan
7ccb233373 nix mlx compilation for better portability 2026-01-13 17:13:28 +00:00
56 changed files with 1344 additions and 2014 deletions

View File

@@ -113,22 +113,11 @@ jobs:
uv python install
uv sync --locked
- name: Install Nix
uses: cachix/install-nix-action@v31
with:
nix_path: nixpkgs=channel:nixos-unstable
- name: Configure Cachix
uses: cachix/cachix-action@v14
with:
name: exo
authToken: "${{ secrets.CACHIX_AUTH_TOKEN }}"
- name: Build dashboard
run: |
DASHBOARD_OUT=$(nix build .#dashboard --print-build-logs --no-link --print-out-paths)
mkdir -p dashboard/build
cp -r "$DASHBOARD_OUT"/* dashboard/build/
cd dashboard
npm ci
npm run build
- name: Install Sparkle CLI
run: |

View File

@@ -113,6 +113,28 @@ jobs:
with:
lfs: false
- name: Select Xcode
if: startsWith(matrix.runner, 'macos-')
run: |
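# Pick the newest Xcode preinstalled on the runner (version-sorted, last entry wins)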
XCODE_BASEDIR="$(printf '%s\n' /Applications/Xcode_*.app | sort -V | tail -n 1)"
[[ -d "$XCODE_BASEDIR" ]] || exit 1
sudo mv "$XCODE_BASEDIR" /Applications/Xcode.app
ls -ld "/Applications/Xcode.app"
sudo /usr/bin/xcode-select -s "/Applications/Xcode.app"
/usr/bin/xcode-select -p || true
/usr/bin/xcrun --toolchain default --find xcodebuild || true
- name: Install Metal toolchain component
if: startsWith(matrix.runner, 'macos-')
run: |
set -e
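# On recent Xcode the Metal toolchain ships as a separate downloadable component; fetch it only if `metal` is missing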
if ! xcrun --find metal >/dev/null 2>&1; then
sudo xcodebuild -downloadComponent MetalToolchain
fi
xcrun --find metal
xcrun --find metallib
- uses: cachix/install-nix-action@v31
with:
nix_path: nixpkgs=channel:nixos-unstable

Cargo.lock generated
View File

@@ -4340,6 +4340,25 @@ dependencies = [
"libc",
]
[[package]]
name = "system_custodian"
version = "0.0.1"
dependencies = [
"delegate",
"derive_more",
"either",
"extend",
"futures",
"futures-timer",
"impl-trait-for-tuples",
"keccak-const",
"log",
"thiserror 2.0.17",
"tokio",
"tracing-subscriber",
"util",
]
[[package]]
name = "tagptr"
version = "0.2.0"

View File

@@ -3,6 +3,7 @@ resolver = "3"
members = [
"rust/networking",
"rust/exo_pyo3_bindings",
"rust/system_custodian",
"rust/util",
]
@@ -24,6 +25,7 @@ opt-level = 3
[workspace.dependencies]
## Crate members as common dependencies
networking = { path = "rust/networking" }
system_custodian = { path = "rust/system_custodian" }
util = { path = "rust/util" }
# Proc-macro authoring tools

View File

@@ -1,60 +0,0 @@
{ lib
, config
, dream2nix
, ...
}:
let
# Read and parse the lock file
rawLockFile = builtins.fromJSON (builtins.readFile "${config.deps.dashboardSrc}/package-lock.json");
# For packages with bundleDependencies, filter out deps that are bundled
# (bundled deps are inside the tarball, not separate lockfile entries)
fixedPackages = lib.mapAttrs
(path: entry:
if entry ? bundleDependencies && entry.bundleDependencies != [ ]
then entry // {
dependencies = lib.filterAttrs
(name: _: !(lib.elem name entry.bundleDependencies))
(entry.dependencies or { });
}
else entry
)
(rawLockFile.packages or { });
fixedLockFile = rawLockFile // { packages = fixedPackages; };
in
{
imports = [
dream2nix.modules.dream2nix.nodejs-package-lock-v3
dream2nix.modules.dream2nix.nodejs-granular-v3
];
name = "exo-dashboard";
version = "1.0.0";
mkDerivation = {
src = config.deps.dashboardSrc;
buildPhase = ''
runHook preBuild
npm run build
runHook postBuild
'';
installPhase = ''
runHook preInstall
cp -r build $out/build
runHook postInstall
'';
};
deps = { nixpkgs, ... }: {
inherit (nixpkgs) stdenv;
dashboardSrc = null; # Injected by parts.nix
};
nodejs-package-lock-v3 = {
# Don't use packageLockFile - provide the fixed lock content directly
packageLock = fixedLockFile;
};
}

View File

@@ -863,7 +863,6 @@
"integrity": "sha512-oH8tXw7EZnie8FdOWYrF7Yn4IKrqTFHhXvl8YxXxbKwTMcD/5NNCryUSEXRk2ZR4ojnub0P8rNrsVGHXWqIDtA==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@standard-schema/spec": "^1.0.0",
"@sveltejs/acorn-typescript": "^1.0.5",
@@ -903,7 +902,6 @@
"integrity": "sha512-Y1Cs7hhTc+a5E9Va/xwKlAJoariQyHY+5zBgCZg4PFWNYQ1nMN9sjK1zhw1gK69DuqVP++sht/1GZg1aRwmAXQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"@sveltejs/vite-plugin-svelte-inspector": "^4.0.1",
"debug": "^4.4.1",
@@ -1520,7 +1518,6 @@
"integrity": "sha512-LCCV0HdSZZZb34qifBsyWlUmok6W7ouER+oQIGBScS8EsZsQbrtFTUrDX4hOl+CS6p7cnNC4td+qrSVGSCTUfQ==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"undici-types": "~6.21.0"
}
@@ -1530,7 +1527,6 @@
"resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz",
"integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==",
"license": "MIT",
"peer": true,
"bin": {
"acorn": "bin/acorn"
},
@@ -1943,7 +1939,6 @@
"integrity": "sha512-fmTRWbNMmsmWq6xJV8D19U/gw/bwrHfNXxrIN+HfZgnzqTHp9jOmKMhsTUjXOJnZOdZY9Q28y4yebKzqDKlxlQ==",
"dev": true,
"license": "ISC",
"peer": true,
"engines": {
"node": ">=12"
}
@@ -2651,7 +2646,6 @@
"integrity": "sha512-5gTmgEY/sqK6gFXLIsQNH19lWb4ebPDLA4SdLP7dsWkIXHWlG66oPuVvXSGFPppYZz8ZDZq0dYYrbHfBCVUb1Q==",
"dev": true,
"license": "MIT",
"peer": true,
"engines": {
"node": ">=12"
},
@@ -2839,7 +2833,6 @@
"resolved": "https://registry.npmjs.org/svelte/-/svelte-5.45.3.tgz",
"integrity": "sha512-ngKXNhNvwPzF43QqEhDOue7TQTrG09em1sd4HBxVF0Wr2gopAmdEWan+rgbdgK4fhBtSOTJO8bYU4chUG7VXZQ==",
"license": "MIT",
"peer": true,
"dependencies": {
"@jridgewell/remapping": "^2.3.4",
"@jridgewell/sourcemap-codec": "^1.5.0",
@@ -2984,7 +2977,6 @@
"integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==",
"dev": true,
"license": "Apache-2.0",
"peer": true,
"bin": {
"tsc": "bin/tsc",
"tsserver": "bin/tsserver"
@@ -3006,7 +2998,6 @@
"integrity": "sha512-+Oxm7q9hDoLMyJOYfUYBuHQo+dkAloi33apOPP56pzj+vsdJDzr+j1NISE5pyaAuKL4A3UD34qd0lx5+kfKp2g==",
"dev": true,
"license": "MIT",
"peer": true,
"dependencies": {
"esbuild": "^0.25.0",
"fdir": "^6.4.4",

View File

@@ -1,44 +0,0 @@
{ inputs, ... }:
{
perSystem =
{ pkgs, lib, ... }:
let
# Filter source to only include dashboard directory
src = lib.cleanSourceWith {
src = inputs.self;
filter =
path: type:
let
baseName = builtins.baseNameOf path;
inDashboardDir =
(lib.hasInfix "/dashboard/" path)
|| (lib.hasSuffix "/dashboard" (builtins.dirOf path))
|| (baseName == "dashboard" && type == "directory");
in
inDashboardDir;
};
# Build the dashboard with dream2nix (includes node_modules in output)
dashboardFull = inputs.dream2nix.lib.evalModules {
packageSets.nixpkgs = pkgs;
modules = [
./dashboard.nix
{
paths.projectRoot = inputs.self;
paths.projectRootFile = "flake.nix";
paths.package = inputs.self + "/dashboard";
}
# Inject the filtered source
{
deps.dashboardSrc = lib.mkForce "${src}/dashboard";
}
];
};
in
{
# Extract just the static site from the full build
packages.dashboard = pkgs.runCommand "exo-dashboard" { } ''
cp -r ${dashboardFull}/build $out
'';
};
}

flake.lock generated
View File

@@ -1,42 +1,5 @@
{
"nodes": {
"crane": {
"locked": {
"lastModified": 1767744144,
"narHash": "sha256-9/9ntI0D+HbN4G0TrK3KmHbTvwgswz7p8IEJsWyef8Q=",
"owner": "ipetkov",
"repo": "crane",
"rev": "2fb033290bf6b23f226d4c8b32f7f7a16b043d7e",
"type": "github"
},
"original": {
"owner": "ipetkov",
"repo": "crane",
"type": "github"
}
},
"dream2nix": {
"inputs": {
"nixpkgs": [
"nixpkgs"
],
"purescript-overlay": "purescript-overlay",
"pyproject-nix": "pyproject-nix"
},
"locked": {
"lastModified": 1765953015,
"narHash": "sha256-5FBZbbWR1Csp3Y2icfRkxMJw/a/5FGg8hCXej2//bbI=",
"owner": "nix-community",
"repo": "dream2nix",
"rev": "69eb01fa0995e1e90add49d8ca5bcba213b0416f",
"type": "github"
},
"original": {
"owner": "nix-community",
"repo": "dream2nix",
"type": "github"
}
},
"fenix": {
"inputs": {
"nixpkgs": [
@@ -45,11 +8,11 @@
"rust-analyzer-src": "rust-analyzer-src"
},
"locked": {
"lastModified": 1768287139,
"narHash": "sha256-nsXFt0OzUi6K7dUzzJD5/v9e0Ic+fvclfIW936/43ZM=",
"lastModified": 1761893049,
"narHash": "sha256-1TtFDPhC+ZsrOOtBnry1EZC+WipTTvsOVjIEVugqji8=",
"owner": "nix-community",
"repo": "fenix",
"rev": "a4a3aa956931f90f35453cb519e4545e9ad7f773",
"rev": "c2ac9a5c0d6d16630c3b225b874bd14528d1abe6",
"type": "github"
},
"original": {
@@ -58,22 +21,6 @@
"type": "github"
}
},
"flake-compat": {
"flake": false,
"locked": {
"lastModified": 1696426674,
"narHash": "sha256-kvjfFW7WAETZlt09AgDn1MrtKzP7t90Vf7vypd3OL1U=",
"owner": "edolstra",
"repo": "flake-compat",
"rev": "0f9255e01c2351cc7d116c072cb317785dd33b33",
"type": "github"
},
"original": {
"owner": "edolstra",
"repo": "flake-compat",
"type": "github"
}
},
"flake-parts": {
"inputs": {
"nixpkgs-lib": [
@@ -95,22 +42,6 @@
}
},
"nixpkgs": {
"locked": {
"lastModified": 1768127708,
"narHash": "sha256-1Sm77VfZh3mU0F5OqKABNLWxOuDeHIlcFjsXeeiPazs=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "ffbc9f8cbaacfb331b6017d5a5abb21a492c9a38",
"type": "github"
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"type": "github"
}
},
"nixpkgs-swift": {
"locked": {
"lastModified": 1761672384,
"narHash": "sha256-o9KF3DJL7g7iYMZq9SWgfS1BFlNbsm6xplRjVlOCkXI=",
@@ -121,74 +52,27 @@
},
"original": {
"owner": "NixOS",
"ref": "nixos-unstable",
"repo": "nixpkgs",
"rev": "08dacfca559e1d7da38f3cf05f1f45ee9bfd213c",
"type": "github"
}
},
"purescript-overlay": {
"inputs": {
"flake-compat": "flake-compat",
"nixpkgs": [
"dream2nix",
"nixpkgs"
],
"slimlock": "slimlock"
},
"locked": {
"lastModified": 1728546539,
"narHash": "sha256-Sws7w0tlnjD+Bjck1nv29NjC5DbL6nH5auL9Ex9Iz2A=",
"owner": "thomashoneyman",
"repo": "purescript-overlay",
"rev": "4ad4c15d07bd899d7346b331f377606631eb0ee4",
"type": "github"
},
"original": {
"owner": "thomashoneyman",
"repo": "purescript-overlay",
"type": "github"
}
},
"pyproject-nix": {
"inputs": {
"nixpkgs": [
"dream2nix",
"nixpkgs"
]
},
"locked": {
"lastModified": 1763017646,
"narHash": "sha256-Z+R2lveIp6Skn1VPH3taQIuMhABg1IizJd8oVdmdHsQ=",
"owner": "pyproject-nix",
"repo": "pyproject.nix",
"rev": "47bd6f296502842643078d66128f7b5e5370790c",
"type": "github"
},
"original": {
"owner": "pyproject-nix",
"repo": "pyproject.nix",
"type": "github"
}
},
"root": {
"inputs": {
"crane": "crane",
"dream2nix": "dream2nix",
"fenix": "fenix",
"flake-parts": "flake-parts",
"nixpkgs": "nixpkgs",
"nixpkgs-swift": "nixpkgs-swift",
"treefmt-nix": "treefmt-nix"
}
},
"rust-analyzer-src": {
"flake": false,
"locked": {
"lastModified": 1768224240,
"narHash": "sha256-Pp1dDrXKPBUJReZnnDElFyHYn67XTd48zRhToheLjtk=",
"lastModified": 1761849405,
"narHash": "sha256-igXdvC+WCUN+3gnfk+ptT7rMmxQuY6WbIg1rXMUN1DM=",
"owner": "rust-lang",
"repo": "rust-analyzer",
"rev": "725349602e525df37f377701e001fe8aab807878",
"rev": "f7de8ae045a5fe80f1203c5a1c3015b05f7c3550",
"type": "github"
},
"original": {
@@ -198,28 +82,6 @@
"type": "github"
}
},
"slimlock": {
"inputs": {
"nixpkgs": [
"dream2nix",
"purescript-overlay",
"nixpkgs"
]
},
"locked": {
"lastModified": 1688756706,
"narHash": "sha256-xzkkMv3neJJJ89zo3o2ojp7nFeaZc2G0fYwNXNJRFlo=",
"owner": "thomashoneyman",
"repo": "slimlock",
"rev": "cf72723f59e2340d24881fd7bf61cb113b4c407c",
"type": "github"
},
"original": {
"owner": "thomashoneyman",
"repo": "slimlock",
"type": "github"
}
},
"treefmt-nix": {
"inputs": {
"nixpkgs": [
@@ -227,11 +89,11 @@
]
},
"locked": {
"lastModified": 1768158989,
"narHash": "sha256-67vyT1+xClLldnumAzCTBvU0jLZ1YBcf4vANRWP3+Ak=",
"lastModified": 1762938485,
"narHash": "sha256-AlEObg0syDl+Spi4LsZIBrjw+snSVU4T8MOeuZJUJjM=",
"owner": "numtide",
"repo": "treefmt-nix",
"rev": "e96d59dff5c0d7fddb9d113ba108f03c3ef99eca",
"rev": "5b4ee75aeefd1e2d5a1cc43cf6ba65eba75e83e4",
"type": "github"
},
"original": {

View File

@@ -9,8 +9,6 @@
inputs.nixpkgs-lib.follows = "nixpkgs";
};
crane.url = "github:ipetkov/crane";
fenix = {
url = "github:nix-community/fenix";
inputs.nixpkgs.follows = "nixpkgs";
@@ -20,14 +18,6 @@
url = "github:numtide/treefmt-nix";
inputs.nixpkgs.follows = "nixpkgs";
};
dream2nix = {
url = "github:nix-community/dream2nix";
inputs.nixpkgs.follows = "nixpkgs";
};
# Pinned nixpkgs for swift-format (swift is broken on x86_64-linux in newer nixpkgs)
nixpkgs-swift.url = "github:NixOS/nixpkgs/08dacfca559e1d7da38f3cf05f1f45ee9bfd213c";
};
nixConfig = {
@@ -46,16 +36,12 @@
imports = [
inputs.treefmt-nix.flakeModule
./dashboard/parts.nix
./rust/parts.nix
];
perSystem =
{ config, self', inputs', pkgs, lib, system, ... }:
{ config, inputs', pkgs, lib, ... }:
let
fenixToolchain = inputs'.fenix.packages.complete;
# Use pinned nixpkgs for swift-format (swift is broken on x86_64-linux in newer nixpkgs)
pkgsSwift = import inputs.nixpkgs-swift { inherit system; };
in
{
treefmt = {
@@ -68,16 +54,13 @@
};
rustfmt = {
enable = true;
package = config.rust.toolchain;
package = fenixToolchain.rustfmt;
};
prettier = {
enable = true;
includes = [ "*.ts" ];
};
swift-format = {
enable = true;
package = pkgsSwift.swiftPackages.swift-format;
};
swift-format.enable = true;
};
};
@@ -87,9 +70,12 @@
touch $out
'';
devShells.default = with pkgs; pkgs.mkShell {
inputsFrom = [ self'.checks.cargo-build ];
packages =
if pkgs.stdenv.isDarwin then {
metal = pkgs.callPackage ./nix/metalWrapper.nix { metalVersion = "230"; };
} else { };
devShells.default = with pkgs; mkShellNoCC {
packages =
[
# FORMATTING
@@ -102,8 +88,14 @@
basedpyright
# RUST
config.rust.toolchain
maturin
(fenixToolchain.withComponents [
"cargo"
"rustc"
"clippy"
"rustfmt"
"rust-src"
])
rustup # Just here to make RustRover happy
# NIX
nixpkgs-fmt
@@ -115,20 +107,31 @@
just
jq
]
++ lib.optionals stdenv.isLinux [
++ (lib.optionals stdenv.isLinux [
# IFCONFIG
unixtools.ifconfig
]
++ lib.optionals stdenv.isDarwin [
macmon
];
OPENSSL_NO_VENDOR = "1";
# Build dependencies for Linux
pkg-config
openssl
])
++ (lib.optionals stdenv.isDarwin [
# MACMON
macmon
]);
shellHook = ''
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${python313}/lib"
${lib.optionalString stdenv.isLinux ''
export LD_LIBRARY_PATH="${openssl.out}/lib:$LD_LIBRARY_PATH"
# PYTHON
export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:${pkgs.python313}/lib"
${lib.optionalString pkgs.stdenv.isLinux ''
# Build environment for Linux
export PKG_CONFIG_PATH="${pkgs.openssl.dev}/lib/pkgconfig:$PKG_CONFIG_PATH"
export LD_LIBRARY_PATH="${pkgs.openssl.out}/lib:$LD_LIBRARY_PATH"
''}
echo
echo "🍎🍎 Run 'just <recipe>' to get started"
just --list
'';
};
};
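
For reference, the Darwin-only `packages` conditional in the flake.nix hunk above could equivalently be spelled with `lib.optionalAttrs`; a sketch:

packages = lib.optionalAttrs pkgs.stdenv.isDarwin {
  metal = pkgs.callPackage ./nix/metalWrapper.nix { metalVersion = "230"; };
};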

View File

@@ -0,0 +1,79 @@
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0ed30932..d8528132 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -177,11 +177,7 @@ if(MLX_BUILD_METAL)
add_compile_definitions(MLX_METAL_DEBUG)
endif()
- # Throw an error if xcrun not found
- execute_process(
- COMMAND zsh "-c" "/usr/bin/xcrun -sdk macosx --show-sdk-version"
- OUTPUT_VARIABLE MACOS_SDK_VERSION
- OUTPUT_STRIP_TRAILING_WHITESPACE COMMAND_ERROR_IS_FATAL ANY)
+ set(MACOS_SDK_VERSION @sdkVersion@)
if(${MACOS_SDK_VERSION} LESS 14.0)
message(
@@ -199,11 +195,8 @@ if(MLX_BUILD_METAL)
endif()
set(XCRUN_FLAGS "-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
endif()
- execute_process(
- COMMAND
- zsh "-c"
- "echo \"__METAL_VERSION__\" | xcrun -sdk macosx metal ${XCRUN_FLAGS} -E -x metal -P - | tail -1 | tr -d '\n'"
- OUTPUT_VARIABLE MLX_METAL_VERSION COMMAND_ERROR_IS_FATAL ANY)
+ set(
+ MLX_METAL_VERSION @metalVersion@)
FetchContent_Declare(metal_cpp URL ${METAL_CPP_URL})
FetchContent_MakeAvailable(metal_cpp)
target_include_directories(
diff --git a/cmake/extension.cmake b/cmake/extension.cmake
index 13db804a..5b385132 100644
--- a/cmake/extension.cmake
+++ b/cmake/extension.cmake
@@ -36,7 +36,7 @@ macro(mlx_build_metallib)
add_custom_command(
OUTPUT ${MTLLIB_BUILD_TARGET}
COMMAND
- xcrun -sdk macosx metal
+ metal
"$<LIST:TRANSFORM,${MTLLIB_INCLUDE_DIRS},PREPEND,-I>"
${MTLLIB_COMPILE_OPTIONS} ${MTLLIB_SOURCES} -o ${MTLLIB_BUILD_TARGET}
DEPENDS ${MTLLIB_DEPS} ${MTLLIB_SOURCES}
diff --git a/mlx/backend/metal/kernels/CMakeLists.txt b/mlx/backend/metal/kernels/CMakeLists.txt
index 262b0495..5c7446ad 100644
--- a/mlx/backend/metal/kernels/CMakeLists.txt
+++ b/mlx/backend/metal/kernels/CMakeLists.txt
@@ -29,7 +29,7 @@ function(build_kernel_base TARGET SRCFILE DEPS)
"-mmacosx-version-min=${CMAKE_OSX_DEPLOYMENT_TARGET}")
endif()
add_custom_command(
- COMMAND xcrun -sdk macosx metal ${METAL_FLAGS} -c ${SRCFILE}
+ COMMAND metal ${METAL_FLAGS} -c ${SRCFILE}
-I${PROJECT_SOURCE_DIR} -o ${TARGET}.air
DEPENDS ${SRCFILE} ${DEPS} ${BASE_HEADERS}
OUTPUT ${TARGET}.air
@@ -170,7 +170,7 @@ endif()
add_custom_command(
OUTPUT ${MLX_METAL_PATH}/mlx.metallib
- COMMAND xcrun -sdk macosx metallib ${KERNEL_AIR} -o
+ COMMAND metallib ${KERNEL_AIR} -o
${MLX_METAL_PATH}/mlx.metallib
DEPENDS ${KERNEL_AIR}
COMMENT "Building mlx.metallib"
diff --git a/mlx/backend/metal/make_compiled_preamble.sh b/mlx/backend/metal/make_compiled_preamble.sh
index bb55ed3a..94ea7dd7 100644
--- a/mlx/backend/metal/make_compiled_preamble.sh
+++ b/mlx/backend/metal/make_compiled_preamble.sh
@@ -31,7 +31,7 @@ OUTPUT_FILE=${OUTPUT_DIR}/${SRC_NAME}.cpp
mkdir -p "$OUTPUT_DIR"
# Use the metal compiler to get a list of headers (with depth)
-CCC="xcrun -sdk macosx metal -x metal"
+CCC="metal -x metal"
HDRS=$( $CCC -I"$SRC_DIR" -I"$JIT_INCLUDES" -DMLX_METAL_JIT -E -P -CC -C -H "$INPUT_FILE" $CFLAGS -w 2>&1 1>/dev/null )
# Remove any included system frameworks (for MetalPerformancePrimitive headers)
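
This patch makes the MLX CMake build hermetic: the SDK and Metal versions that upstream queries at configure time via `xcrun` become `@sdkVersion@` and `@metalVersion@` placeholders, and the `xcrun -sdk macosx metal`/`metallib` invocations become bare `metal`/`metallib` so the wrapper derivation below can supply them on PATH. A minimal sketch of how the placeholders get filled (the same `replaceVars` call appears in nix/mlx.nix below):

patches = [
  (replaceVars ./darwin-build-fixes.patch {
    # substituted into @sdkVersion@ / @metalVersion@ at eval time
    sdkVersion = apple-sdk_26.version;
    metalVersion = metal.version;
  })
];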

nix/metalWrapper.nix Normal file
View File

@@ -0,0 +1,22 @@
{ stdenv
, metalVersion
, xcodeBaseDir ? "/Applications/Xcode.app"
}:
assert stdenv.isDarwin;
stdenv.mkDerivation {
pname = "metal-wrapper-impure";
version = metalVersion;
__noChroot = true;
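# impure build: reads the host's Xcode outside the sandbox, so it requires sandbox = relaxed in nix.conf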
buildCommand = ''
DEVELOPER_DIR=${xcodeBaseDir}/Contents/Developer
[[ -x "$DEVELOPER_DIR/usr/bin/xcodebuild" ]] || (echo "Missing xcodebuild at $DEVELOPER_DIR/usr/bin/xcodebuild" && exit 1)
SDKROOT=${xcodeBaseDir}/Contents/Developer/Platforms/MacOSX.platform/Developer/SDKs/MacOSX.sdk
[[ -d "$SDKROOT" ]] || (echo "Missing SDKROOT at $SDKROOT" && exit 1)
export DEVELOPER_DIR SDKROOT
mkdir -p $out/bin && cd $out/bin
ln -s $(/usr/bin/xcrun --sdk macosx -f metal)
ln -s $(/usr/bin/xcrun --sdk macosx -f metallib)
[[ -f $out/bin/metal ]] && [[ -f $out/bin/metallib ]] || exit 1
'';
}
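
Because `__noChroot = true` lets the builder reach the host's Xcode install, this derivation only builds when nix.conf has `sandbox = relaxed` (or sandboxing disabled). flake.nix earlier in this diff consumes it on Darwin only:

metal = pkgs.callPackage ./nix/metalWrapper.nix { metalVersion = "230"; };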

nix/mlx.nix Normal file
View File

@@ -0,0 +1,154 @@
{ stdenv
, lib
, buildPythonPackage
, fetchFromGitHub
, replaceVars
, fetchzip
, setuptools
, cmake
, nanobind
, pybind11
, nlohmann_json
, apple-sdk_26
, metal
, numpy
, pytestCheckHook
, python
, runCommand
, fmt
}:
assert stdenv.isDarwin;
let
# static dependencies included directly during compilation
gguf-tools = fetchFromGitHub {
owner = "antirez";
repo = "gguf-tools";
rev = "8fa6eb65236618e28fd7710a0fba565f7faa1848";
hash = "sha256-15FvyPOFqTOr5vdWQoPnZz+mYH919++EtghjozDlnSA=";
};
metal_cpp = fetchzip {
url = "https://developer.apple.com/metal/cpp/files/metal-cpp_26.zip";
hash = "sha256-7n2eI2lw/S+Us6l7YPAATKwcIbRRpaQ8VmES7S8ZjY8=";
};
mlx = buildPythonPackage rec {
pname = "mlx";
version = "0.30.1";
pyproject = true;
src = fetchFromGitHub {
owner = "ml-explore";
repo = "mlx";
tag = "v${version}";
hash = "sha256-Vt0RH+70VBwUjXSfPTsNdRS3g0ookJHhzf2kvgEtgH8=";
};
patches = [
(replaceVars ./darwin-build-fixes.patch {
sdkVersion = apple-sdk_26.version;
metalVersion = metal.version;
})
];
postPatch = ''
substituteInPlace pyproject.toml \
--replace-fail "nanobind==2.10.2" "nanobind"
substituteInPlace mlx/backend/cpu/jit_compiler.cpp \
--replace-fail "g++" "$CXX"
'';
dontUseCmakeConfigure = true;
enableParallelBuilding = true;
# Allows multiple cores to be used in Python builds.
postUnpack = ''
export MAKEFLAGS+="''${enableParallelBuilding:+-j$NIX_BUILD_CORES}"
'';
# skip bulk updates: the updater would bump the wrong fetcher rev attribute
passthru.skipBulkUpdate = true;
env = {
DEV_RELEASE = 1;
# NOTE The `metal` command-line utility used to build the Metal kernels is not open-source.
# this is what the Xcode wrapper is for - it patches in the system Metal CLI
CMAKE_ARGS = toString [
(lib.cmakeBool "USE_SYSTEM_FMT" true)
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_GGUFLIB" "${gguf-tools}")
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_JSON" "${nlohmann_json.src}")
(lib.cmakeBool "FETCHCONTENT_FULLY_DISCONNECTED" true)
(lib.cmakeBool "MLX_BUILD_METAL" true)
(lib.cmakeOptionType "filepath" "METAL_LIB"
"${metal}/Metal.framework")
(lib.cmakeOptionType "filepath" "FETCHCONTENT_SOURCE_DIR_METAL_CPP" "${metal_cpp}")
(lib.cmakeOptionType "string" "CMAKE_OSX_DEPLOYMENT_TARGET" "${apple-sdk_26.version}")
(lib.cmakeOptionType "filepath" "CMAKE_OSX_SYSROOT" "${apple-sdk_26.passthru.sdkroot}")
];
SDKROOT = apple-sdk_26.passthru.sdkroot;
MACOSX_DEPLOYMENT_TARGET = apple-sdk_26.version;
};
build-system = [
setuptools
];
nativeBuildInputs = [
cmake
metal
];
buildInputs = [
fmt
gguf-tools
nanobind
pybind11
apple-sdk_26
];
pythonImportsCheck = [ "mlx" ];
# Run the mlx Python test suite.
nativeCheckInputs = [
numpy
pytestCheckHook
];
enabledTestPaths = [
"python/tests/"
];
# Additional testing by executing the example Python scripts supplied with mlx
# using the version of the library we've built.
passthru.tests = {
mlxTest =
runCommand "run-mlx-examples"
{
buildInputs = [ mlx ];
nativeBuildInputs = [ python ];
}
''
cp ${src}/examples/python/logistic_regression.py .
${python.interpreter} logistic_regression.py
rm logistic_regression.py
cp ${src}/examples/python/linear_regression.py .
${python.interpreter} linear_regression.py
rm linear_regression.py
touch $out
'';
};
meta = {
homepage = "https://github.com/ml-explore/mlx";
description = "Array framework for Apple silicon";
changelog = "https://github.com/ml-explore/mlx/releases/tag/${src.tag}";
license = lib.licenses.mit;
platforms = lib.platforms.darwin;
};
};
in
mlx
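
A sketch of how this file might be wired into a Python environment; the exact call site isn't shown in this diff, so the names below are assumptions:

# Hypothetical wiring (call site not shown in this diff):
mlx = pkgs.python313.pkgs.callPackage ./nix/mlx.nix {
  metal = pkgs.callPackage ./nix/metalWrapper.nix { metalVersion = "230"; };
};
pythonWithMlx = pkgs.python313.withPackages (_: [ mlx ]);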

View File

@@ -23,7 +23,6 @@ dependencies = [
"tiktoken>=0.12.0", # required for kimi k2 tokenizer
"hypercorn>=0.18.0",
"openai-harmony>=0.0.8",
"tomlkit>=0.14.0",
]
[project.scripts]

View File

@@ -1,15 +0,0 @@
short_id = "deepseek-v3.1-4bit"
model_id = "mlx-community/DeepSeek-V3.1-4bit"
name = "DeepSeek V3.1 (4-bit)"
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/DeepSeek-V3.1-4bit"
pretty_name = "DeepSeek V3.1 (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 405874409472

View File

@@ -1,15 +0,0 @@
short_id = "deepseek-v3.1-8bit"
model_id = "mlx-community/DeepSeek-V3.1-8bit"
name = "DeepSeek V3.1 (8-bit)"
description = "DeepSeek V3.1 is a large language model trained on the DeepSeek V3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/DeepSeek-V3.1-8bit"
pretty_name = "DeepSeek V3.1 (8-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 765577920512

View File

@@ -1,15 +0,0 @@
short_id = "glm-4.5-air-8bit"
model_id = "mlx-community/GLM-4.5-Air-8bit"
name = "GLM 4.5 Air 8bit"
description = "GLM 4.5 Air 8bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.5-Air-8bit"
pretty_name = "GLM 4.5 Air 8bit"
n_layers = 46
hidden_size = 4096
supports_tensor = false
[metadata.storage_size]
in_bytes = 122406567936

View File

@@ -1,15 +0,0 @@
short_id = "glm-4.5-air-bf16"
model_id = "mlx-community/GLM-4.5-Air-bf16"
name = "GLM 4.5 Air bf16"
description = "GLM 4.5 Air bf16"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.5-Air-bf16"
pretty_name = "GLM 4.5 Air bf16"
n_layers = 46
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 229780750336

View File

@@ -1,15 +0,0 @@
short_id = "glm-4.7-4bit"
model_id = "mlx-community/GLM-4.7-4bit"
name = "GLM 4.7 4bit"
description = "GLM 4.7 4bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-4bit"
pretty_name = "GLM 4.7 4bit"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 198556925568

View File

@@ -1,15 +0,0 @@
short_id = "glm-4.7-6bit"
model_id = "mlx-community/GLM-4.7-6bit"
name = "GLM 4.7 6bit"
description = "GLM 4.7 6bit"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-6bit"
pretty_name = "GLM 4.7 6bit"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 286737579648

View File

@@ -1,15 +0,0 @@
short_id = "glm-4.7-8bit-gs32"
model_id = "mlx-community/GLM-4.7-8bit-gs32"
name = "GLM 4.7 8bit (gs32)"
description = "GLM 4.7 8bit (gs32)"
tags = []
[metadata]
model_id = "mlx-community/GLM-4.7-8bit-gs32"
pretty_name = "GLM 4.7 8bit (gs32)"
n_layers = 91
hidden_size = 5120
supports_tensor = true
[metadata.storage_size]
in_bytes = 396963397248

View File

@@ -1,15 +0,0 @@
short_id = "gpt-oss-120b-MXFP4-Q8"
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
description = "OpenAI's GPT-OSS 120B is a 117B-parameter Mixture-of-Experts model designed for high-reasoning and general-purpose use; this variant is a 4-bit MLX conversion for Apple Silicon."
tags = []
[metadata]
model_id = "mlx-community/gpt-oss-120b-MXFP4-Q8"
pretty_name = "GPT-OSS 120B (MXFP4-Q8, MLX)"
n_layers = 36
hidden_size = 2880
supports_tensor = true
[metadata.storage_size]
in_bytes = 70652212224

View File

@@ -1,15 +0,0 @@
short_id = "gpt-oss-20b-4bit"
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
description = "OpenAI's GPT-OSS 20B is a medium-sized MoE model for lower-latency and local or specialized use cases; this MLX variant uses MXFP4 4-bit quantization."
tags = []
[metadata]
model_id = "mlx-community/gpt-oss-20b-MXFP4-Q4"
pretty_name = "GPT-OSS 20B (MXFP4-Q4, MLX)"
n_layers = 24
hidden_size = 2880
supports_tensor = true
[metadata.storage_size]
in_bytes = 12025908224

View File

@@ -1,15 +0,0 @@
short_id = "kimi-k2-instruct-4bit"
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
name = "Kimi K2 Instruct (4-bit)"
description = "Kimi K2 is a large language model trained on the Kimi K2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Kimi-K2-Instruct-4bit"
pretty_name = "Kimi K2 Instruct (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 620622774272

View File

@@ -1,15 +0,0 @@
short_id = "kimi-k2-thinking"
model_id = "mlx-community/Kimi-K2-Thinking"
name = "Kimi K2 Thinking (4-bit)"
description = "Kimi K2 Thinking is the latest, most capable version of open-source thinking model."
tags = []
[metadata]
model_id = "mlx-community/Kimi-K2-Thinking"
pretty_name = "Kimi K2 Thinking (4-bit)"
n_layers = 61
hidden_size = 7168
supports_tensor = true
[metadata.storage_size]
in_bytes = 706522120192

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.1-70b"
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
name = "Llama 3.1 70B (4-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-70B-Instruct-4bit"
pretty_name = "Llama 3.1 70B (4-bit)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 40652242944

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.1-8b-8bit"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
name = "Llama 3.1 8B (8-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-8bit"
pretty_name = "Llama 3.1 8B (8-bit)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 8954839040

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.1-8b-bf16"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
name = "Llama 3.1 8B (BF16)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-bf16"
pretty_name = "Llama 3.1 8B (BF16)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 16882073600

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.1-8b"
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
name = "Llama 3.1 8B (4-bit)"
description = "Llama 3.1 is a large language model trained on the Llama 3.1 dataset."
tags = []
[metadata]
model_id = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"
pretty_name = "Llama 3.1 8B (4-bit)"
n_layers = 32
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 4637851648

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.2-1b"
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
name = "Llama 3.2 1B (4-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-1B-Instruct-4bit"
pretty_name = "Llama 3.2 1B (4-bit)"
n_layers = 16
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 729808896

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.2-3b-8bit"
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
name = "Llama 3.2 3B (8-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-3B-Instruct-8bit"
pretty_name = "Llama 3.2 3B (8-bit)"
n_layers = 28
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 3501195264

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.2-3b"
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
name = "Llama 3.2 3B (4-bit)"
description = "Llama 3.2 is a large language model trained on the Llama 3.2 dataset."
tags = []
[metadata]
model_id = "mlx-community/Llama-3.2-3B-Instruct-4bit"
pretty_name = "Llama 3.2 3B (4-bit)"
n_layers = 28
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 1863319552

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.3-70b-8bit"
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
name = "Llama 3.3 70B (8-bit)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/Llama-3.3-70B-Instruct-8bit"
pretty_name = "Llama 3.3 70B (8-bit)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 76799803392

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.3-70b-fp16"
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
name = "Llama 3.3 70B (FP16)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/llama-3.3-70b-instruct-fp16"
pretty_name = "Llama 3.3 70B (FP16)"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 144383672320

View File

@@ -1,15 +0,0 @@
short_id = "llama-3.3-70b"
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
name = "Llama 3.3 70B (4-bit)"
description = "The Meta Llama 3.3 multilingual large language model (LLM) is an instruction tuned generative model in 70B (text in/text out)"
tags = []
[metadata]
model_id = "mlx-community/Llama-3.3-70B-Instruct-4bit"
pretty_name = "Llama 3.3 70B"
n_layers = 80
hidden_size = 8192
supports_tensor = true
[metadata.storage_size]
in_bytes = 40652242944

View File

@@ -1,15 +0,0 @@
short_id = "minimax-m2.1-3bit"
model_id = "mlx-community/MiniMax-M2.1-3bit"
name = "MiniMax M2.1 3bit"
description = "MiniMax M2.1 3bit"
tags = []
[metadata]
model_id = "mlx-community/MiniMax-M2.1-3bit"
pretty_name = "MiniMax M2.1 3bit"
n_layers = 61
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 100086644736

View File

@@ -1,15 +0,0 @@
short_id = "minimax-m2.1-8bit"
model_id = "mlx-community/MiniMax-M2.1-8bit"
name = "MiniMax M2.1 8bit"
description = "MiniMax M2.1 8bit"
tags = []
[metadata]
model_id = "mlx-community/MiniMax-M2.1-8bit"
pretty_name = "MiniMax M2.1 8bit"
n_layers = 61
hidden_size = 3072
supports_tensor = true
[metadata.storage_size]
in_bytes = 242986745856

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-0.6b-8bit"
model_id = "mlx-community/Qwen3-0.6B-8bit"
name = "Qwen3 0.6B (8-bit)"
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-0.6B-8bit"
pretty_name = "Qwen3 0.6B (8-bit)"
n_layers = 28
hidden_size = 1024
supports_tensor = false
[metadata.storage_size]
in_bytes = 698351616

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-0.6b"
model_id = "mlx-community/Qwen3-0.6B-4bit"
name = "Qwen3 0.6B (4-bit)"
description = "Qwen3 0.6B is a large language model trained on the Qwen3 0.6B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-0.6B-4bit"
pretty_name = "Qwen3 0.6B (4-bit)"
n_layers = 28
hidden_size = 1024
supports_tensor = false
[metadata.storage_size]
in_bytes = 342884352

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-235b-a22b-4bit"
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
name = "Qwen3 235B A22B (4-bit)"
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-4bit"
pretty_name = "Qwen3 235B A22B (4-bit)"
n_layers = 94
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 141733920768

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-235b-a22b-8bit"
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
name = "Qwen3 235B A22B (8-bit)"
description = "Qwen3 235B (Active 22B) is a large language model trained on the Qwen3 235B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-235B-A22B-Instruct-2507-8bit"
pretty_name = "Qwen3 235B A22B (8-bit)"
n_layers = 94
hidden_size = 4096
supports_tensor = true
[metadata.storage_size]
in_bytes = 268435456000

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-30b-8bit"
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
name = "Qwen3 30B A3B (8-bit)"
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-30B-A3B-8bit"
pretty_name = "Qwen3 30B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 33279705088

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-30b"
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
name = "Qwen3 30B A3B (4-bit)"
description = "Qwen3 30B is a large language model trained on the Qwen3 30B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-30B-A3B-4bit"
pretty_name = "Qwen3 30B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 17612931072

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-80b-a3B-4bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
name = "Qwen3 80B A3B (4-bit)"
description = "Qwen3 80B"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-4bit"
pretty_name = "Qwen3 80B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 46976204800

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-80b-a3B-8bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
name = "Qwen3 80B A3B (8-bit)"
description = "Qwen3 80B"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Instruct-8bit"
pretty_name = "Qwen3 80B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-80b-a3B-thinking-4bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
name = "Qwen3 80B A3B Thinking (4-bit)"
description = "Qwen3 80B Reasoning model"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-4bit"
pretty_name = "Qwen3 80B A3B (4-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-80b-a3B-thinking-8bit"
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
name = "Qwen3 80B A3B Thinking (8-bit)"
description = "Qwen3 80B Reasoning model"
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Next-80B-A3B-Thinking-8bit"
pretty_name = "Qwen3 80B A3B (8-bit)"
n_layers = 48
hidden_size = 2048
supports_tensor = true
[metadata.storage_size]
in_bytes = 88814387200

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-coder-480b-a35b-4bit"
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
name = "Qwen3 Coder 480B A35B (4-bit)"
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-4bit"
pretty_name = "Qwen3 Coder 480B A35B (4-bit)"
n_layers = 62
hidden_size = 6144
supports_tensor = true
[metadata.storage_size]
in_bytes = 289910292480

View File

@@ -1,15 +0,0 @@
short_id = "qwen3-coder-480b-a35b-8bit"
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
name = "Qwen3 Coder 480B A35B (8-bit)"
description = "Qwen3 Coder 480B (Active 35B) is a large language model trained on the Qwen3 Coder 480B dataset."
tags = []
[metadata]
model_id = "mlx-community/Qwen3-Coder-480B-A35B-Instruct-8bit"
pretty_name = "Qwen3 Coder 480B A35B (8-bit)"
n_layers = 62
hidden_size = 6144
supports_tensor = true
[metadata.storage_size]
in_bytes = 579820584960

View File

@@ -1,145 +0,0 @@
{ inputs, ... }:
{
perSystem =
{ config, self', inputs', pkgs, lib, ... }:
let
# Fenix nightly toolchain with all components
fenixPkgs = inputs'.fenix.packages;
rustToolchain = fenixPkgs.complete.withComponents [
"cargo"
"rustc"
"clippy"
"rustfmt"
"rust-src"
"rust-analyzer"
];
# Crane with fenix toolchain
craneLib = (inputs.crane.mkLib pkgs).overrideToolchain rustToolchain;
# Source filtering - only include rust/ directory and root Cargo files
# This ensures changes to Python/docs/etc don't trigger Rust rebuilds
src = lib.cleanSourceWith {
src = inputs.self;
filter =
path: type:
let
baseName = builtins.baseNameOf path;
parentDir = builtins.dirOf path;
inRustDir =
(lib.hasInfix "/rust/" path)
|| (lib.hasSuffix "/rust" parentDir)
|| (baseName == "rust" && type == "directory");
isRootCargoFile =
(baseName == "Cargo.toml" || baseName == "Cargo.lock")
&& (builtins.dirOf path == toString inputs.self);
in
isRootCargoFile
|| (inRustDir && (craneLib.filterCargoSources path type || lib.hasSuffix ".toml" path || lib.hasSuffix ".md" path));
};
# Common arguments for all Rust builds
commonArgs = {
inherit src;
pname = "exo-rust";
version = "0.0.1";
strictDeps = true;
nativeBuildInputs = [
pkgs.pkg-config
pkgs.python313 # Required for pyo3-build-config
];
buildInputs = [
pkgs.openssl
pkgs.python313 # Required for pyo3 tests
];
OPENSSL_NO_VENDOR = "1";
# Required for pyo3 tests to find libpython
LD_LIBRARY_PATH = lib.makeLibraryPath [ pkgs.python313 ];
};
# Build dependencies once for caching
cargoArtifacts = craneLib.buildDepsOnly (
commonArgs
// {
cargoExtraArgs = "--workspace";
}
);
in
{
# Export toolchain for use in treefmt and devShell
options.rust = {
toolchain = lib.mkOption {
type = lib.types.package;
default = rustToolchain;
description = "The Rust toolchain to use";
};
};
config = {
packages = {
# Python bindings wheel via maturin
exo_pyo3_bindings = craneLib.buildPackage (
commonArgs
// {
inherit cargoArtifacts;
pname = "exo_pyo3_bindings";
nativeBuildInputs = commonArgs.nativeBuildInputs ++ [
pkgs.maturin
];
buildPhaseCargoCommand = ''
maturin build \
--release \
--manylinux off \
--manifest-path rust/exo_pyo3_bindings/Cargo.toml \
--features "pyo3/extension-module,pyo3/experimental-async" \
--interpreter ${pkgs.python313}/bin/python \
--out dist
'';
# Don't use crane's default install behavior
doNotPostBuildInstallCargoBinaries = true;
installPhaseCommand = ''
mkdir -p $out
cp dist/*.whl $out/
'';
}
);
};
checks = {
# Full workspace build (all crates)
cargo-build = craneLib.buildPackage (
commonArgs
// {
inherit cargoArtifacts;
cargoExtraArgs = "--workspace";
}
);
# Run tests with nextest
cargo-nextest = craneLib.cargoNextest (
commonArgs
// {
inherit cargoArtifacts;
cargoExtraArgs = "--workspace";
}
);
# Build documentation
cargo-doc = craneLib.cargoDoc (
commonArgs
// {
inherit cargoArtifacts;
cargoExtraArgs = "--workspace";
}
);
};
};
};
}

View File

@@ -0,0 +1,47 @@
[package]
name = "system_custodian"
version = { workspace = true }
edition = { workspace = true }
publish = false
[lib]
doctest = false
name = "system_custodian"
path = "src/lib.rs"
[[bin]]
path = "src/bin/main.rs"
name = "system_custodian"
doc = false
[lints]
workspace = true
[dependencies]
# datastructures
either = { workspace = true }
# macro dependencies
extend = { workspace = true }
delegate = { workspace = true }
impl-trait-for-tuples = { workspace = true }
derive_more = { workspace = true }
# async
tokio = { workspace = true, features = ["full"] }
futures = { workspace = true }
futures-timer = { workspace = true }
# utility dependencies
util = { workspace = true }
thiserror = { workspace = true }
#internment = { workspace = true }
#recursion = { workspace = true }
#generativity = { workspace = true }
#itertools = { workspace = true }
tracing-subscriber = { version = "0.3.19", features = ["default", "env-filter"] }
keccak-const = { workspace = true }
# tracing/logging
log = { workspace = true }

View File

@@ -0,0 +1,4 @@
//! TODO: documentation
//!
fn main() {}

View File

@@ -0,0 +1,69 @@
//! This crate defines the logic of, and ways to interact with, Exo's **_System Custodian_** daemon.
//!
//! The **_System Custodian_** daemon is a long-lived process that precedes the
//! launch of the Exo application, and is responsible for ensuring the system (configuration,
//! settings, etc.) is in an appropriate state to facilitate running the Exo application.
//! The **_System Custodian_** daemon shall expose a [D-Bus](https://www.freedesktop.org/wiki/Software/dbus/)
//! service which the Exo application uses to _control & query_ it.
//!
//! # Lifecycle
//! When the Exo application starts, it will _wake_ the **_System Custodian_** daemon for the
//! duration of its lifetime, and after it has terminated the daemon will go back to sleep. When
//! the daemon wakes up, it will configure the system into a state suitable for the Exo Application;
//! When the daemon goes to sleep, it will revert those changes as much as it can in case they were
//! destructive to the user's pre-existing configurations.
//!
//! # Responsibilities
//! TODO: these are currently macOS-only; broaden them in the future.
//! The **_System Custodian_** daemon is responsible for using the System Configuration framework to
//! 1. duplicate the current network set
//! 2. modify existing services to turn on IPv6 if not there
//! 3. remove any bridge services & add any missing services that AREN'T bridge
//! TODO: In the future:
//! 1. run a dummy AWDL service to [allow for macOS peer-to-peer wireless networking](https://yggdrasil-network.github.io/2019/08/19/awdl.html)
//! 2. toggle some GPU/memory configurations to speed up GPU (ask Alex what those configurations are)
//! 3. if we ever decide to provide our **own network interfaces** that abstract over some userland
//! logic, this would be the place to spin that up.
//!
//! Then it will watch the SCDynamicStore for:
//! 1. all __actual__ network interfaces -> collect information on them e.g. their BSD name, MAC
//! address, MTU, IPv6 addresses, etc. -> and set up watchers/notifiers to inform the DBus
//! interface of any changes
//! 2. watch for any __undesirable__ changes to configuration and revert it
//!
//! It should somehow (probably through system sockets and/or BSD interface) trigger IPv6 NDP on
//! each of the interfaces & also listen to/query for any changes on the OS routing cache??
//! Basically emulate the `ping6 ff02::1%enX` and `ndp -an` commands BUT BETTER!!!
//! 1. all that info should coalesce back to the overall state collected -> should be queryable
//! over D-Bus
//! TODO:
//! 1. we might potentially add to this step a handshake of some kind...? To ensure that we can
//! ACTUALLY communicate with that machine over that link over e.g. TCP, UDP, etc. Will the
//! handshake require to know Node ID? Will the handshake require heartbeats? Who knows...
//! 2. if we ever decide to write proprietary L2/L3 protocols for quicker communication,
//! e.g. [AF_NDRV](https://www.zerotier.com/blog/how-zerotier-eliminated-kernel-extensions-on-macos/)
//! for raw ethernet frame communication, or even a [custom thunderbolt PCIe driver](https://developer.apple.com/documentation/pcidriverkit/creating-custom-pcie-drivers-for-thunderbolt-devices),
//! then this would be the place to carry out discovery and proper handshakes with devices
//! on the other end of the link.
//!
// enable Rust-unstable features for convenience
#![feature(trait_alias)]
#![feature(stmt_expr_attributes)]
#![feature(type_alias_impl_trait)]
#![feature(specialization)]
#![feature(unboxed_closures)]
#![feature(const_trait_impl)]
#![feature(fn_traits)]
pub(crate) mod private {
// sealed traits support
pub trait Sealed {}
impl<T: ?Sized> Sealed for T {}
}
/// Namespace for all the type/trait aliases used by this crate.
pub(crate) mod alias {}
/// Namespace for crate-wide extension traits/methods
pub(crate) mod ext {}

View File

@@ -1,6 +1,3 @@
from anyio import Path, open_file
import tomlkit
from exo.shared.types.memory import Memory
from exo.shared.types.models import ModelId, ModelMetadata
from exo.utils.pydantic_ext import CamelCaseModel
@@ -14,24 +11,35 @@ class ModelCard(CamelCaseModel):
tags: list[str]
metadata: ModelMetadata
@staticmethod
async def load(path: Path) -> "ModelCard":
async with await open_file(path) as f:
data = await f.read()
py = tomlkit.loads(data)
return ModelCard.model_validate(py)
async def save(self, path: Path):
async with await open_file(path, "w") as f:
py = self.model_dump()
data = tomlkit.dumps(py) # pyright: ignore[reportUnknownMemberType]
await f.write(data)
MODEL_CARDS: dict[str, ModelCard] = {
# deepseek v3
# "deepseek-v3-0324:4bit": ModelCard(
# short_id="deepseek-v3-0324:4bit",
# model_id="mlx-community/DeepSeek-V3-0324-4bit",
# name="DeepSeek V3 0324 (4-bit)",
# description="""DeepSeek V3 is a large language model trained on the DeepSeek V3 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3-0324-4bit"),
# pretty_name="DeepSeek V3 0324 (4-bit)",
# storage_size=Memory.from_kb(409706307),
# n_layers=61,
# ),
# ),
# "deepseek-v3-0324": ModelCard(
# short_id="deepseek-v3-0324",
# model_id="mlx-community/DeepSeek-v3-0324-8bit",
# name="DeepSeek V3 0324 (8-bit)",
# description="""DeepSeek V3 is a large language model trained on the DeepSeek V3 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-v3-0324-8bit"),
# pretty_name="DeepSeek V3 0324 (8-bit)",
# storage_size=Memory.from_kb(754706307),
# n_layers=61,
# ),
# ),
"deepseek-v3.1-4bit": ModelCard(
short_id="deepseek-v3.1-4bit",
model_id=ModelId("mlx-community/DeepSeek-V3.1-4bit"),
@@ -62,6 +70,63 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# "deepseek-v3.2": ModelCard(
# short_id="deepseek-v3.2",
# model_id=ModelId("mlx-community/DeepSeek-V3.2-8bit"),
# name="DeepSeek V3.2 (8-bit)",
# description="""DeepSeek V3.2 is a large language model trained on the DeepSeek V3.2 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3.2-8bit"),
# pretty_name="DeepSeek V3.2 (8-bit)",
# storage_size=Memory.from_kb(754706307),
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# "deepseek-v3.2-4bit": ModelCard(
# short_id="deepseek-v3.2-4bit",
# model_id=ModelId("mlx-community/DeepSeek-V3.2-4bit"),
# name="DeepSeek V3.2 (4-bit)",
# description="""DeepSeek V3.2 is a large language model trained on the DeepSeek V3.2 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-V3.2-4bit"),
# pretty_name="DeepSeek V3.2 (4-bit)",
# storage_size=Memory.from_kb(754706307 // 2), # TODO !!!!!
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# deepseek r1
# "deepseek-r1-0528-4bit": ModelCard(
# short_id="deepseek-r1-0528-4bit",
# model_id="mlx-community/DeepSeek-R1-0528-4bit",
# name="DeepSeek-R1-0528 (4-bit)",
# description="""DeepSeek R1 is a large language model trained on the DeepSeek R1 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-R1-0528-4bit"),
# pretty_name="DeepSeek R1 671B (4-bit)",
# storage_size=Memory.from_kb(409706307),
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# "deepseek-r1-0528": ModelCard(
# short_id="deepseek-r1-0528",
# model_id="mlx-community/DeepSeek-R1-0528-8bit",
# name="DeepSeek-R1-0528 (8-bit)",
# description="""DeepSeek R1 is a large language model trained on the DeepSeek R1 dataset.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/DeepSeek-R1-0528-8bit"),
# pretty_name="DeepSeek R1 671B (8-bit)",
# storage_size=Memory.from_bytes(754998771712),
# n_layers=61,
# hidden_size=7168,
# ),
# ),
# kimi k2
"kimi-k2-instruct-4bit": ModelCard(
short_id="kimi-k2-instruct-4bit",
@@ -458,9 +523,8 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# glm 4.5
# Needs to be quantized g32 or g16.
"glm-4.5-air-8bit": ModelCard(
# Needs to be quantized g32 or g16 to work with tensor parallel
short_id="glm-4.5-air-8bit",
model_id=ModelId("mlx-community/GLM-4.5-Air-8bit"),
name="GLM 4.5 Air 8bit",
@@ -490,81 +554,19 @@ MODEL_CARDS: dict[str, ModelCard] = {
supports_tensor=True,
),
),
# glm 4.7
"glm-4.7-4bit": ModelCard(
short_id="glm-4.7-4bit",
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
name="GLM 4.7 4bit",
description="GLM 4.7 4bit",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/GLM-4.7-4bit"),
pretty_name="GLM 4.7 4bit",
storage_size=Memory.from_bytes(198556925568),
n_layers=91,
hidden_size=5120,
supports_tensor=True,
),
),
"glm-4.7-6bit": ModelCard(
short_id="glm-4.7-6bit",
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
name="GLM 4.7 6bit",
description="GLM 4.7 6bit",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/GLM-4.7-6bit"),
pretty_name="GLM 4.7 6bit",
storage_size=Memory.from_bytes(286737579648),
n_layers=91,
hidden_size=5120,
supports_tensor=True,
),
),
"glm-4.7-8bit-gs32": ModelCard(
short_id="glm-4.7-8bit-gs32",
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
name="GLM 4.7 8bit (gs32)",
description="GLM 4.7 8bit (gs32)",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/GLM-4.7-8bit-gs32"),
pretty_name="GLM 4.7 8bit (gs32)",
storage_size=Memory.from_bytes(396963397248),
n_layers=91,
hidden_size=5120,
supports_tensor=True,
),
),
# minimax-m2
"minimax-m2.1-8bit": ModelCard(
short_id="minimax-m2.1-8bit",
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
name="MiniMax M2.1 8bit",
description="MiniMax M2.1 8bit",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/MiniMax-M2.1-8bit"),
pretty_name="MiniMax M2.1 8bit",
storage_size=Memory.from_bytes(242986745856),
n_layers=61,
hidden_size=3072,
supports_tensor=True,
),
),
"minimax-m2.1-3bit": ModelCard(
short_id="minimax-m2.1-3bit",
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
name="MiniMax M2.1 3bit",
description="MiniMax M2.1 3bit",
tags=[],
metadata=ModelMetadata(
model_id=ModelId("mlx-community/MiniMax-M2.1-3bit"),
pretty_name="MiniMax M2.1 3bit",
storage_size=Memory.from_bytes(100086644736),
n_layers=61,
hidden_size=3072,
supports_tensor=True,
),
),
# "devstral-2-123b-instruct-2512-8bit": ModelCard(
# short_id="devstral-2-123b-instruct-2512-8bit",
# model_id=ModelId("mlx-community/Devstral-2-123B-Instruct-2512-8bit"),
# name="Devstral 2 123B Instruct 2512 (8-bit, MLX)",
# description="""Mistral AI's Devstral 2 123B Instruct (2512) is an agentic coding model.""",
# tags=[],
# metadata=ModelMetadata(
# model_id=ModelId("mlx-community/Devstral-2-123B-Instruct-2512-8bit"),
# pretty_name="Devstral 2 123B Instruct 2512 (8-bit, MLX)",
# storage_size=Memory.from_kb(133_000_000),
# n_layers=88,
# hidden_size=12288,
# supports_tensor=True,
# ),
# ),
}

View File

@@ -10,24 +10,18 @@ from mlx.nn.layers.distributed import (
shard_linear,
sum_gradients,
)
from mlx_lm.models.cache import (
_BaseCache, # pyright: ignore[reportPrivateUsage]
)
from mlx_lm.models.deepseek_v3 import DeepseekV3MLP
from mlx_lm.models.deepseek_v3 import Model as DeepseekV3Model
from mlx_lm.models.deepseek_v32 import DeepseekV32MLP
from mlx_lm.models.deepseek_v32 import Model as DeepseekV32Model
from mlx_lm.models.glm4_moe import Model as Glm4MoeModel
from mlx_lm.models.glm4_moe import MoE
from mlx_lm.models.gpt_oss import GptOssMoeModel
from mlx_lm.models.gpt_oss import Model as GptOssModel
from mlx_lm.models.llama import Model as LlamaModel
from mlx_lm.models.minimax import Model as MiniMaxModel
from mlx_lm.models.ministral3 import Model as Ministral3Model
from mlx_lm.models.qwen3_moe import Model as Qwen3MoeModel
from mlx_lm.models.qwen3_moe import Qwen3MoeSparseMoeBlock
from mlx_lm.models.qwen3_next import Model as Qwen3NextModel
from mlx_lm.models.qwen3_next import Qwen3NextSparseMoeBlock
from exo.shared.logging import logger
from exo.shared.types.worker.shards import PipelineShardMetadata
from exo.shared.types.worker.shards import (
PipelineShardMetadata,
)
class _LayerCallable(Protocol):
@@ -97,6 +91,8 @@ class PipelineLastLayer(CustomMlxLayer):
x, *args, **kwargs
).arguments.get("cache", None)
assert cache is None or issubclass(type(cache), _BaseCache) # type: ignore
output: mx.array = self.original_layer(x, *args, **kwargs)
if self.r != self.s - 1:
@@ -104,6 +100,7 @@ class PipelineLastLayer(CustomMlxLayer):
output, (self.r + 1) % self.s, group=self.group
)
if cache is not None:
# This change happened upstream - check out mlx github somewhere??
cache.keys = mx.depends(cache.keys, output) # type: ignore[reportUnknownMemberType]
output = mx.distributed.all_gather(output, group=self.group)[-output.shape[0] :]
@@ -135,6 +132,24 @@ def _get_layers(inner_model_instance: nn.Module) -> list[_LayerCallable]:
return layers
def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
inner_model_instance = _inner_model(model)
if hasattr(inner_model_instance, "layers"):
inner_model_instance.layers = layers
# Update DeepSeek V3 specific parameters when layers are shrunk
if isinstance(model, DeepseekV3Model) and hasattr(
inner_model_instance, "num_layers"
):
inner_model_instance.start_idx = 0
inner_model_instance.end_idx = len(layers)
inner_model_instance.num_layers = len(layers)
elif hasattr(inner_model_instance, "h"):
inner_model_instance.h = layers
else:
raise ValueError("Model must have either a 'layers' or 'h' attribute")
def pipeline_auto_parallel(
model: nn.Module,
group: mx.distributed.Group,
@@ -150,7 +165,8 @@ def pipeline_auto_parallel(
"""
inner_model_instance: nn.Module = _inner_model(model)
layers = _get_layers(inner_model_instance)
# Handle both model.layers and model.h cases
layers: list[_LayerCallable] = _get_layers(inner_model_instance)
start_layer, end_layer = model_shard_meta.start_layer, model_shard_meta.end_layer
device_rank, world_size = model_shard_meta.device_rank, model_shard_meta.world_size
@@ -164,17 +180,6 @@ def pipeline_auto_parallel(
group=group,
)
if isinstance(inner_model_instance, GptOssMoeModel):
inner_model_instance.layer_types = inner_model_instance.layer_types[ # type: ignore
start_layer:end_layer
]
inner_model_instance.swa_idx = inner_model_instance.layer_types.index( # type: ignore
"sliding_attention"
)
inner_model_instance.ga_idx = inner_model_instance.layer_types.index( # type: ignore
"full_attention"
)
_set_layers(model, layers)
assert isinstance(layers, list), (
@@ -199,44 +204,18 @@ def tensor_auto_parallel(
group=group,
)
segments: int = 1
def _all_to_sharded(path: str, weight: mx.array):
if path.endswith("bias"):
logger.info(f"Sharding bias for {path} - all to sharded")
return weight.ndim - 1, segments
return max(weight.ndim - 2, 0), segments
all_to_sharded_linear_in_place = partial(
shard_inplace,
sharding=_all_to_sharded, # type: ignore
sharding="all-to-sharded",
group=group,
)
n = group.size()
def _sharded_to_all(path: str, weight: mx.array):
if path.endswith("bias"):
logger.info(f"Sharding bias for {path} - sharded to all")
weight /= n
return None
return -1, segments
sharded_to_all_linear_in_place = partial(
shard_inplace,
sharding=_sharded_to_all, # type: ignore
sharding="sharded-to-all",
group=group,
)
if hasattr(model, "shard"):
try:
model.shard(group) # type: ignore
return model
except (AttributeError, TypeError, NameError):
pass
if isinstance(model, (LlamaModel, Ministral3Model)):
logger.warning("shouldn't be hit - upstream sharding exists")
if isinstance(model, LlamaModel):
tensor_parallel_sharding_strategy = LlamaShardingStrategy(
group,
all_to_sharded_linear,
@@ -244,8 +223,7 @@ def tensor_auto_parallel(
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, (DeepseekV3Model, DeepseekV32Model)):
logger.warning("shouldn't be hit - upstream sharding exists")
elif isinstance(model, DeepseekV3Model):
tensor_parallel_sharding_strategy = DeepSeekShardingStrategy(
group,
all_to_sharded_linear,
@@ -253,15 +231,7 @@ def tensor_auto_parallel(
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, MiniMaxModel):
tensor_parallel_sharding_strategy = MiniMaxShardingStrategy(
group,
all_to_sharded_linear,
sharded_to_all_linear,
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, (Qwen3MoeModel, Glm4MoeModel, Qwen3NextModel)):
elif isinstance(model, Qwen3MoeModel):
tensor_parallel_sharding_strategy = QwenShardingStrategy(
group,
all_to_sharded_linear,
@@ -269,15 +239,6 @@ def tensor_auto_parallel(
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
elif isinstance(model, GptOssModel):
tensor_parallel_sharding_strategy = GptOssShardingStrategy(
group,
all_to_sharded_linear,
sharded_to_all_linear,
all_to_sharded_linear_in_place,
sharded_to_all_linear_in_place,
)
else:
raise ValueError(f"Unsupported model type: {type(model)}")
@@ -323,32 +284,6 @@ class LlamaShardingStrategy(TensorParallelShardingStrategy):
return model
def _set_layers(model: nn.Module, layers: list[_LayerCallable]) -> None:
inner_model_instance = _inner_model(model)
if hasattr(inner_model_instance, "layers"):
inner_model_instance.layers = layers
# Update DeepSeek V3 specific parameters when layers are shrunk
if isinstance(
model, (DeepseekV3Model, DeepseekV32Model, Glm4MoeModel)
) and hasattr(inner_model_instance, "num_layers"):
logger.info(
f"Setting num_layers to {len(layers)} for model {model.model.__class__.__name__}"
)
inner_model_instance.start_idx = 0
inner_model_instance.end_idx = len(layers)
inner_model_instance.num_layers = len(layers)
elif isinstance(model, Qwen3MoeModel):
logger.info(
f"Setting num_hidden_layers to {len(layers)} for model {model.model.__class__.__name__}"
)
inner_model_instance.num_hidden_layers = len(layers)
elif hasattr(inner_model_instance, "h"):
inner_model_instance.h = layers
else:
raise ValueError("Model must have either a 'layers' or 'h' attribute")
class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
def shard_model(self, model: nn.Module) -> nn.Module:
model = cast(DeepseekV3Model, model)
@@ -369,7 +304,7 @@ class DeepSeekShardingStrategy(TensorParallelShardingStrategy):
layer.self_attn.num_heads //= self.N
# Shard the MLP
if isinstance(layer.mlp, (DeepseekV3MLP, DeepseekV32MLP)):
if isinstance(layer.mlp, DeepseekV3MLP):
layer.mlp.gate_proj = self.all_to_sharded_linear(layer.mlp.gate_proj)
layer.mlp.down_proj = self.sharded_to_all_linear(layer.mlp.down_proj)
layer.mlp.up_proj = self.all_to_sharded_linear(layer.mlp.up_proj)
@@ -403,35 +338,6 @@ class ShardedDeepseekV3MoE(CustomMlxLayer):
return y
class MiniMaxShardingStrategy(TensorParallelShardingStrategy):
def shard_model(self, model: nn.Module) -> nn.Module:
model = cast(MiniMaxModel, model)
for layer in model.layers:
# Shard the self attention
layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
layer.self_attn.num_attention_heads //= self.N
layer.self_attn.num_key_value_heads //= self.N
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
self.all_to_sharded_linear_in_place(
layer.block_sparse_moe.switch_mlp.gate_proj
)
self.sharded_to_all_linear_in_place(
layer.block_sparse_moe.switch_mlp.down_proj
)
self.all_to_sharded_linear_in_place(
layer.block_sparse_moe.switch_mlp.up_proj
)
layer.block_sparse_moe = ShardedQwenMoE(layer.block_sparse_moe) # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
layer.block_sparse_moe.sharding_group = self.group
return model
class QwenShardingStrategy(TensorParallelShardingStrategy):
def shard_model(self, model: nn.Module) -> nn.Module:
model = cast(Qwen3MoeModel, model)
@@ -446,13 +352,11 @@ class QwenShardingStrategy(TensorParallelShardingStrategy):
# Shard the MoE. Shard in place since the MoE should be responsible
# for aggregating the results.
if isinstance(
layer.mlp, (Qwen3MoeSparseMoeBlock, MoE, Qwen3NextSparseMoeBlock)
):
if isinstance(layer.mlp, Qwen3MoeSparseMoeBlock):
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.switch_mlp.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.switch_mlp.up_proj)
layer.mlp = ShardedQwenMoE(layer.mlp) # pyright: ignore[reportAttributeAccessIssue, reportArgumentType]
layer.mlp = ShardedQwenMoE(layer.mlp) # type: ignore
layer.mlp.sharding_group = self.group
# Shard the MLP
@@ -476,50 +380,3 @@ class ShardedQwenMoE(CustomMlxLayer):
if self.sharding_group is not None:
y = mx.distributed.all_sum(y, group=self.sharding_group)
return y
class GptOssShardingStrategy(TensorParallelShardingStrategy):
def shard_model(self, model: nn.Module) -> nn.Module:
model = cast(GptOssMoeModel, model)
for layer in model.layers:
layer.self_attn.q_proj = self.all_to_sharded_linear(layer.self_attn.q_proj)
layer.self_attn.k_proj = self.all_to_sharded_linear(layer.self_attn.k_proj)
layer.self_attn.v_proj = self.all_to_sharded_linear(layer.self_attn.v_proj)
layer.self_attn.o_proj = self.sharded_to_all_linear(layer.self_attn.o_proj)
layer.self_attn.num_attention_heads //= self.N
layer.self_attn.num_key_value_heads //= self.N
layer.self_attn.num_key_value_groups = (
layer.self_attn.num_attention_heads
// layer.self_attn.num_key_value_heads
)
layer.self_attn.sinks = layer.self_attn.sinks[
layer.self_attn.num_attention_heads
* self.group.rank() : layer.self_attn.num_attention_heads
* (self.group.rank() + 1)
]
self.all_to_sharded_linear_in_place(layer.mlp.experts.gate_proj)
self.sharded_to_all_linear_in_place(layer.mlp.experts.down_proj)
self.all_to_sharded_linear_in_place(layer.mlp.experts.up_proj)
layer.mlp = ShardedGptOssMoE(layer.mlp) # type: ignore
layer.mlp.sharding_group = self.group
return model
class ShardedGptOssMoE(CustomMlxLayer):
def __init__(self, layer: nn.Module):
super().__init__(layer)
self.sharding_group: mx.distributed.Group | None = None
def __call__(self, x: mx.array) -> mx.array:
if self.sharding_group is not None:
x = sum_gradients(self.sharding_group)(x)
y = self.original_layer(x)
if self.sharding_group is not None:
y = mx.distributed.all_sum(y, group=self.sharding_group)
return y

View File

@@ -89,12 +89,6 @@ async def assert_downloads():
await sd.ensure_shard(
await build_full_shard(MODEL_CARDS["gpt-oss-20b-4bit"].model_id)
)
await sd.ensure_shard(
await build_full_shard(MODEL_CARDS["glm-4.7-8bit-gs32"].model_id)
)
await sd.ensure_shard(
await build_full_shard(MODEL_CARDS["minimax-m2.1-8bit"].model_id)
)
async def ring_backend(test: Tests):

uv.lock generated
View File

File diff suppressed because it is too large.