mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-25 00:59:28 -04:00
Compare commits
6 Commits
master
...
feat/darwi
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4f7bf33b2d | ||
|
|
5e3774dfe3 | ||
|
|
bfb9a40d58 | ||
|
|
af7d0e8b40 | ||
|
|
7743a0abc0 | ||
|
|
3447b28bbd |
7
.github/backend-matrix.yml
vendored
7
.github/backend-matrix.yml
vendored
@@ -4974,6 +4974,13 @@ includeDarwin:
|
||||
- backend: "kitten-tts"
|
||||
tag-suffix: "-metal-darwin-arm64-kitten-tts"
|
||||
build-type: "mps"
|
||||
# vLLM on Apple Silicon via vllm-metal (MLX). The install is custom
|
||||
# (backend/python/vllm/install.sh has a darwin branch); lang stays python so
|
||||
# backend_build_darwin.yml drives it through build-darwin-python-backend ->
|
||||
# scripts/build/python-darwin.sh, which runs the backend's install.sh.
|
||||
- backend: "vllm"
|
||||
tag-suffix: "-metal-darwin-arm64-vllm"
|
||||
build-type: "mps"
|
||||
- backend: "liquid-audio"
|
||||
tag-suffix: "-metal-darwin-arm64-liquid-audio"
|
||||
build-type: "mps"
|
||||
|
||||
55
.github/bump_vllm_metal.sh
vendored
Executable file
55
.github/bump_vllm_metal.sh
vendored
Executable file
@@ -0,0 +1,55 @@
|
||||
#!/bin/bash
|
||||
# Bump the single vllm-metal pin (VLLM_METAL_VERSION) in the vLLM backend's
|
||||
# darwin (Apple Silicon) install path. The macOS/Metal build
|
||||
# (backend/python/vllm/install.sh, Darwin branch) installs vllm-metal, which is
|
||||
# version-locked to a specific vLLM source release. install.sh derives that vLLM
|
||||
# version at build time from vllm-metal's own installer (`vllm_v=`) at the pinned
|
||||
# tag, so there is only ONE value to bump here -- mirroring bump_vllm_wheel.sh,
|
||||
# which bumps the Linux cu130 wheel pin.
|
||||
#
|
||||
# This deliberately tracks vllm-project/vllm-metal, NOT vllm-project/vllm: the
|
||||
# darwin build can only use the exact vLLM version vllm-metal supports, so it may
|
||||
# lag the Linux pin (requirements-cublas13-after.txt) until vllm-metal catches up.
|
||||
set -xe
|
||||
REPO=$1 # vllm-project/vllm-metal
|
||||
FILE=$2 # backend/python/vllm/install.sh
|
||||
VAR=$3 # VLLM_METAL_VERSION (used for the workflow's output file names)
|
||||
|
||||
if [ -z "$FILE" ] || [ -z "$REPO" ] || [ -z "$VAR" ]; then
|
||||
echo "usage: $0 <repo> <install-file> <var-name>" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# vllm-metal ships frequent dev releases, all flagged as non-prerelease, so
|
||||
# /releases/latest returns the newest one (with its cp312 wheel asset).
|
||||
LATEST_TAG=$(curl -sS -H "Accept: application/vnd.github+json" \
|
||||
"https://api.github.com/repos/$REPO/releases/latest" \
|
||||
| python3 -c "import json,sys; print(json.load(sys.stdin)['tag_name'])")
|
||||
|
||||
# The coupled vLLM source version lives in vllm-metal's installer at that tag.
|
||||
NEW_VLLM_VERSION=$(curl -fsSL \
|
||||
"https://raw.githubusercontent.com/$REPO/$LATEST_TAG/install.sh" \
|
||||
| grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -1 | cut -d'"' -f2)
|
||||
|
||||
if [ -z "$LATEST_TAG" ] || [ -z "$NEW_VLLM_VERSION" ]; then
|
||||
echo "Could not resolve vllm-metal tag ($LATEST_TAG) or its vllm_v ($NEW_VLLM_VERSION)." >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
set +e
|
||||
CURRENT_TAG=$(grep -oE 'VLLM_METAL_VERSION="[^"]*"' "$FILE" | head -1 | cut -d'"' -f2)
|
||||
set -e
|
||||
|
||||
# Rewrite the single pin. install.sh derives VLLM_VERSION from this tag at build
|
||||
# time, so there is nothing else to touch. peter-evans/create-pull-request opens
|
||||
# no PR on a clean tree, so a no-op rewrite (already current) is safe.
|
||||
sed -i "$FILE" \
|
||||
-e "s|VLLM_METAL_VERSION=\"[^\"]*\"|VLLM_METAL_VERSION=\"$LATEST_TAG\"|"
|
||||
|
||||
if [ -z "$CURRENT_TAG" ]; then
|
||||
echo "Could not find VLLM_METAL_VERSION=\"...\" in $FILE." >&2
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "vllm-metal ${CURRENT_TAG} -> ${LATEST_TAG} (builds vLLM ${NEW_VLLM_VERSION}): https://github.com/$REPO/releases/tag/${LATEST_TAG}" >> "${VAR}_message.txt"
|
||||
echo "${LATEST_TAG}" >> "${VAR}_commit.txt"
|
||||
36
.github/workflows/bump_deps.yaml
vendored
36
.github/workflows/bump_deps.yaml
vendored
@@ -154,3 +154,39 @@ jobs:
|
||||
branch: "update/VLLM_VERSION"
|
||||
body: ${{ steps.bump.outputs.message }}
|
||||
signoff: true
|
||||
|
||||
bump-vllm-metal:
|
||||
# The darwin (Apple Silicon) vLLM build installs vllm-metal, which is locked
|
||||
# to a specific vLLM source release. install.sh pins only VLLM_METAL_VERSION
|
||||
# (the wheel release) and derives VLLM_VERSION from that tag at build time, so
|
||||
# this job tracks vllm-project/vllm-metal and rewrites that one pin. Separate from
|
||||
# bump-vllm-wheel because darwin follows vllm-metal, not vllm/vllm latest.
|
||||
if: github.repository == 'mudler/LocalAI'
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- uses: actions/checkout@v7
|
||||
- name: Bump vllm-metal pin 🔧
|
||||
id: bump
|
||||
run: |
|
||||
bash .github/bump_vllm_metal.sh vllm-project/vllm-metal backend/python/vllm/install.sh VLLM_METAL_VERSION
|
||||
{
|
||||
echo 'message<<EOF'
|
||||
cat "VLLM_METAL_VERSION_message.txt"
|
||||
echo EOF
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
{
|
||||
echo 'commit<<EOF'
|
||||
cat "VLLM_METAL_VERSION_commit.txt"
|
||||
echo EOF
|
||||
} >> "$GITHUB_OUTPUT"
|
||||
rm -rfv VLLM_METAL_VERSION_message.txt VLLM_METAL_VERSION_commit.txt
|
||||
- name: Create Pull Request
|
||||
uses: peter-evans/create-pull-request@v8
|
||||
with:
|
||||
token: ${{ secrets.UPDATE_BOT_TOKEN }}
|
||||
push-to-fork: ci-forks/LocalAI
|
||||
commit-message: ':arrow_up: Update vllm-project/vllm-metal (darwin)'
|
||||
title: 'chore: :arrow_up: Update vllm-metal (darwin) to `${{ steps.bump.outputs.commit }}`'
|
||||
branch: "update/VLLM_METAL_VERSION"
|
||||
body: ${{ steps.bump.outputs.message }}
|
||||
signoff: true
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
IK_LLAMA_VERSION?=d5507e33ae7ee2b7b41475f08044d3bde3b839ee
|
||||
IK_LLAMA_VERSION?=7ccf1d209588962b96eacca325b37e9b3e8faf5e
|
||||
LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# omnivoice.cpp version
|
||||
OMNIVOICE_REPO?=https://github.com/ServeurpersoCom/omnivoice.cpp
|
||||
OMNIVOICE_VERSION?=0f37401bebe9b20c0160a888e592108fc1d17607
|
||||
OMNIVOICE_VERSION?=96d30169afd5e6bb3fd6a0e9be0eb505bfe81fcd
|
||||
SO_TARGET?=libgomnivoicecpp.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -645,6 +645,7 @@
|
||||
nvidia-cuda-13: "cuda13-vllm"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm"
|
||||
cpu: "cpu-vllm"
|
||||
metal: "metal-vllm"
|
||||
- &sglang
|
||||
name: "sglang"
|
||||
license: apache-2.0
|
||||
@@ -2929,6 +2930,17 @@
|
||||
nvidia-cuda-13: "cuda13-vllm-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-vllm-development"
|
||||
cpu: "cpu-vllm-development"
|
||||
metal: "metal-vllm-development"
|
||||
- !!merge <<: *vllm
|
||||
name: "metal-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-metal-darwin-arm64-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-metal-darwin-arm64-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "metal-vllm-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-vllm"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-vllm
|
||||
- !!merge <<: *vllm
|
||||
name: "cuda12-vllm"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-vllm"
|
||||
|
||||
@@ -457,9 +457,14 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if last_output is None or not getattr(last_output, "prompt_logprobs", None):
|
||||
context.set_code(grpc.StatusCode.INTERNAL)
|
||||
context.set_details("vLLM did not return prompt_logprobs")
|
||||
_pl = getattr(last_output, "prompt_logprobs", None) if last_output is not None else None
|
||||
# Some engines accept the prompt_logprobs request but return a
|
||||
# list of all-None entries instead of computing them (observed
|
||||
# with vllm-metal's MLX backend on macOS). Treat that as
|
||||
# unsupported rather than silently scoring every candidate as 0.
|
||||
if not _pl or all(e is None for e in _pl):
|
||||
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
|
||||
context.set_details("This backend did not return prompt_logprobs; scoring is unsupported on this engine (e.g. vllm-metal / MLX on macOS).")
|
||||
return backend_pb2.ScoreResponse()
|
||||
|
||||
prompt_logprobs = last_output.prompt_logprobs
|
||||
|
||||
@@ -43,6 +43,24 @@ if [ "x${BUILD_PROFILE}" == "xcublas13" ]; then
|
||||
EXTRA_PIP_INSTALL_FLAGS+=" --index-strategy=unsafe-best-match"
|
||||
fi
|
||||
|
||||
# Apple Silicon (Metal/MLX) via vllm-metal.
|
||||
# vllm-metal (github.com/vllm-project/vllm-metal) brings vLLM to macOS on Apple
|
||||
# Silicon: it registers through vLLM's platform-plugin entry point
|
||||
# (metal -> vllm_metal:register), MetalPlatform activates, and the vLLM v1
|
||||
# AsyncLLM engine runs on the GPU through MLX. LocalAI's backend.py is UNCHANGED
|
||||
# on darwin — AsyncEngineArgs(...) -> AsyncLLMEngine.from_engine_args transparently
|
||||
# resolves to the MLX engine (proven on a real M4 / macOS 26.5 against Qwen3-0.6B).
|
||||
#
|
||||
# vllm-metal REQUIRES Python 3.12, so force the portable CPython before the venv
|
||||
# is created (ensureVenv reads PYTHON_VERSION/PYTHON_PATCH/PY_STANDALONE_TAG).
|
||||
# The patch + standalone tag mirror the l4t13 cp312 pin — a known-good
|
||||
# python-build-standalone release that also ships an aarch64-apple-darwin asset.
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
PYTHON_VERSION="3.12"
|
||||
PYTHON_PATCH="12"
|
||||
PY_STANDALONE_TAG="20251120"
|
||||
fi
|
||||
|
||||
# JetPack 7 / L4T arm64 vllm + torch wheels come straight from PyPI now
|
||||
# (torch 2.11+ ships aarch64 + cu130 manylinux wheels and vllm 0.20+ ships
|
||||
# an aarch64 wheel pinned to that torch). They're cp312-only, so bump the
|
||||
@@ -57,11 +75,87 @@ if [ "x${BUILD_PROFILE}" == "xl4t13" ]; then
|
||||
PY_STANDALONE_TAG="20251120"
|
||||
fi
|
||||
|
||||
# ===================== Apple Silicon (Metal/MLX) =====================
|
||||
# Reproduce vllm-metal's upstream installer
|
||||
# (curl -fsSL https://raw.githubusercontent.com/vllm-project/vllm-metal/main/install.sh)
|
||||
# but INTO LocalAI's managed venv (ensureVenv) instead of a throwaway
|
||||
# ~/.venv-vllm-metal, so the backend integrates with LocalAI's venv lifecycle
|
||||
# (portable CPython, _makeVenvPortable relocation, runtime activation). The
|
||||
# normal CUDA/CPU installRequirements is skipped on darwin — there is no
|
||||
# macOS/arm64 vLLM wheel on PyPI; vLLM is built from source and the MLX engine
|
||||
# is layered on by the vllm-metal wheel.
|
||||
if [ "$(uname -s)" = "Darwin" ]; then
|
||||
# Create/activate the portable 3.12 venv. On darwin USE_PIP=true and
|
||||
# PORTABLE_PYTHON=true (set by scripts/build/python-darwin.sh), so this is a
|
||||
# `python -m venv` based, relocatable venv.
|
||||
ensureVenv
|
||||
|
||||
# vllm-metal's installer drives everything through `uv`: building vLLM from
|
||||
# the CPU requirements needs `--index-strategy unsafe-best-match` (mixes the
|
||||
# pytorch CPU channel with PyPI), a flag plain pip does not have. The darwin
|
||||
# venv is pip-based, so bootstrap uv into it. uv honours $VIRTUAL_ENV (set by
|
||||
# libbackend's _activateVenv) and installs into THIS venv — same pattern the
|
||||
# intel branch below relies on.
|
||||
pip install uv
|
||||
|
||||
# The ONLY darwin version pin -- AUTO-BUMPED by .github/bump_vllm_metal.sh,
|
||||
# which tracks vllm-project/vllm-metal releases (NOT vllm/vllm latest). Keep
|
||||
# it as a plain double-quoted assignment on its own line so the bumper's sed
|
||||
# can rewrite it. Darwin therefore follows vllm-metal and can lag the Linux
|
||||
# vllm pin (requirements-cublas13-after.txt, bumped independently against
|
||||
# vllm/vllm) until vllm-metal supports a newer vLLM.
|
||||
VLLM_METAL_VERSION="v0.3.0.dev20260622062346"
|
||||
|
||||
# The coupled vLLM source version is whatever this vllm-metal release builds
|
||||
# against -- it declares it in its own installer as `vllm_v=`. Derive it from
|
||||
# the PINNED tag rather than hardcoding a second value that could drift. The
|
||||
# tag is immutable, so this stays reproducible across rebuilds.
|
||||
VLLM_VERSION=$(curl -fsSL "https://raw.githubusercontent.com/vllm-project/vllm-metal/${VLLM_METAL_VERSION}/install.sh" \
|
||||
| grep -oE 'vllm_v="[0-9]+\.[0-9]+\.[0-9]+"' | head -n1 | cut -d'"' -f2)
|
||||
if [ -z "${VLLM_VERSION}" ]; then
|
||||
echo "ERROR: could not derive the vLLM version from vllm-metal ${VLLM_METAL_VERSION}" >&2
|
||||
exit 1
|
||||
fi
|
||||
echo "vllm-metal ${VLLM_METAL_VERSION} builds against vLLM ${VLLM_VERSION}"
|
||||
|
||||
_vllm_src=$(mktemp -d)
|
||||
trap 'rm -rf "${_vllm_src}"' EXIT
|
||||
pushd "${_vllm_src}"
|
||||
# 1) Build vLLM ${VLLM_VERSION} from the release source tarball against
|
||||
# the CPU requirements. vllm-metal layers its MLX platform plugin on
|
||||
# top of this exact build.
|
||||
curl -fsSL -o "vllm-${VLLM_VERSION}.tar.gz" \
|
||||
"https://github.com/vllm-project/vllm/releases/download/v${VLLM_VERSION}/vllm-${VLLM_VERSION}.tar.gz"
|
||||
tar -xzf "vllm-${VLLM_VERSION}.tar.gz"
|
||||
pushd "vllm-${VLLM_VERSION}"
|
||||
uv pip install -r requirements/cpu.txt --index-strategy unsafe-best-match
|
||||
# -Wno-parentheses: clang on macOS treats one of vLLM's C++ warnings
|
||||
# as an error without it (matches the upstream installer's CXXFLAGS).
|
||||
CXXFLAGS="-Wno-parentheses" uv pip install .
|
||||
popd
|
||||
popd
|
||||
|
||||
# 2) Install the prebuilt vllm-metal wheel for the PINNED release. It pulls
|
||||
# mlx / mlx-metal as deps and registers the `metal` platform plugin that
|
||||
# backend.py resolves to at engine-init time. Build the release-asset URL
|
||||
# deterministically (tag + the cp312/arm64 wheel name) rather than querying
|
||||
# api.github.com, whose unauthenticated rate limit (60/hr per IP) 403s on
|
||||
# shared CI runners. The wheel version is the tag without its leading 'v'.
|
||||
_metal_wheel="vllm_metal-${VLLM_METAL_VERSION#v}-cp312-cp312-macosx_11_0_arm64.whl"
|
||||
_metal_wheel_url="https://github.com/vllm-project/vllm-metal/releases/download/${VLLM_METAL_VERSION}/${_metal_wheel}"
|
||||
echo "Installing vllm-metal wheel: ${_metal_wheel_url}"
|
||||
uv pip install "${_metal_wheel_url}"
|
||||
|
||||
# Generate the gRPC stubs (backend_pb2*). installRequirements normally does
|
||||
# this via runProtogen at the end; we skipped installRequirements on darwin,
|
||||
# so call it explicitly here.
|
||||
runProtogen
|
||||
|
||||
# Intel XPU has no upstream-published vllm wheels, so we always build vllm
|
||||
# from source against torch-xpu and replace the default triton with
|
||||
# triton-xpu (matching torch 2.11). Mirrors the upstream procedure:
|
||||
# https://github.com/vllm-project/vllm/blob/main/docs/getting_started/installation/gpu.xpu.inc.md
|
||||
if [ "x${BUILD_TYPE}" == "xintel" ]; then
|
||||
elif [ "x${BUILD_TYPE}" == "xintel" ]; then
|
||||
# Hide requirements-intel-after.txt so installRequirements doesn't
|
||||
# try `pip install vllm` (would either fail or grab a non-XPU wheel).
|
||||
_intel_after="${backend_dir}/requirements-intel-after.txt"
|
||||
|
||||
@@ -4,4 +4,7 @@
|
||||
# instead — the cublas13 case in install.sh adds --index-strategy=unsafe-best-match
|
||||
# so uv consults this index alongside PyPI.
|
||||
--extra-index-url https://wheels.vllm.ai/0.23.0/cu130
|
||||
# VERSION COUPLING: darwin/Apple-Silicon builds use vllm-metal (see install.sh),
|
||||
# which selects its own vLLM version from the pinned vllm-metal release rather than
|
||||
# this file, so macOS/Metal builds may lag this pin until vllm-metal catches up.
|
||||
vllm==0.23.0
|
||||
|
||||
@@ -82,7 +82,6 @@
|
||||
"tier": {
|
||||
"cpu": "CPU-only",
|
||||
"gpu-small": "GPU",
|
||||
"gpu-mid": "GPU",
|
||||
"gpu-large": "GPU"
|
||||
},
|
||||
"cpuNote": "No GPU detected — these small models stay responsive on CPU.",
|
||||
|
||||
@@ -2,16 +2,6 @@
|
||||
"title": "Install Models",
|
||||
"subtitle": "Browse and install AI models from the gallery",
|
||||
"models": "Models",
|
||||
"recommended": {
|
||||
"title": "Recommended for your hardware",
|
||||
"cpuNote": "No GPU detected - small models that stay responsive on CPU.",
|
||||
"gpuNote": "Sized to fit your available VRAM with room for context.",
|
||||
"install": "Install",
|
||||
"installing": "Installing",
|
||||
"installStarted": "Installing {{model}}…",
|
||||
"installFailed": "Install failed: {{message}}",
|
||||
"dismiss": "Dismiss recommendations"
|
||||
},
|
||||
"stats": {
|
||||
"available": "Available",
|
||||
"installed": "Installed"
|
||||
|
||||
@@ -6409,9 +6409,6 @@ select.input {
|
||||
font-size: 0.875rem;
|
||||
word-break: break-all;
|
||||
}
|
||||
.home-starters-badge {
|
||||
font-size: 0.625rem;
|
||||
}
|
||||
.home-starters-size {
|
||||
margin-left: auto;
|
||||
font-size: 0.75rem;
|
||||
@@ -6419,74 +6416,6 @@ select.input {
|
||||
white-space: nowrap;
|
||||
}
|
||||
|
||||
/* ──────────────────── Models gallery: recommended-for-your-hardware strip ──────────────────── */
|
||||
|
||||
.rec-models {
|
||||
margin-bottom: var(--spacing-md);
|
||||
padding: var(--spacing-md) var(--spacing-lg);
|
||||
}
|
||||
.rec-models-head {
|
||||
display: flex;
|
||||
align-items: flex-start;
|
||||
justify-content: space-between;
|
||||
gap: var(--spacing-md);
|
||||
}
|
||||
.rec-models-title {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: var(--spacing-sm);
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
.rec-models-title i {
|
||||
color: var(--color-primary);
|
||||
}
|
||||
.rec-models-note {
|
||||
font-size: 0.8125rem;
|
||||
color: var(--color-text-secondary);
|
||||
}
|
||||
.rec-models-dismiss {
|
||||
background: none;
|
||||
border: none;
|
||||
color: var(--color-text-muted);
|
||||
cursor: pointer;
|
||||
padding: 4px;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
.rec-models-dismiss:hover {
|
||||
color: var(--color-text-primary);
|
||||
}
|
||||
.rec-models-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fill, minmax(220px, 1fr));
|
||||
gap: var(--spacing-sm);
|
||||
margin-top: var(--spacing-md);
|
||||
}
|
||||
.rec-models-item {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: var(--spacing-xs);
|
||||
padding: var(--spacing-sm) var(--spacing-md);
|
||||
border: 1px solid var(--color-border-subtle);
|
||||
border-radius: var(--radius-md);
|
||||
background: var(--color-bg-primary);
|
||||
}
|
||||
.rec-models-item-name {
|
||||
font-weight: 500;
|
||||
font-size: 0.8125rem;
|
||||
word-break: break-all;
|
||||
}
|
||||
.rec-models-item-meta {
|
||||
display: flex;
|
||||
gap: var(--spacing-sm);
|
||||
font-size: 0.75rem;
|
||||
color: var(--color-text-muted);
|
||||
}
|
||||
.rec-models-item-fit {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
gap: 4px;
|
||||
}
|
||||
|
||||
/* ──────────────────── Home: drop-in endpoint / API compatibility ──────────────────── */
|
||||
|
||||
.home-connect {
|
||||
|
||||
@@ -1,86 +0,0 @@
|
||||
import { useState } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||
|
||||
const DISMISS_KEY = 'localai_rec_models_dismissed'
|
||||
|
||||
// "Recommended for your hardware" strip at the top of the Models gallery. Shares
|
||||
// the hardware-fit ranking with the empty-state starter widget via
|
||||
// useRecommendedModels, but styled for the gallery page and dismissible (the
|
||||
// gallery is a repeat-visit surface, so it shouldn't nag).
|
||||
export default function RecommendedModels({ addToast }) {
|
||||
const { t } = useTranslation('models')
|
||||
const { recommended, tier, loading } = useRecommendedModels({ count: 4 })
|
||||
const [installing, setInstalling] = useState(() => new Set())
|
||||
const [dismissed, setDismissed] = useState(() => {
|
||||
try { return localStorage.getItem(DISMISS_KEY) === '1' } catch { return false }
|
||||
})
|
||||
|
||||
if (loading || dismissed) return null
|
||||
if (!recommended || recommended.length === 0) return null
|
||||
|
||||
const dismiss = () => {
|
||||
try { localStorage.setItem(DISMISS_KEY, '1') } catch { /* ignore */ }
|
||||
setDismissed(true)
|
||||
}
|
||||
|
||||
const install = async (name) => {
|
||||
setInstalling(prev => new Set(prev).add(name))
|
||||
try {
|
||||
await modelsApi.install(name)
|
||||
addToast?.(t('recommended.installStarted', { model: name }), 'success')
|
||||
} catch (err) {
|
||||
addToast?.(t('recommended.installFailed', { message: err.message }), 'error')
|
||||
setInstalling(prev => {
|
||||
const next = new Set(prev)
|
||||
next.delete(name)
|
||||
return next
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
const isGpu = tier.id !== 'cpu'
|
||||
|
||||
return (
|
||||
<div className="rec-models card">
|
||||
<div className="rec-models-head">
|
||||
<div className="rec-models-title">
|
||||
<i className={`fas ${isGpu ? 'fa-microchip' : 'fa-memory'}`} aria-hidden="true" />
|
||||
<strong>{t('recommended.title')}</strong>
|
||||
<span className="rec-models-note">{isGpu ? t('recommended.gpuNote') : t('recommended.cpuNote')}</span>
|
||||
</div>
|
||||
<button type="button" className="rec-models-dismiss" onClick={dismiss} aria-label={t('recommended.dismiss')} title={t('recommended.dismiss')}>
|
||||
<i className="fas fa-times" aria-hidden="true" />
|
||||
</button>
|
||||
</div>
|
||||
<div className="rec-models-grid">
|
||||
{recommended.map(m => {
|
||||
const busy = installing.has(m.name)
|
||||
return (
|
||||
<div key={m.name} className="rec-models-item">
|
||||
<div className="rec-models-item-name">{m.name}</div>
|
||||
<div className="rec-models-item-meta">
|
||||
{isNvfp4Name(m.name) && <span className="badge badge-info">NVFP4</span>}
|
||||
{m.sizeDisplay && <span>{m.sizeDisplay}</span>}
|
||||
{isGpu && m.vramDisplay && (
|
||||
<span className="rec-models-item-fit"><i className="fas fa-microchip" aria-hidden="true" /> {m.vramDisplay}</span>
|
||||
)}
|
||||
</div>
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-primary btn-sm"
|
||||
disabled={busy}
|
||||
onClick={() => install(m.name)}
|
||||
>
|
||||
{busy
|
||||
? (<><i className="fas fa-spinner fa-spin" aria-hidden="true" /> {t('recommended.installing')}</>)
|
||||
: (<><i className="fas fa-download" aria-hidden="true" /> {t('recommended.install')}</>)}
|
||||
</button>
|
||||
</div>
|
||||
)
|
||||
})}
|
||||
</div>
|
||||
</div>
|
||||
)
|
||||
}
|
||||
@@ -1,78 +1,79 @@
|
||||
import { useState } from 'react'
|
||||
import { useState, useEffect, useMemo } from 'react'
|
||||
import { useTranslation } from 'react-i18next'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useRecommendedModels, isNvfp4Name } from '../hooks/useRecommendedModels'
|
||||
import { useResources } from '../hooks/useResources'
|
||||
|
||||
// Static fallback used only when the live gallery / estimates can't be reached
|
||||
// (offline, trimmed gallery). The hook is the primary, data-driven path; these
|
||||
// are real gallery names kept as a safety net so onboarding never shows nothing.
|
||||
// Gemma picks use the QAT (quantization-aware-trained) Q4 builds. NVIDIA boxes
|
||||
// get NVFP4 + MTP variants at the mid/large tiers (see NVIDIA below).
|
||||
const BASE = {
|
||||
cpu: [
|
||||
{ name: 'gemma-4-e2b-it-qat-q4_0', size: '~1.5 GB' },
|
||||
{ name: 'qwen3.5-4b-claude-4.6-opus-reasoning-distilled', size: '~2.5 GB' },
|
||||
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||
{ name: 'lfm2.5-1.2b-instruct', size: '~0.8 GB' },
|
||||
],
|
||||
'gpu-small': [
|
||||
{ name: 'gemma-4-e4b-it-qat-q4_0', size: '~3 GB' },
|
||||
{ name: 'lfm2.5-8b-a1b', size: '~5 GB' },
|
||||
{ name: 'qwen3.5-9b', size: '~5.5 GB' },
|
||||
{ name: 'gemma-4-12b-it-qat-q4_0', size: '~7 GB' },
|
||||
],
|
||||
'gpu-mid': [
|
||||
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||
{ name: 'qwen3.5-27b', size: '~16 GB' },
|
||||
],
|
||||
'gpu-large': [
|
||||
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||
{ name: 'qwen3.6-35b-a3b-claude-4.6-opus-reasoning-distilled', size: '~20 GB' },
|
||||
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||
{ name: 'qwen3.5-35b-a3b-apex', size: '~20 GB' },
|
||||
],
|
||||
}
|
||||
// Curated, hardware-tiered starter models for the empty-state onboarding. Names
|
||||
// are real gallery entries (gallery/index.yaml); we intersect them against the
|
||||
// live gallery at render time so a custom/trimmed gallery degrades gracefully
|
||||
// (unmatched entries simply don't render).
|
||||
//
|
||||
// The guiding rule the maintainer asked for: CPU-only machines should be
|
||||
// steered to genuinely small models (1-4B, Q4) that stay responsive without a
|
||||
// GPU. GPU tiers scale the suggestion up with available VRAM.
|
||||
const SMALL = [
|
||||
{ name: 'llama-3.2-1b-instruct:q4_k_m', size: '~0.8 GB' },
|
||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
||||
{ name: 'qwen3-1.7b', size: '~1.4 GB' },
|
||||
{ name: 'gemma-3-1b-it', size: '~0.8 GB' },
|
||||
]
|
||||
const MID = [
|
||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
||||
{ name: 'gemma-3-4b-it', size: '~3 GB' },
|
||||
{ name: 'llama-3.2-3b-instruct:q4_k_m', size: '~2 GB' },
|
||||
]
|
||||
const LARGE = [
|
||||
{ name: 'meta-llama-3.1-8b-instruct', size: '~5 GB' },
|
||||
{ name: 'qwen3-4b', size: '~2.5 GB' },
|
||||
{ name: 'mistral-7b-instruct-v0.3', size: '~4 GB' },
|
||||
]
|
||||
|
||||
// NVIDIA-only overrides: NVFP4 is a Blackwell-optimised 4-bit format paired with
|
||||
// MTP (multi-token prediction) for speed. Only the mid/large tiers have these.
|
||||
const NVIDIA = {
|
||||
'gpu-mid': [
|
||||
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||
{ name: 'qwen3.6-27b-mtp-pi-tune', size: '~16 GB' },
|
||||
{ name: 'gemma-4-26b-a4b-it-qat-q4_0', size: '~16 GB' },
|
||||
{ name: 'qwen3.6-27b', size: '~16 GB' },
|
||||
],
|
||||
'gpu-large': [
|
||||
{ name: 'qwen3.6-35b-a3b-nvfp4-mtp', size: '~18 GB' },
|
||||
{ name: 'qwen3.6-27b-nvfp4-mtp', size: '~14 GB' },
|
||||
{ name: 'qwen3.6-35b-a3b-apex', size: '~20 GB' },
|
||||
{ name: 'gemma-4-31b-it-qat-q4_0', size: '~18 GB' },
|
||||
],
|
||||
}
|
||||
const GB = 1024 * 1024 * 1024
|
||||
|
||||
function fallbackFor(tierId, isNvidia) {
|
||||
if (isNvidia && NVIDIA[tierId]) return NVIDIA[tierId]
|
||||
return BASE[tierId] || BASE.cpu
|
||||
// Pick a tier from detected hardware. total_memory is GPU VRAM in bytes (0 when
|
||||
// CPU-only). Thresholds are deliberately conservative so a suggestion that
|
||||
// "fits" really does.
|
||||
function pickTier(resources) {
|
||||
const isGpu = resources?.type === 'gpu'
|
||||
const vram = resources?.aggregate?.total_memory || 0
|
||||
if (!isGpu || vram <= 0) return { id: 'cpu', list: SMALL }
|
||||
if (vram < 8 * GB) return { id: 'gpu-small', list: MID }
|
||||
return { id: 'gpu-large', list: LARGE }
|
||||
}
|
||||
|
||||
export default function StarterModels({ addToast, onInstallStarted }) {
|
||||
const { t } = useTranslation('home')
|
||||
const { recommended, tier, isNvidia, loading } = useRecommendedModels({ count: 4 })
|
||||
const { resources } = useResources()
|
||||
const [available, setAvailable] = useState(null) // Set of gallery names, or null while loading
|
||||
const [installing, setInstalling] = useState(() => new Set())
|
||||
|
||||
// While the hardware probe + gallery query are in flight, render nothing
|
||||
// rather than flashing fallback content that may be replaced a moment later.
|
||||
if (loading) return null
|
||||
const tier = useMemo(() => pickTier(resources), [resources])
|
||||
const candidates = tier.list
|
||||
|
||||
// Prefer live recommendations; fall back to the static list only when the
|
||||
// gallery yielded nothing.
|
||||
const items = (recommended && recommended.length > 0)
|
||||
? recommended.map(r => ({ name: r.name, size: r.sizeDisplay }))
|
||||
: fallbackFor(tier.id, isNvidia)
|
||||
// Verify candidates exist in the live gallery. One search per name (the tier
|
||||
// has at most a handful) keeps this resilient to gallery customization.
|
||||
useEffect(() => {
|
||||
let cancelled = false
|
||||
const names = [...new Set(candidates.map(c => c.name))]
|
||||
Promise.all(names.map(name =>
|
||||
modelsApi.list({ search: name, page: 1 })
|
||||
.then(data => (data?.models || []).some(m => (m.name || m.id) === name) ? name : null)
|
||||
.catch(() => null)
|
||||
)).then(found => {
|
||||
if (cancelled) return
|
||||
const hits = found.filter(Boolean)
|
||||
// If verification yielded nothing (e.g. gallery unreachable), fall back to
|
||||
// showing the curated list rather than an empty widget.
|
||||
setAvailable(hits.length > 0 ? new Set(hits) : null)
|
||||
})
|
||||
return () => { cancelled = true }
|
||||
}, [candidates])
|
||||
|
||||
if (items.length === 0) return null
|
||||
const visible = available === null
|
||||
? candidates
|
||||
: candidates.filter(c => available.has(c.name))
|
||||
|
||||
if (visible.length === 0) return null
|
||||
|
||||
const install = async (name) => {
|
||||
setInstalling(prev => new Set(prev).add(name))
|
||||
@@ -103,13 +104,12 @@ export default function StarterModels({ addToast, onInstallStarted }) {
|
||||
{tier.id === 'cpu' ? t('starters.cpuNote') : t('starters.gpuNote')}
|
||||
</p>
|
||||
<ul className="home-starters-list">
|
||||
{items.map(c => {
|
||||
{visible.map(c => {
|
||||
const busy = installing.has(c.name)
|
||||
return (
|
||||
<li key={c.name} className="home-starters-item">
|
||||
<span className="home-starters-name">{c.name}</span>
|
||||
{isNvfp4Name(c.name) && <span className="badge badge-info home-starters-badge">NVFP4</span>}
|
||||
{c.size && <span className="home-starters-size">{c.size}</span>}
|
||||
<span className="home-starters-size">{c.size}</span>
|
||||
<button
|
||||
type="button"
|
||||
className="btn btn-primary btn-sm"
|
||||
|
||||
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
108
core/http/react-ui/src/hooks/useRecommendedModels.js
vendored
@@ -1,108 +0,0 @@
|
||||
import { useState, useEffect } from 'react'
|
||||
import { modelsApi } from '../utils/api'
|
||||
import { useResources } from './useResources'
|
||||
|
||||
// Data-driven "recommended for your hardware" model picks. The gallery exposes
|
||||
// no popularity/download signal and the list response carries no size, so we:
|
||||
// 1. ask the server for chat-capable models in their natural (curated) order,
|
||||
// 2. estimate size/VRAM for the top candidates (same endpoint the Models page
|
||||
// uses), and
|
||||
// 3. rank by hardware fit — smallest on CPU-only boxes, largest-that-fits on
|
||||
// GPUs (bigger == better quality while still fitting VRAM).
|
||||
//
|
||||
// Returns `recommended === null` while loading, `[]` when nothing could be
|
||||
// resolved (gallery/estimates unavailable) so callers can fall back.
|
||||
|
||||
const GB = 1024 * 1024 * 1024
|
||||
const DEFAULT_CTX = 4096
|
||||
|
||||
// NVFP4 is a Blackwell/NVIDIA-specific 4-bit format — only worth suggesting on
|
||||
// NVIDIA hardware, and to be filtered out elsewhere.
|
||||
export const isNvfp4Name = (name) => /nvfp4/i.test(name || '')
|
||||
|
||||
/**
 * Reports whether the detected hardware includes at least one NVIDIA GPU.
 *
 * @param {object|null|undefined} resources - Resource snapshot; only its
 *   `gpus` array is read.
 * @returns {boolean} true when some entry of `resources.gpus` has a `vendor`
 *   equal to "nvidia" (case-insensitive); false when `gpus` is missing or is
 *   not an array.
 */
export function hasNvidiaGpu(resources) {
  const gpus = resources?.gpus
  if (!Array.isArray(gpus)) return false
  // A malformed entry (missing or non-string vendor) counts as "not NVIDIA"
  // rather than throwing from toLowerCase() while the caller is rendering.
  return gpus.some((gpu) => {
    const vendor = gpu?.vendor
    return typeof vendor === 'string' && vendor.toLowerCase() === 'nvidia'
  })
}
|
||||
|
||||
/**
 * Buckets the machine into a hardware tier from its resource snapshot.
 *
 * @param {object|null|undefined} resources - Snapshot; reads `type` and
 *   `aggregate.total_memory`.
 * @returns {{id: string, vram: number}} `cpu` (with vram 0) when the type is
 *   not 'gpu' or no memory is reported; otherwise `gpu-small` below 8 GB,
 *   `gpu-mid` below 24 GB, and `gpu-large` from there up, together with the
 *   reported total memory.
 */
export function recommendTier(resources) {
  const totalVram = resources?.aggregate?.total_memory || 0
  const cpuOnly = resources?.type !== 'gpu' || totalVram <= 0
  if (cpuOnly) return { id: 'cpu', vram: 0 }

  let id = 'gpu-large'
  if (totalVram < 8 * GB) {
    id = 'gpu-small'
  } else if (totalVram < 24 * GB) {
    id = 'gpu-mid'
  }
  return { id, vram: totalVram }
}
|
||||
|
||||
/**
 * Orders estimated candidates by hardware fit and keeps the first `count`.
 *
 * Candidates without a size estimate are dropped, and NVFP4-named models are
 * dropped unless `isNvidia` is set. On the `cpu` tier the smallest models win.
 * On GPU tiers the models whose VRAM estimate is within 95% of `tier.vram` are
 * ranked largest first; when none fit, all remaining candidates are ranked
 * smallest first. On NVIDIA hardware NVFP4-named models sort ahead of the rest.
 *
 * @param {Array<object>} candidates - Entries with `name`, `sizeBytes`, `vramBytes`.
 * @param {{id: string, vram: number}} tier - Hardware tier.
 * @param {number} count - Maximum number of entries to return.
 * @param {boolean} isNvidia - Whether an NVIDIA GPU is present.
 * @returns {Array<object>} A new array; `candidates` is not mutated.
 */
function rank(candidates, tier, count, isNvidia) {
  const usable = candidates.filter((model) => {
    if (model.sizeBytes == null) return false
    return isNvidia || !isNvfp4Name(model.name)
  })
  const smallestFirst = (left, right) => left.sizeBytes - right.sizeBytes

  if (tier.id === 'cpu') {
    // No GPU: the smallest models stay responsive on CPU.
    return usable.slice().sort(smallestFirst).slice(0, count)
  }

  const budget = tier.vram * 0.95
  const fitting = usable.filter((model) => model.vramBytes != null && model.vramBytes <= budget)
  const anyFit = fitting.length > 0
  const largestFirst = (left, right) => right.sizeBytes - left.sizeBytes
  // Tiny GPU where nothing fits: fall through to the smallest models overall.
  const bySize = anyFit ? largestFirst : smallestFirst

  const compare = (left, right) => {
    if (isNvidia) {
      // NVFP4 builds are surfaced first on NVIDIA hardware.
      const leftNvfp4 = isNvfp4Name(left.name)
      const rightNvfp4 = isNvfp4Name(right.name)
      if (leftNvfp4 !== rightNvfp4) return leftNvfp4 ? -1 : 1
    }
    return bySize(left, right)
  }

  return (anyFit ? fitting : usable).slice().sort(compare).slice(0, count)
}
|
||||
|
||||
/**
 * React hook that resolves "recommended for your hardware" model picks.
 *
 * Once resources are known it lists chat-tagged gallery models, drops the ones
 * already installed, requests a size/VRAM estimate for each, and ranks the
 * result for the detected hardware tier.
 *
 * @param {object} [options]
 * @param {number} [options.count=4] - Maximum number of recommendations.
 * @param {number} [options.candidatePool=10] - How many gallery models to request.
 * @returns {{recommended: Array<object>|null, tier: {id: string, vram: number},
 *   isNvidia: boolean, error: string|null, loading: boolean}}
 *   `recommended` is null while loading and `[]` when the listing failed.
 */
export function useRecommendedModels({ count = 4, candidatePool = 10 } = {}) {
  const { resources } = useResources()
  const [recommended, setRecommended] = useState(null)
  const [error, setError] = useState(null)

  const resReady = resources !== null
  const tier = recommendTier(resources)
  const isNvidia = hasNvidiaGpu(resources)

  useEffect(() => {
    if (!resReady) return
    // Set by the cleanup so a superseded request never writes state.
    let cancelled = false
    setRecommended(null)
    setError(null)
    ;(async () => {
      try {
        const data = await modelsApi.list({ tag: 'chat', items: candidatePool, page: 1 })
        // Recommend models the user hasn't installed yet.
        const models = (data?.models || []).filter(m => !m.installed)
        const estimated = await Promise.all(models.map(async (m) => {
          const name = m.name || m.id
          try {
            const e = await modelsApi.estimate(name, [DEFAULT_CTX])
            const ctx = e?.estimates?.[String(DEFAULT_CTX)]
            return {
              name,
              description: m.description,
              sizeBytes: e?.sizeBytes ?? null,
              sizeDisplay: e?.sizeDisplay ?? null,
              vramBytes: ctx?.vramBytes ?? null,
              vramDisplay: ctx?.vramDisplay ?? null,
            }
          } catch {
            // Best effort: a failed estimate yields a size-less entry, which
            // rank() filters out, instead of failing the whole batch.
            return { name, sizeBytes: null }
          }
        }))
        if (cancelled) return
        setRecommended(rank(estimated, tier, count, isNvidia))
      } catch (e) {
        if (cancelled) return
        // The rejection value may not be an Error (string, null, ...); never
        // throw from here, otherwise `recommended` would stay null forever.
        setError(e?.message ?? String(e))
        setRecommended([])
      }
    })()
    return () => { cancelled = true }
    // tier.id / tier.vram / isNvidia are primitives, so resource polling doesn't re-run this.
  }, [resReady, tier.id, tier.vram, isNvidia, count, candidatePool])

  return { recommended, tier, isNvidia, error, loading: recommended === null }
}
|
||||
@@ -13,7 +13,6 @@ import ConfirmDialog from '../components/ConfirmDialog'
|
||||
import GalleryLoader from '../components/GalleryLoader'
|
||||
import Toggle from '../components/Toggle'
|
||||
import ResponsiveTable from '../components/ResponsiveTable'
|
||||
import RecommendedModels from '../components/RecommendedModels'
|
||||
import React from 'react'
|
||||
|
||||
|
||||
@@ -302,8 +301,6 @@ export default function Models() {
|
||||
}
|
||||
/>
|
||||
|
||||
<RecommendedModels addToast={addToast} />
|
||||
|
||||
{/* Search */}
|
||||
<div className="search-bar" style={{ marginBottom: 'var(--spacing-md)' }}>
|
||||
<i className="fas fa-search search-icon" />
|
||||
|
||||
Reference in New Issue
Block a user