Mirror of https://github.com/mudler/LocalAI.git, synced 2026-04-16 12:59:33 -04:00.
* feat(backend): add tinygrad multimodal backend
Wire tinygrad as a new Python backend covering LLM text generation with
native tool-call extraction, embeddings, Stable Diffusion 1.x image
generation, and Whisper speech-to-text from a single self-contained
container.
Backend (`backend/python/tinygrad/`):
- `backend.py` gRPC servicer with LLM Predict/PredictStream (auto-detects
Llama / Qwen2 / Mistral architecture from `config.json`, supports
safetensors and GGUF), Embedding via mean-pooled last hidden state,
GenerateImage via the vendored SD1.x pipeline, AudioTranscription +
AudioTranscriptionStream via the vendored Whisper inference loop, plus
Tokenize / ModelMetadata / Status / Free.
- Vendored upstream model code under `vendor/` (MIT, headers preserved):
llama.py with an added `qkv_bias` flag for Qwen2-family bias support
and an `embed()` method that returns the last hidden state, plus
clip.py, unet.py, stable_diffusion.py (trimmed to drop the MLPerf
training branch that pulls `mlperf.initializers`), audio_helpers.py
and whisper.py (trimmed to drop the pyaudio listener).
- Pluggable tool-call parsers under `tool_parsers/`: hermes (Qwen2.5 /
Hermes), llama3_json (Llama 3.1+), qwen3_xml (Qwen 3), mistral
(Mistral / Mixtral). Auto-selected from model architecture or `Options`.
- `install.sh` pins Python 3.11.14 (tinygrad >=0.12 needs >=3.11; the
default portable python is 3.10).
- `package.sh` bundles libLLVM.so.1 + libedit/libtinfo/libgomp/libsndfile
into the scratch image. `run.sh` sets `CPU_LLVM=1` and `LLVM_PATH` so
tinygrad's CPU device uses the in-process libLLVM JIT instead of
shelling out to the missing `clang` binary.
- Local unit tests for Health and the four parsers in `test.py`.
Build wiring:
- Root `Makefile`: `.NOTPARALLEL`, `prepare-test-extra`, `test-extra`,
`BACKEND_TINYGRAD = tinygrad|python|.|false|true`,
docker-build-target eval, and `docker-build-backends` aggregator.
- `.github/workflows/backend.yml`: cpu / cuda12 / cuda13 build matrix
entries (mirrors the transformers backend placement).
- `backend/index.yaml`: `&tinygrad` meta + cpu/cuda12/cuda13 image
entries (latest + development).
E2E test wiring:
- `tests/e2e-backends/backend_test.go` gains an `image` capability that
exercises GenerateImage and asserts a non-empty PNG is written to
`dst`. New `BACKEND_TEST_IMAGE_PROMPT` / `BACKEND_TEST_IMAGE_STEPS`
knobs.
- Five new make targets next to `test-extra-backend-vllm`:
- `test-extra-backend-tinygrad` — Qwen2.5-0.5B-Instruct + hermes,
mirrors the vllm target 1:1 (5/9 specs in ~57s).
- `test-extra-backend-tinygrad-embeddings` — same model, embeddings
via LLM hidden state (3/9 in ~10s).
- `test-extra-backend-tinygrad-sd` — stable-diffusion-v1-5 mirror,
health/load/image (3/9 in ~10min, 4 diffusion steps on CPU).
- `test-extra-backend-tinygrad-whisper` — openai/whisper-tiny.en
against jfk.wav from whisper.cpp samples (4/9 in ~49s).
- `test-extra-backend-tinygrad-all` aggregate.
All four targets land green on the first MVP pass: 15 specs total, 0
failures across LLM+tools, embeddings, image generation, and speech
transcription.
* refactor(tinygrad): collapse to a single backend image
tinygrad generates its own GPU kernels (PTX renderer for CUDA, the
autogen ctypes wrappers for HIP / Metal / WebGPU) and never links
against cuDNN, cuBLAS, or any toolkit-version-tied library. The only
runtime dependency that varies across hosts is the driver's libcuda.so.1
/ libamdhip64.so, which are injected into the container at run time by
the nvidia-container / rocm runtimes. So unlike torch- or vLLM-based
backends, there is no reason to ship per-CUDA-version images.
- Drop the cuda12-tinygrad and cuda13-tinygrad build-matrix entries
from .github/workflows/backend.yml. The sole remaining entry is
renamed to -tinygrad (from -cpu-tinygrad) since it is no longer
CPU-only.
- Collapse backend/index.yaml to a single meta + development pair.
The meta anchor carries the latest uri directly; the development
entry points at the master tag.
- run.sh picks the tinygrad device at launch time by probing
/usr/lib/... for libcuda.so.1 / libamdhip64.so. When libcuda is
visible we set CUDA=1 + CUDA_PTX=1 so tinygrad uses its own PTX
renderer (avoids any nvrtc/toolkit dependency); otherwise we fall
back to HIP or CLANG. CPU_LLVM=1 + LLVM_PATH keep the in-process
libLLVM JIT for the CLANG path.
- backend.py's _select_tinygrad_device() is trimmed to a CLANG-only
fallback since production device selection happens in run.sh.
Re-ran test-extra-backend-tinygrad after the change:
Ran 5 of 9 Specs in 56.541 seconds — 5 Passed, 0 Failed
84 lines · 2.5 KiB · Python
# Vendored verbatim from tinygrad examples/audio_helpers.py (MIT license).
|
|
# Upstream: https://github.com/tinygrad/tinygrad/blob/master/examples/audio_helpers.py
|
|
# Copyright (c) 2023- the tinygrad authors
|
|
# SPDX-License-Identifier: MIT
|
|
from typing import Optional
|
|
from tinygrad import Tensor
|
|
from tinygrad.dtype import DTypeLike, dtypes
|
|
import math
|
|
|
|
# rewritten from numpy
|
|
def rfftfreq(n: int, d: float = 1.0, device=None) -> Tensor:
  """Sample frequencies of a length-*n* real FFT (rewrite of numpy.fft.rfftfreq).

  Returns a 1-D tensor of ``n // 2 + 1`` frequencies spaced ``1 / (n * d)``
  apart, starting at 0.
  """
  n_bins = n // 2 + 1
  spacing = 1.0 / (n * d)
  bins = Tensor.arange(n_bins, device=device)
  return bins * spacing
|
|
|
|
# just like in librosa
|
|
def fft_frequencies(sr: float, n_fft: int) -> Tensor:
  """Center frequency of each FFT bin for sample rate *sr*, librosa-style."""
  bin_spacing = 1.0 / sr
  return rfftfreq(n=n_fft, d=bin_spacing)
|
|
|
|
def hz_to_mel(freq: Tensor) -> Tensor:
  """Convert frequencies in Hz to the Slaney mel scale (as in librosa).

  Linear below 1 kHz, logarithmic above; the branch is selected elementwise
  with a mask so the whole conversion stays vectorized.
  """
  f_min = 0.0
  f_sp = 200.0 / 3

  # linear part: uniform spacing of f_sp Hz per mel
  linear_mels = (freq - f_min) / f_sp

  # log-scale part above min_log_hz
  min_log_hz = 1000.0  # beginning of log region (Hz)
  min_log_mel = (min_log_hz - f_min) / f_sp
  logstep = math.log(6.4) / 27.0
  log_mels = min_log_mel + (freq / min_log_hz).log() / logstep

  in_log_region = freq >= min_log_hz
  return in_log_region.where(log_mels, linear_mels)
|
|
|
|
def mel_to_hz(mels: Tensor) -> Tensor:
  """Convert Slaney-scale mel values back to Hz (inverse of ``hz_to_mel``)."""
  f_min = 0.0
  f_sp = 200.0 / 3

  # linear region: f_sp Hz per mel
  linear_freqs = f_min + f_sp * mels

  # nonlinear (log) region above min_log_mel
  min_log_hz = 1000.0  # beginning of log region (Hz)
  min_log_mel = (min_log_hz - f_min) / f_sp  # same point expressed in mels
  logstep = math.log(6.4) / 27.0  # step size for log region
  log_freqs = min_log_hz * ((mels - min_log_mel) * logstep).exp()

  in_log_region = mels >= min_log_mel
  return in_log_region.where(log_freqs, linear_freqs)
|
|
|
|
def mel_frequencies(n_mels: int = 128, *, fmin: float = 0.0, fmax: float = 11025.0) -> Tensor:
  """Center frequencies (Hz) of *n_mels* mel bands uniformly spaced on the
  mel scale between *fmin* and *fmax*."""
  # endpoints of the band in mel units, then interpolate and map back to Hz
  mel_limits = hz_to_mel(Tensor([fmin, fmax]))
  mel_points = Tensor.linspace(mel_limits[0], mel_limits[1], n_mels)
  return mel_to_hz(mel_points)
|
|
|
|
def mel(
  *,
  sr: float,
  n_fft: int,
  n_mels: int = 128,
  fmin: float = 0.0,
  fmax: Optional[float] = None,
  dtype: DTypeLike = dtypes.default_float,
) -> Tensor:
  """Build a Slaney-normalized mel filter-bank matrix, librosa-style.

  Args:
    sr: audio sample rate in Hz.
    n_fft: FFT window length; the bank has ``1 + n_fft // 2`` frequency bins.
    n_mels: number of mel bands (rows of the output).
    fmin: lowest band edge in Hz.
    fmax: highest band edge in Hz; defaults to the Nyquist frequency ``sr / 2``.
    dtype: dtype of the returned weights.

  Returns:
    Tensor of shape ``(n_mels, 1 + n_fft // 2)`` with triangular filter weights.
  """
  if fmax is None:
    fmax = float(sr) / 2  # Nyquist

  n_mels = int(n_mels)

  fftfreqs = fft_frequencies(sr=sr, n_fft=n_fft)  # center freqs of each FFT bin
  mel_f = mel_frequencies(n_mels + 2, fmin=fmin, fmax=fmax)  # band edges (n_mels + 2 points)

  fdiff = mel_f[1:] - mel_f[:-1]
  # ramps[i, j] = mel_f[i] - fftfreqs[j]: signed distance of each bin to each edge
  ramps = mel_f[None].T.expand(-1, fftfreqs.shape[-1]) - fftfreqs

  # triangular filters: rising slope up to the band center, falling slope after,
  # clipped at zero outside the band
  lower = -ramps[:n_mels] / fdiff[:n_mels][None].T
  upper = ramps[2 : n_mels + 2] / fdiff[1 : n_mels + 1][None].T
  weights = lower.minimum(upper).maximum(0)

  # Slaney-style mel is scaled to be approx constant energy per channel
  enorm = 2.0 / (mel_f[2 : n_mels + 2] - mel_f[:n_mels])
  weights *= enorm[:, None]

  # FIX: the declared `dtype` parameter was previously ignored; honor it.
  # Default (dtypes.default_float) preserves the original output dtype.
  return weights.cast(dtype)
|