mirror of
https://github.com/mudler/LocalAI.git
synced 2026-05-20 22:58:34 -04:00
Compare commits
24 Commits
faster-qwe
...
fix/ci-tes
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2aaddbb3b8 | ||
|
|
0063e5d68f | ||
|
|
c7c4a20a9e | ||
|
|
94539f3992 | ||
|
|
525278658d | ||
|
|
919f801e25 | ||
|
|
362eb261c5 | ||
|
|
d407f4ead5 | ||
|
|
1fc8ad854f | ||
|
|
f49a8edd87 | ||
|
|
510b830d2b | ||
|
|
ddb36468ed | ||
|
|
983db7bedc | ||
|
|
b260378694 | ||
|
|
b10443ab5a | ||
|
|
b647b6caf1 | ||
|
|
c187b160e7 | ||
|
|
42e580bed0 | ||
|
|
5e13193d84 | ||
|
|
1c5dc83232 | ||
|
|
73b997686a | ||
|
|
00abf1be1f | ||
|
|
959458f0db | ||
|
|
dfc6efb88d |
2
.github/gallery-agent/agent.go
vendored
2
.github/gallery-agent/agent.go
vendored
@@ -141,7 +141,7 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
|
||||
result = result.AddMessage("user", "Describe the model in a clear and concise way that can be shared in a model gallery.")
|
||||
|
||||
// Get a response
|
||||
newFragment, err := llm.Ask(ctx, result)
|
||||
_, err = llm.Ask(ctx, result)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
8
.github/gallery-agent/tools.go
vendored
8
.github/gallery-agent/tools.go
vendored
@@ -13,16 +13,16 @@ type HFReadmeTool struct {
|
||||
client *hfapi.Client
|
||||
}
|
||||
|
||||
func (s *HFReadmeTool) Execute(args map[string]any) (string, error) {
|
||||
func (s *HFReadmeTool) Execute(args map[string]any) (string, any, error) {
|
||||
q, ok := args["repository"].(string)
|
||||
if !ok {
|
||||
return "", fmt.Errorf("no query")
|
||||
return "", nil, fmt.Errorf("no query")
|
||||
}
|
||||
readme, err := s.client.GetReadmeContent(q, "README.md")
|
||||
if err != nil {
|
||||
return "", err
|
||||
return "", nil, err
|
||||
}
|
||||
return readme, nil
|
||||
return readme, nil, nil
|
||||
}
|
||||
|
||||
func (s *HFReadmeTool) Tool() openai.Tool {
|
||||
|
||||
54
.github/workflows/backend.yml
vendored
54
.github/workflows/backend.yml
vendored
@@ -210,6 +210,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-12-faster-qwen3-tts'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "faster-qwen3-tts"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "8"
|
||||
@@ -575,6 +588,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/amd64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-gpu-nvidia-cuda-13-faster-qwen3-tts'
|
||||
runs-on: 'ubuntu-latest'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
backend: "faster-qwen3-tts"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2404'
|
||||
- build-type: 'cublas'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
@@ -705,6 +731,19 @@ jobs:
|
||||
backend: "qwen-tts"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
- build-type: 'l4t'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "ubuntu:24.04"
|
||||
skip-drivers: 'false'
|
||||
ubuntu-version: '2404'
|
||||
backend: "faster-qwen3-tts"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
- build-type: 'l4t'
|
||||
cuda-major-version: "13"
|
||||
cuda-minor-version: "0"
|
||||
@@ -1306,6 +1345,19 @@ jobs:
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'l4t'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
platforms: 'linux/arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-nvidia-l4t-faster-qwen3-tts'
|
||||
runs-on: 'ubuntu-24.04-arm'
|
||||
base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
|
||||
skip-drivers: 'true'
|
||||
backend: "faster-qwen3-tts"
|
||||
dockerfile: "./backend/Dockerfile.python"
|
||||
context: "./"
|
||||
ubuntu-version: '2204'
|
||||
- build-type: 'l4t'
|
||||
cuda-major-version: "12"
|
||||
cuda-minor-version: "0"
|
||||
@@ -1905,7 +1957,7 @@ jobs:
|
||||
- build-type: ''
|
||||
cuda-major-version: ""
|
||||
cuda-minor-version: ""
|
||||
platforms: 'linux/amd64'
|
||||
platforms: 'linux/amd64,linux/arm64'
|
||||
tag-latest: 'auto'
|
||||
tag-suffix: '-cpu-voxcpm'
|
||||
runs-on: 'ubuntu-latest'
|
||||
|
||||
27
.github/workflows/test.yml
vendored
27
.github/workflows/test.yml
vendored
@@ -93,30 +93,15 @@ jobs:
|
||||
- name: Dependencies
|
||||
run: |
|
||||
sudo apt-get update
|
||||
sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
|
||||
sudo apt-get install -y libgmock-dev clang
|
||||
# Install UV
|
||||
curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
sudo apt-get install -y ca-certificates cmake patch python3-pip unzip
|
||||
sudo apt-get install -y libopencv-dev
|
||||
|
||||
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
|
||||
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
|
||||
rm protoc.zip
|
||||
|
||||
curl -O https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
|
||||
sudo dpkg -i cuda-keyring_1.1-1_all.deb
|
||||
sudo apt-get update
|
||||
sudo apt-get install -y cuda-nvcc-${CUDA_VERSION} libcublas-dev-${CUDA_VERSION}
|
||||
export CUDACXX=/usr/local/cuda/bin/nvcc
|
||||
make -C backend/python/transformers
|
||||
|
||||
sudo apt-get install curl ffmpeg
|
||||
- name: Build backends
|
||||
run: |
|
||||
make backends/transformers
|
||||
mv backends/transformer external/transformers
|
||||
make backends/huggingface backends/llama-cpp backends/local-store backends/silero-vad backends/piper backends/whisper backends/stablediffusion-ggml
|
||||
env:
|
||||
CUDA_VERSION: 12-4
|
||||
- name: Test
|
||||
run: |
|
||||
PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||
TRANSFORMER_BACKEND=$(abspath ./)/external/transformers/run.sh PATH="$PATH:/root/go/bin" GO_TAGS="tts" make --jobs 5 --output-sync=target test
|
||||
- name: Setup tmate session if tests fail
|
||||
if: ${{ failure() }}
|
||||
uses: mxschmitt/action-tmate@v3.23
|
||||
|
||||
10
Makefile
10
Makefile
@@ -1,5 +1,5 @@
|
||||
# Disable parallel execution for backend builds
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral
|
||||
.NOTPARALLEL: backends/diffusers backends/llama-cpp backends/outetts backends/piper backends/stablediffusion-ggml backends/whisper backends/faster-whisper backends/silero-vad backends/local-store backends/huggingface backends/rfdetr backends/kitten-tts backends/kokoro backends/chatterbox backends/llama-cpp-darwin backends/neutts build-darwin-python-backend build-darwin-go-backend backends/mlx backends/diffuser-darwin backends/mlx-vlm backends/mlx-audio backends/stablediffusion-ggml-darwin backends/vllm backends/vllm-omni backends/moonshine backends/pocket-tts backends/qwen-tts backends/faster-qwen3-tts backends/qwen-asr backends/nemo backends/voxcpm backends/whisperx backends/ace-step backends/voxtral
|
||||
|
||||
GOCMD=go
|
||||
GOTEST=$(GOCMD) test
|
||||
@@ -149,7 +149,7 @@ test: test-models/testmodel.ggml protogen-go
|
||||
@echo 'Running tests'
|
||||
export GO_TAGS="debug"
|
||||
$(MAKE) prepare-test
|
||||
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
|
||||
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models BACKENDS_PATH=$(abspath ./)/backends \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
$(MAKE) test-llama-gguf
|
||||
$(MAKE) test-tts
|
||||
@@ -317,6 +317,7 @@ prepare-test-extra: protogen-python
|
||||
$(MAKE) -C backend/python/moonshine
|
||||
$(MAKE) -C backend/python/pocket-tts
|
||||
$(MAKE) -C backend/python/qwen-tts
|
||||
$(MAKE) -C backend/python/faster-qwen3-tts
|
||||
$(MAKE) -C backend/python/qwen-asr
|
||||
$(MAKE) -C backend/python/nemo
|
||||
$(MAKE) -C backend/python/voxcpm
|
||||
@@ -334,6 +335,7 @@ test-extra: prepare-test-extra
|
||||
$(MAKE) -C backend/python/moonshine test
|
||||
$(MAKE) -C backend/python/pocket-tts test
|
||||
$(MAKE) -C backend/python/qwen-tts test
|
||||
$(MAKE) -C backend/python/faster-qwen3-tts test
|
||||
$(MAKE) -C backend/python/qwen-asr test
|
||||
$(MAKE) -C backend/python/nemo test
|
||||
$(MAKE) -C backend/python/voxcpm test
|
||||
@@ -473,6 +475,7 @@ BACKEND_VIBEVOICE = vibevoice|python|.|--progress=plain|true
|
||||
BACKEND_MOONSHINE = moonshine|python|.|false|true
|
||||
BACKEND_POCKET_TTS = pocket-tts|python|.|false|true
|
||||
BACKEND_QWEN_TTS = qwen-tts|python|.|false|true
|
||||
BACKEND_FASTER_QWEN3_TTS = faster-qwen3-tts|python|.|false|true
|
||||
BACKEND_QWEN_ASR = qwen-asr|python|.|false|true
|
||||
BACKEND_NEMO = nemo|python|.|false|true
|
||||
BACKEND_VOXCPM = voxcpm|python|.|false|true
|
||||
@@ -525,6 +528,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_VIBEVOICE)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_MOONSHINE)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_POCKET_TTS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_TTS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_FASTER_QWEN3_TTS)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_QWEN_ASR)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_NEMO)))
|
||||
$(eval $(call generate-docker-build-target,$(BACKEND_VOXCPM)))
|
||||
@@ -535,7 +539,7 @@ $(eval $(call generate-docker-build-target,$(BACKEND_ACE_STEP)))
|
||||
docker-save-%: backend-images
|
||||
docker save local-ai-backend:$* -o backend-images/$*.tar
|
||||
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral
|
||||
docker-build-backends: docker-build-llama-cpp docker-build-rerankers docker-build-vllm docker-build-vllm-omni docker-build-transformers docker-build-outetts docker-build-diffusers docker-build-kokoro docker-build-faster-whisper docker-build-coqui docker-build-chatterbox docker-build-vibevoice docker-build-moonshine docker-build-pocket-tts docker-build-qwen-tts docker-build-faster-qwen3-tts docker-build-qwen-asr docker-build-nemo docker-build-voxcpm docker-build-whisperx docker-build-ace-step docker-build-voxtral
|
||||
|
||||
########################################################
|
||||
### Mock Backend for E2E Tests
|
||||
|
||||
@@ -334,7 +334,7 @@ Other:
|
||||
- Langchain: https://python.langchain.com/docs/integrations/providers/localai/
|
||||
- Terminal utility https://github.com/djcopley/ShellOracle
|
||||
- Local Smart assistant https://github.com/mudler/LocalAGI
|
||||
- Home Assistant https://github.com/sammcj/homeassistant-localai / https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-gpt4vision
|
||||
- Home Assistant https://github.com/drndos/hass-openai-custom-conversation / https://github.com/valentinfrlch/ha-llmvision / https://github.com/loryanstrant/HA-LocalAI-Monitor
|
||||
- Discord bot https://github.com/mudler/LocalAGI/tree/main/examples/discord
|
||||
- Slack bot https://github.com/mudler/LocalAGI/tree/main/examples/slack
|
||||
- Shell-Pilot (interact with LLMs using LocalAI models via pure shell scripts on your Linux or macOS system) https://github.com/reid41/shell-pilot
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
|
||||
LLAMA_VERSION?=723c71064da0908c19683f8c344715fbf6d986fd
|
||||
LLAMA_VERSION?=05728db18eea59de81ee3a7699739daaf015206b
|
||||
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
|
||||
|
||||
CMAKE_ARGS?=
|
||||
|
||||
@@ -362,7 +362,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
|
||||
params.mmproj.path = request->mmproj();
|
||||
}
|
||||
// params.model_alias ??
|
||||
params.model_alias = request->modelfile();
|
||||
params.model_alias.insert(request->modelfile());
|
||||
if (!request->cachetypekey().empty()) {
|
||||
params.cache_type_k = kv_cache_type_from_str(request->cachetypekey());
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
|
||||
|
||||
# whisper.cpp version
|
||||
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
|
||||
WHISPER_CPP_VERSION?=21411d81ea736ed5d9cdea4df360d3c4b60a4adb
|
||||
WHISPER_CPP_VERSION?=9453b4b9be9b73adfc35051083f37cefa039acee
|
||||
SO_TARGET?=libgowhisper.so
|
||||
|
||||
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
|
||||
|
||||
@@ -528,6 +528,28 @@
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-qwen-tts"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-qwen-tts"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
|
||||
- &faster-qwen3-tts
|
||||
urls:
|
||||
- https://github.com/andimarafioti/faster-qwen3-tts
|
||||
- https://pypi.org/project/faster-qwen3-tts/
|
||||
description: |
|
||||
Real-time Qwen3-TTS inference using CUDA graph capture. Voice clone only; requires NVIDIA GPU with CUDA.
|
||||
tags:
|
||||
- text-to-speech
|
||||
- TTS
|
||||
- voice-clone
|
||||
license: apache-2.0
|
||||
name: "faster-qwen3-tts"
|
||||
alias: "faster-qwen3-tts"
|
||||
capabilities:
|
||||
nvidia: "cuda12-faster-qwen3-tts"
|
||||
default: "cuda12-faster-qwen3-tts"
|
||||
nvidia-cuda-13: "cuda13-faster-qwen3-tts"
|
||||
nvidia-cuda-12: "cuda12-faster-qwen3-tts"
|
||||
nvidia-l4t: "nvidia-l4t-faster-qwen3-tts"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-faster-qwen3-tts"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts"
|
||||
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/620760a26e3b7210c2ff1943/-s1gyJfvbE1RgO5iBeNOi.png
|
||||
- &qwen-asr
|
||||
urls:
|
||||
- https://github.com/QwenLM/Qwen3-ASR
|
||||
@@ -2279,6 +2301,57 @@
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-metal-darwin-arm64-qwen-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-metal-darwin-arm64-qwen-tts
|
||||
## faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "faster-qwen3-tts-development"
|
||||
capabilities:
|
||||
nvidia: "cuda12-faster-qwen3-tts-development"
|
||||
default: "cuda12-faster-qwen3-tts-development"
|
||||
nvidia-cuda-13: "cuda13-faster-qwen3-tts-development"
|
||||
nvidia-cuda-12: "cuda12-faster-qwen3-tts-development"
|
||||
nvidia-l4t: "nvidia-l4t-faster-qwen3-tts-development"
|
||||
nvidia-l4t-cuda-12: "nvidia-l4t-faster-qwen3-tts-development"
|
||||
nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts-development"
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda12-faster-qwen3-tts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-12-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-12-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda12-faster-qwen3-tts-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-12-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-12-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda13-faster-qwen3-tts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-gpu-nvidia-cuda-13-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-gpu-nvidia-cuda-13-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda13-faster-qwen3-tts-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-gpu-nvidia-cuda-13-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "nvidia-l4t-faster-qwen3-tts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "nvidia-l4t-faster-qwen3-tts-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:latest-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:latest-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts
|
||||
- !!merge <<: *faster-qwen3-tts
|
||||
name: "cuda13-nvidia-l4t-arm64-faster-qwen3-tts-development"
|
||||
uri: "quay.io/go-skynet/local-ai-backends:master-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts"
|
||||
mirrors:
|
||||
- localai/localai-backends:master-nvidia-l4t-cuda-13-arm64-faster-qwen3-tts
|
||||
## qwen-asr
|
||||
- !!merge <<: *qwen-asr
|
||||
name: "qwen-asr-development"
|
||||
|
||||
23
backend/python/faster-qwen3-tts/Makefile
Normal file
23
backend/python/faster-qwen3-tts/Makefile
Normal file
@@ -0,0 +1,23 @@
|
||||
# Build, run, test and clean targets for the faster-qwen3-tts Python backend.
# None of the targets produce a file of the same name, so all are phony.
.PHONY: faster-qwen3-tts run test protogen-clean clean

# Install the backend by running install.sh.
faster-qwen3-tts:
	bash install.sh

# Install the backend, then start it through run.sh.
run: faster-qwen3-tts
	@echo "Running faster-qwen3-tts..."
	bash run.sh
	@echo "faster-qwen3-tts run."

# Install the backend, then run its tests through test.sh.
test: faster-qwen3-tts
	@echo "Testing faster-qwen3-tts..."
	bash test.sh
	@echo "faster-qwen3-tts tested."

# Remove the generated protobuf/gRPC Python stubs.
protogen-clean:
	$(RM) backend_pb2_grpc.py backend_pb2.py

# Remove the generated stubs, the virtualenv and the bytecode cache.
clean: protogen-clean
	rm -rf venv __pycache__
|
||||
193
backend/python/faster-qwen3-tts/backend.py
Normal file
193
backend/python/faster-qwen3-tts/backend.py
Normal file
@@ -0,0 +1,193 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
gRPC server of LocalAI for Faster Qwen3-TTS (CUDA graph capture, voice clone only).
|
||||
"""
|
||||
from concurrent import futures
|
||||
import time
|
||||
import argparse
|
||||
import signal
|
||||
import sys
|
||||
import os
|
||||
import traceback
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
import torch
|
||||
import soundfile as sf
|
||||
|
||||
import grpc
|
||||
|
||||
|
||||
def is_float(s):
    """Report whether ``float(s)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError. Any other exception raised by ``float`` propagates.
    """
    try:
        float(s)
    except ValueError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
def is_int(s):
    """Report whether ``int(s)`` succeeds.

    Returns True when the conversion works and False when it raises
    ValueError. Any other exception raised by ``int`` propagates.
    """
    try:
        int(s)
    except ValueError:
        return False
    else:
        return True
|
||||
|
||||
|
||||
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
|
||||
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
|
||||
|
||||
|
||||
class BackendServicer(backend_pb2_grpc.BackendServicer):
    """gRPC servicer that exposes Faster Qwen3-TTS voice cloning.

    ``LoadModel`` parses the ``key:value`` options, remembers the reference
    audio location and loads the model; ``TTS`` synthesises ``request.text``
    with the cloned voice and writes the audio to ``request.dst``.
    """

    # Options that are free text and must never be coerced to a number or a
    # boolean (a transcript such as "2024" or "true" has to stay a string).
    _TEXT_OPTIONS = frozenset({"ref_text"})

    @staticmethod
    def _coerce_option_value(value):
        """Convert an option string to int, float or bool when it looks like one.

        The int check runs first: every string accepted by ``int`` is also
        accepted by ``float``, so testing float first would turn every
        integer option into a float.
        """
        if is_int(value):
            return int(value)
        if is_float(value):
            return float(value)
        lowered = value.lower()
        if lowered in ("true", "false"):
            return lowered == "true"
        return value

    def Health(self, request, context):
        """Liveness probe: always replies with the bytes ``OK``."""
        return backend_pb2.Reply(message=bytes("OK", 'utf-8'))

    def LoadModel(self, request, context):
        """Parse options, record the reference-audio paths and load the model.

        Returns a ``Result`` with ``success=False`` when CUDA is unavailable
        or when loading the model raises; otherwise ``success=True``.
        """
        if not torch.cuda.is_available():
            return backend_pb2.Result(
                success=False,
                message="faster-qwen3-tts requires NVIDIA GPU with CUDA"
            )

        self.options = {}
        for opt in request.Options:
            # Entries without a "key:value" separator are ignored.
            if ":" not in opt:
                continue
            # Split once so values may themselves contain ":".
            key, value = opt.split(":", 1)
            if key not in self._TEXT_OPTIONS:
                value = self._coerce_option_value(value)
            self.options[key] = value

        model_path = request.Model or "Qwen/Qwen3-TTS-12Hz-0.6B-Base"
        # Empty or missing request fields are normalised to None.
        self.audio_path = getattr(request, 'AudioPath', None) or None
        self.model_file = getattr(request, 'ModelFile', None) or None
        self.model_path = getattr(request, 'ModelPath', None) or None

        # Imported here rather than at module import time.
        from faster_qwen3_tts import FasterQwen3TTS
        print(f"Loading model from: {model_path}", file=sys.stderr)
        try:
            self.model = FasterQwen3TTS.from_pretrained(model_path)
        except Exception as e:
            print(f"[ERROR] Loading model: {type(e).__name__}: {e}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            return backend_pb2.Result(success=False, message=str(e))

        print(f"Model loaded successfully: {model_path}", file=sys.stderr)
        return backend_pb2.Result(message="Model loaded successfully", success=True)

    def _get_ref_audio_path(self, request):
        """Resolve the reference audio path recorded by ``LoadModel``.

        Returns None when no audio path was configured. An absolute path is
        returned as is. A relative path is tried next to the model file, then
        under the model path; the first existing candidate wins, and the
        configured value is returned unchanged when neither exists.
        ``request`` is accepted but not used.
        """
        if not self.audio_path:
            return None
        if os.path.isabs(self.audio_path):
            return self.audio_path
        if self.model_file:
            model_file_base = os.path.dirname(self.model_file)
            ref_path = os.path.join(model_file_base, self.audio_path)
            if os.path.exists(ref_path):
                return ref_path
        if self.model_path:
            ref_path = os.path.join(self.model_path, self.audio_path)
            if os.path.exists(ref_path):
                return ref_path
        return self.audio_path

    def TTS(self, request, context):
        """Synthesise ``request.text`` with the cloned voice into ``request.dst``.

        Returns ``success=False`` with an explanatory message when the output
        path, the text, the reference audio or the reference transcript is
        missing, when no audio is produced, or when any exception is raised.
        """
        try:
            if not request.dst:
                return backend_pb2.Result(
                    success=False,
                    message="dst (output path) is required"
                )
            text = request.text.strip()
            if not text:
                return backend_pb2.Result(
                    success=False,
                    message="Text is empty"
                )

            # A missing or empty language falls back to English.
            language = getattr(request, 'language', None) or "English"

            ref_audio = self._get_ref_audio_path(request)
            if not ref_audio:
                return backend_pb2.Result(
                    success=False,
                    message="AudioPath is required for voice clone (set in LoadModel)"
                )
            # The LoadModel option wins; a request-level ref_text is only a fallback.
            ref_text = self.options.get("ref_text")
            if not ref_text and hasattr(request, 'ref_text') and request.ref_text:
                ref_text = request.ref_text
            if not ref_text:
                return backend_pb2.Result(
                    success=False,
                    message="ref_text is required for voice clone (set via LoadModel Options, e.g. ref_text:Your reference transcript)"
                )

            chunk_size = self.options.get("chunk_size")
            generation_kwargs = {}
            if chunk_size is not None:
                generation_kwargs["chunk_size"] = int(chunk_size)

            audio_list, sr = self.model.generate_voice_clone(
                text=text,
                language=language,
                ref_audio=ref_audio,
                ref_text=ref_text,
                **generation_kwargs
            )

            if audio_list is None or (isinstance(audio_list, list) and len(audio_list) == 0):
                return backend_pb2.Result(
                    success=False,
                    message="No audio output generated"
                )
            # When a list comes back, only its first element is written out.
            audio_data = audio_list[0] if isinstance(audio_list, list) else audio_list
            sf.write(request.dst, audio_data, sr)
            print(f"Saved output to {request.dst}", file=sys.stderr)

        except Exception as err:
            print(f"Error in TTS: {err}", file=sys.stderr)
            print(traceback.format_exc(), file=sys.stderr)
            return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

        return backend_pb2.Result(success=True)
|
||||
|
||||
|
||||
def serve(address):
    """Start the gRPC backend on ``address`` and block until interrupted.

    The server runs on a thread pool of ``MAX_WORKERS`` workers with the
    message-size limits set to 50 MiB. SIGINT and SIGTERM stop the server
    and exit the process.
    """
    message_cap = 50 * 1024 * 1024
    grpc_options = [
        ('grpc.max_message_length', message_cap),
        ('grpc.max_send_message_length', message_cap),
        ('grpc.max_receive_message_length', message_cap),
    ]
    worker_pool = futures.ThreadPoolExecutor(max_workers=MAX_WORKERS)
    server = grpc.server(worker_pool, options=grpc_options)
    backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
    server.add_insecure_port(address)
    server.start()
    print("Server started. Listening on: " + address, file=sys.stderr)

    def signal_handler(sig, frame):
        # Stop immediately (no grace period) and leave with exit status 0.
        print("Received termination signal. Shutting down...")
        server.stop(0)
        sys.exit(0)

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, signal_handler)

    # Keep the main thread alive while the server's worker threads run.
    try:
        while True:
            time.sleep(_ONE_DAY_IN_SECONDS)
    except KeyboardInterrupt:
        server.stop(0)
||||
|
||||
if __name__ == "__main__":
    # Command-line entry point: read --addr and serve on it until terminated.
    cli = argparse.ArgumentParser(description="Run the gRPC server.")
    cli.add_argument("--addr", default="localhost:50051", help="The address to bind the server to.")
    serve(cli.parse_args().addr)
|
||||
13
backend/python/faster-qwen3-tts/install.sh
Normal file
13
backend/python/faster-qwen3-tts/install.sh
Normal file
@@ -0,0 +1,13 @@
|
||||
#!/bin/bash
# Installs the faster-qwen3-tts backend's Python requirements.
set -e

# NOTE(review): presumably consumed by installRequirements in libbackend.sh — confirm.
EXTRA_PIP_INSTALL_FLAGS="--no-build-isolation"

# Quote every path expansion so a checkout path containing spaces or glob
# characters is not word-split.
backend_dir=$(dirname "$0")
# Use the helper library from ./common when that directory exists, otherwise
# from ../common.
if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

installRequirements
|
||||
@@ -0,0 +1,4 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu121
|
||||
torch
|
||||
torchaudio
|
||||
faster-qwen3-tts
|
||||
@@ -0,0 +1,4 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
torch
|
||||
torchaudio
|
||||
faster-qwen3-tts
|
||||
1
backend/python/faster-qwen3-tts/requirements-install.txt
Normal file
1
backend/python/faster-qwen3-tts/requirements-install.txt
Normal file
@@ -0,0 +1 @@
|
||||
setuptools
|
||||
4
backend/python/faster-qwen3-tts/requirements-l4t12.txt
Normal file
4
backend/python/faster-qwen3-tts/requirements-l4t12.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
--extra-index-url https://pypi.jetson-ai-lab.io/jp6/cu129/
|
||||
torch
|
||||
torchaudio
|
||||
faster-qwen3-tts
|
||||
4
backend/python/faster-qwen3-tts/requirements-l4t13.txt
Normal file
4
backend/python/faster-qwen3-tts/requirements-l4t13.txt
Normal file
@@ -0,0 +1,4 @@
|
||||
--extra-index-url https://download.pytorch.org/whl/cu130
|
||||
torch
|
||||
torchaudio
|
||||
faster-qwen3-tts
|
||||
8
backend/python/faster-qwen3-tts/requirements.txt
Normal file
8
backend/python/faster-qwen3-tts/requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
||||
grpcio==1.71.0
|
||||
protobuf
|
||||
certifi
|
||||
packaging==24.1
|
||||
soundfile
|
||||
setuptools
|
||||
six
|
||||
sox
|
||||
9
backend/python/faster-qwen3-tts/run.sh
Normal file
9
backend/python/faster-qwen3-tts/run.sh
Normal file
@@ -0,0 +1,9 @@
|
||||
#!/bin/bash
# Starts the faster-qwen3-tts backend, forwarding all command-line arguments.

# Quote every path expansion so a checkout path containing spaces or glob
# characters is not word-split.
backend_dir=$(dirname "$0")
# Use the helper library from ./common when that directory exists, otherwise
# from ../common.
if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

# "$@" keeps each argument intact; a bare $@ would re-split arguments that
# contain whitespace.
startBackend "$@"
|
||||
104
backend/python/faster-qwen3-tts/test.py
Normal file
104
backend/python/faster-qwen3-tts/test.py
Normal file
@@ -0,0 +1,104 @@
|
||||
"""
|
||||
Tests for the faster-qwen3-tts gRPC backend.
|
||||
"""
|
||||
import unittest
|
||||
import subprocess
|
||||
import time
|
||||
import os
|
||||
import sys
|
||||
import tempfile
|
||||
import backend_pb2
|
||||
import backend_pb2_grpc
|
||||
import grpc
|
||||
|
||||
|
||||
class TestBackendServicer(unittest.TestCase):
    """Integration tests that drive backend.py over gRPC on localhost:50052."""

    def setUp(self):
        """Launch backend.py as a subprocess and wait a fixed 15 s for it to start."""
        self.service = subprocess.Popen(
            ["python3", "backend.py", "--addr", "localhost:50052"],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            cwd=os.path.dirname(os.path.abspath(__file__)),
        )
        time.sleep(15)

    def tearDown(self):
        """Terminate the backend, escalating to kill if it does not exit within 5 s."""
        self.service.terminate()
        try:
            self.service.communicate(timeout=5)
        except subprocess.TimeoutExpired:
            self.service.kill()
            self.service.communicate()

    def test_health(self):
        """Health must answer with the bytes ``OK``."""
        with grpc.insecure_channel("localhost:50052") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            reply = stub.Health(backend_pb2.HealthMessage(), timeout=5.0)
            self.assertEqual(reply.message, b"OK")

    # Only meaningful without CUDA: on a CUDA host LoadModel goes on to load
    # the model, so the "must fail" assertion below does not apply there.
    @unittest.skipIf(
        __import__("torch").cuda.is_available(),
        "CUDA is available; LoadModel is not expected to be rejected",
    )
    def test_load_model_requires_cuda(self):
        """Without CUDA, LoadModel must report failure."""
        with grpc.insecure_channel("localhost:50052") as channel:
            stub = backend_pb2_grpc.BackendStub(channel)
            response = stub.LoadModel(
                backend_pb2.ModelOptions(
                    Model="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
                    CUDA=True,
                ),
                timeout=10.0,
            )
            self.assertFalse(response.success)

    @unittest.skipUnless(
        __import__("torch").cuda.is_available(),
        "faster-qwen3-tts TTS requires CUDA",
    )
    def test_tts(self):
        """Load the model with a silent reference clip, then synthesise a sentence."""
        import soundfile as sf
        try:
            with grpc.insecure_channel("localhost:50052") as channel:
                stub = backend_pb2_grpc.BackendStub(channel)
                # delete=False so the file outlives the handle; it is removed
                # explicitly in the finally block below.
                ref_audio = tempfile.NamedTemporaryFile(suffix='.wav', delete=False)
                ref_audio.close()
                try:
                    # One second of silence at 22050 Hz as the reference clip.
                    sr = 22050
                    duration = 1.0
                    samples = int(sr * duration)
                    sf.write(ref_audio.name, [0.0] * samples, sr)

                    response = stub.LoadModel(
                        backend_pb2.ModelOptions(
                            Model="Qwen/Qwen3-TTS-12Hz-0.6B-Base",
                            AudioPath=ref_audio.name,
                            Options=["ref_text:Hello world"],
                        ),
                        timeout=600.0,
                    )
                    self.assertTrue(response.success, response.message)

                    with tempfile.NamedTemporaryFile(suffix='.wav', delete=False) as out:
                        output_path = out.name
                    try:
                        tts_response = stub.TTS(
                            backend_pb2.TTSRequest(
                                text="Test output.",
                                dst=output_path,
                                language="English",
                            ),
                            timeout=120.0,
                        )
                        self.assertTrue(tts_response.success, tts_response.message)
                        self.assertTrue(os.path.exists(output_path))
                        self.assertGreater(os.path.getsize(output_path), 0)
                    finally:
                        if os.path.exists(output_path):
                            os.unlink(output_path)
                finally:
                    if os.path.exists(ref_audio.name):
                        os.unlink(ref_audio.name)
        except Exception as err:
            self.fail(f"TTS test failed: {err}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
unittest.main()
|
||||
11
backend/python/faster-qwen3-tts/test.sh
Normal file
11
backend/python/faster-qwen3-tts/test.sh
Normal file
@@ -0,0 +1,11 @@
|
||||
#!/bin/bash
# Runs the faster-qwen3-tts backend's unit tests.
set -e

# Quote every path expansion so a checkout path containing spaces or glob
# characters is not word-split.
backend_dir=$(dirname "$0")
# Use the helper library from ./common when that directory exists, otherwise
# from ../common.
if [ -d "$backend_dir/common" ]; then
    source "$backend_dir/common/libbackend.sh"
else
    source "$backend_dir/../common/libbackend.sh"
fi

runUnittests
|
||||
@@ -25,6 +25,39 @@ const (
|
||||
runFile = "run.sh"
|
||||
)
|
||||
|
||||
// Configuration for fallback backend image URIs. Every setting pairs the name
// of an environment variable with the value used when that variable is unset
// or empty (see getFallbackTagValues).
const (
	// Tag looked for in the original URI when building the first fallback.
	envLatestTag     = "LOCALAI_BACKEND_IMAGES_RELEASE_TAG"
	defaultLatestTag = "latest"

	// Tag substituted for the one above in the fallback URI.
	envMasterTag     = "LOCALAI_BACKEND_IMAGES_BRANCH_TAG"
	defaultMasterTag = "master"

	// Suffix used for the development fallback URI.
	envDevSuffix     = "LOCALAI_BACKEND_DEV_SUFFIX"
	defaultDevSuffix = "development"
)
|
||||
|
||||
// getFallbackTagValues returns the configurable fallback tag values from environment variables
|
||||
func getFallbackTagValues() (latestTag, masterTag, devSuffix string) {
|
||||
latestTag = os.Getenv(envLatestTag)
|
||||
masterTag = os.Getenv(envMasterTag)
|
||||
devSuffix = os.Getenv(envDevSuffix)
|
||||
|
||||
// Use defaults if environment variables are not set
|
||||
if latestTag == "" {
|
||||
latestTag = defaultLatestTag
|
||||
}
|
||||
if masterTag == "" {
|
||||
masterTag = defaultMasterTag
|
||||
}
|
||||
if devSuffix == "" {
|
||||
devSuffix = defaultDevSuffix
|
||||
}
|
||||
|
||||
return latestTag, masterTag, devSuffix
|
||||
}
|
||||
|
||||
// backendCandidate represents an installed concrete backend option for a given alias
|
||||
type backendCandidate struct {
|
||||
name string
|
||||
@@ -139,6 +172,9 @@ func InstallBackendFromGallery(ctx context.Context, galleries []config.Gallery,
|
||||
}
|
||||
|
||||
func InstallBackend(ctx context.Context, systemState *system.SystemState, modelLoader *model.ModelLoader, config *GalleryBackend, downloadStatus func(string, string, string, float64)) error {
|
||||
// Get configurable fallback tag values from environment variables
|
||||
latestTag, masterTag, devSuffix := getFallbackTagValues()
|
||||
|
||||
// Create base path if it doesn't exist
|
||||
err := os.MkdirAll(systemState.Backend.BackendsPath, 0750)
|
||||
if err != nil {
|
||||
@@ -166,6 +202,12 @@ func InstallBackend(ctx context.Context, systemState *system.SystemState, modelL
|
||||
} else {
|
||||
xlog.Debug("Downloading backend", "uri", config.URI, "backendPath", backendPath)
|
||||
if err := uri.DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err != nil {
|
||||
// Clean up the partially downloaded backend directory on failure
|
||||
xlog.Debug("Backend download failed, cleaning up", "backendPath", backendPath, "error", err)
|
||||
if cleanupErr := os.RemoveAll(backendPath); cleanupErr != nil {
|
||||
xlog.Warn("Failed to clean up backend directory", "backendPath", backendPath, "error", cleanupErr)
|
||||
}
|
||||
|
||||
success := false
|
||||
// Try to download from mirrors
|
||||
for _, mirror := range config.Mirrors {
|
||||
@@ -182,6 +224,36 @@ func InstallBackend(ctx context.Context, systemState *system.SystemState, modelL
|
||||
}
|
||||
}
|
||||
|
||||
// Try fallback: replace latestTag + "-" with masterTag + "-" in the URI
|
||||
fallbackURI := strings.Replace(string(config.URI), latestTag + "-", masterTag + "-", 1)
|
||||
if fallbackURI != string(config.URI) {
|
||||
xlog.Debug("Trying fallback URI", "original", config.URI, "fallback", fallbackURI)
|
||||
if err := downloader.URI(fallbackURI).DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err == nil {
|
||||
xlog.Debug("Downloaded backend using fallback URI", "uri", fallbackURI, "backendPath", backendPath)
|
||||
success = true
|
||||
} else {
|
||||
// Try another fallback: add "-" + devSuffix suffix to the backend name
|
||||
// For example: master-gpu-nvidia-cuda-13-ace-step -> master-gpu-nvidia-cuda-13-ace-step-development
|
||||
if !strings.Contains(fallbackURI, "-" + devSuffix) {
|
||||
// Extract backend name from URI and add -development
|
||||
parts := strings.Split(fallbackURI, "-")
|
||||
if len(parts) >= 2 {
|
||||
// Find where the backend name ends (usually the last part before the tag)
|
||||
// Pattern: quay.io/go-skynet/local-ai-backends:master-gpu-nvidia-cuda-13-ace-step
|
||||
lastDash := strings.LastIndex(fallbackURI, "-")
|
||||
if lastDash > 0 {
|
||||
devFallbackURI := fallbackURI[:lastDash] + "-" + devSuffix
|
||||
xlog.Debug("Trying development fallback URI", "fallback", devFallbackURI)
|
||||
if err := downloader.URI(devFallbackURI).DownloadFileWithContext(ctx, backendPath, "", 1, 1, downloadStatus); err == nil {
|
||||
xlog.Debug("Downloaded backend using development fallback URI", "uri", devFallbackURI, "backendPath", backendPath)
|
||||
success = true
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !success {
|
||||
xlog.Error("Failed to download backend", "uri", config.URI, "backendPath", backendPath, "error", err)
|
||||
return fmt.Errorf("failed to download backend %q: %v", config.URI, err)
|
||||
|
||||
@@ -916,7 +916,7 @@ parameters:
|
||||
|
||||
application, err := application.New(
|
||||
append(commonOpts,
|
||||
config.WithExternalBackend("transformers", os.Getenv("HUGGINGFACE_GRPC")),
|
||||
config.WithExternalBackend("transformers", os.Getenv("TRANSFORMER_BACKEND")),
|
||||
config.WithContext(c),
|
||||
config.WithSystemState(systemState),
|
||||
)...)
|
||||
|
||||
@@ -125,13 +125,21 @@ func handleAnthropicNonStream(c echo.Context, id string, input *schema.Anthropic
|
||||
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("model inference failed: %v", err))
|
||||
}
|
||||
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
xlog.Error("Anthropic prediction failed", "error", err)
|
||||
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
|
||||
const maxEmptyRetries = 5
|
||||
var prediction backend.LLMResponse
|
||||
var result string
|
||||
for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
|
||||
prediction, err = predFunc()
|
||||
if err != nil {
|
||||
xlog.Error("Anthropic prediction failed", "error", err)
|
||||
return sendAnthropicError(c, 500, "api_error", fmt.Sprintf("prediction failed: %v", err))
|
||||
}
|
||||
result = backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
if result != "" || !shouldUseFn {
|
||||
break
|
||||
}
|
||||
xlog.Warn("Anthropic: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
|
||||
}
|
||||
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
|
||||
// Check if the result contains tool calls
|
||||
toolCalls := functions.ParseFunctionCall(result, cfg.FunctionsConfig)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
package localai
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
@@ -8,6 +9,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/labstack/echo/v4"
|
||||
@@ -18,6 +20,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
|
||||
"gopkg.in/yaml.v3"
|
||||
)
|
||||
@@ -37,6 +40,31 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
|
||||
return fmt.Errorf("failed to discover model config: %w", err)
|
||||
}
|
||||
|
||||
resp := schema.GalleryResponse{
|
||||
StatusURL: fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), ""),
|
||||
}
|
||||
|
||||
if len(modelConfig.Files) > 0 {
|
||||
files := make([]vram.FileInput, 0, len(modelConfig.Files))
|
||||
for _, f := range modelConfig.Files {
|
||||
files = append(files, vram.FileInput{URI: f.URI, Size: 0})
|
||||
}
|
||||
estCtx, cancel := context.WithTimeout(c.Request().Context(), 5*time.Second)
|
||||
defer cancel()
|
||||
opts := vram.EstimateOptions{ContextLength: 8192}
|
||||
result, err := vram.Estimate(estCtx, files, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
|
||||
if err == nil {
|
||||
if result.SizeBytes > 0 {
|
||||
resp.EstimatedSizeBytes = result.SizeBytes
|
||||
resp.EstimatedSizeDisplay = result.SizeDisplay
|
||||
}
|
||||
if result.VRAMBytes > 0 {
|
||||
resp.EstimatedVRAMBytes = result.VRAMBytes
|
||||
resp.EstimatedVRAMDisplay = result.VRAMDisplay
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
uuid, err := uuid.NewUUID()
|
||||
if err != nil {
|
||||
return err
|
||||
@@ -63,10 +91,9 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl
|
||||
BackendGalleries: appConfig.BackendGalleries,
|
||||
}
|
||||
|
||||
return c.JSON(200, schema.GalleryResponse{
|
||||
ID: uuid.String(),
|
||||
StatusURL: fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), uuid.String()),
|
||||
})
|
||||
resp.ID = uuid.String()
|
||||
resp.StatusURL = fmt.Sprintf("%smodels/jobs/%s", httpUtils.BaseURL(c), uuid.String())
|
||||
return c.JSON(200, resp)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@@ -270,7 +270,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
}
|
||||
responses <- initialMessage
|
||||
|
||||
result, err := handleQuestion(config, cl, req, ml, startupOptions, functionResults, result, prompt)
|
||||
result, err := handleQuestion(config, functionResults, result, prompt)
|
||||
if err != nil {
|
||||
xlog.Error("error handling question", "error", err)
|
||||
return err
|
||||
@@ -388,6 +388,14 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
|
||||
strictMode := false
|
||||
|
||||
xlog.Debug("Tool call routing decision",
|
||||
"shouldUseFn", shouldUseFn,
|
||||
"len(input.Functions)", len(input.Functions),
|
||||
"len(input.Tools)", len(input.Tools),
|
||||
"config.ShouldUseFunctions()", config.ShouldUseFunctions(),
|
||||
"config.FunctionToCall()", config.FunctionToCall(),
|
||||
)
|
||||
|
||||
for _, f := range input.Functions {
|
||||
if f.Strict {
|
||||
strictMode = true
|
||||
@@ -648,12 +656,13 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
xlog.Debug("Thinking start token", "thinkingStartToken", thinkingStartToken, "template", template)
|
||||
|
||||
var emptyRetryNeeded bool
|
||||
|
||||
tokenCallback := func(s string, c *[]schema.Choice) {
|
||||
// Prepend thinking token if needed, then extract reasoning from the response
|
||||
reasoning, s := reason.ExtractReasoningWithConfig(s, thinkingStartToken, config.ReasoningConfig)
|
||||
|
||||
if !shouldUseFn {
|
||||
// no function is called, just reply and use stop as finish reason
|
||||
stopReason := FinishReasonStop
|
||||
message := &schema.Message{Role: "assistant", Content: &s}
|
||||
if reasoning != "" {
|
||||
@@ -671,9 +680,15 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
|
||||
switch {
|
||||
case noActionsToRun:
|
||||
result, err := handleQuestion(config, cl, input, ml, startupOptions, results, s, predInput)
|
||||
if s == "" && textContentToReturn == "" {
|
||||
xlog.Warn("Backend returned empty content in tool-calling context, will retry")
|
||||
emptyRetryNeeded = true
|
||||
return
|
||||
}
|
||||
result, err := handleQuestion(config, results, s, predInput)
|
||||
if err != nil {
|
||||
xlog.Error("error handling question", "error", err)
|
||||
emptyRetryNeeded = true
|
||||
return
|
||||
}
|
||||
|
||||
@@ -745,19 +760,42 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
// Echo properly supports context cancellation via c.Request().Context()
|
||||
// No workaround needed!
|
||||
|
||||
result, tokenUsage, err := ComputeChoices(
|
||||
input,
|
||||
predInput,
|
||||
config,
|
||||
cl,
|
||||
startupOptions,
|
||||
ml,
|
||||
tokenCallback,
|
||||
nil,
|
||||
)
|
||||
const maxEmptyRetries = 5
|
||||
var result []schema.Choice
|
||||
var tokenUsage backend.TokenUsage
|
||||
var err error
|
||||
|
||||
for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
|
||||
emptyRetryNeeded = false
|
||||
result, tokenUsage, err = ComputeChoices(
|
||||
input,
|
||||
predInput,
|
||||
config,
|
||||
cl,
|
||||
startupOptions,
|
||||
ml,
|
||||
tokenCallback,
|
||||
nil,
|
||||
)
|
||||
if err != nil || !emptyRetryNeeded {
|
||||
break
|
||||
}
|
||||
xlog.Warn("Retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
|
||||
}
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
if emptyRetryNeeded {
|
||||
xlog.Warn("All retries exhausted, backend still returning empty content")
|
||||
stopReason := FinishReasonStop
|
||||
empty := ""
|
||||
result = append(result, schema.Choice{
|
||||
FinishReason: &stopReason,
|
||||
Index: 0,
|
||||
Message: &schema.Message{Role: "assistant", Content: &empty},
|
||||
})
|
||||
}
|
||||
usage := schema.OpenAIUsage{
|
||||
PromptTokens: tokenUsage.Prompt,
|
||||
CompletionTokens: tokenUsage.Completion,
|
||||
@@ -785,7 +823,7 @@ func ChatEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator
|
||||
}
|
||||
}
|
||||
|
||||
func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, input *schema.OpenAIRequest, ml *model.ModelLoader, o *config.ApplicationConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
|
||||
func handleQuestion(config *config.ModelConfig, funcResults []functions.FuncCallResults, result, prompt string) (string, error) {
|
||||
|
||||
if len(funcResults) == 0 && result != "" {
|
||||
xlog.Debug("nothing function results but we had a message from the LLM")
|
||||
@@ -818,73 +856,6 @@ func handleQuestion(config *config.ModelConfig, cl *config.ModelConfigLoader, in
|
||||
}
|
||||
|
||||
xlog.Debug("No action received from LLM, without a message, computing a reply")
|
||||
// Otherwise ask the LLM to understand the JSON output and the context, and return a message
|
||||
// Note: This costs (in term of CPU/GPU) another computation
|
||||
config.Grammar = ""
|
||||
images := []string{}
|
||||
for _, m := range input.Messages {
|
||||
images = append(images, m.StringImages...)
|
||||
}
|
||||
videos := []string{}
|
||||
for _, m := range input.Messages {
|
||||
videos = append(videos, m.StringVideos...)
|
||||
}
|
||||
audios := []string{}
|
||||
for _, m := range input.Messages {
|
||||
audios = append(audios, m.StringAudios...)
|
||||
}
|
||||
|
||||
// Serialize tools and tool_choice to JSON strings
|
||||
toolsJSON := ""
|
||||
if len(input.Tools) > 0 {
|
||||
toolsBytes, err := json.Marshal(input.Tools)
|
||||
if err == nil {
|
||||
toolsJSON = string(toolsBytes)
|
||||
}
|
||||
}
|
||||
toolChoiceJSON := ""
|
||||
if input.ToolsChoice != nil {
|
||||
toolChoiceBytes, err := json.Marshal(input.ToolsChoice)
|
||||
if err == nil {
|
||||
toolChoiceJSON = string(toolChoiceBytes)
|
||||
}
|
||||
}
|
||||
|
||||
// Extract logprobs from request
|
||||
// According to OpenAI API: logprobs is boolean, top_logprobs (0-20) controls how many top tokens per position
|
||||
var logprobs *int
|
||||
var topLogprobs *int
|
||||
if input.Logprobs.IsEnabled() {
|
||||
// If logprobs is enabled, use top_logprobs if provided, otherwise default to 1
|
||||
if input.TopLogprobs != nil {
|
||||
topLogprobs = input.TopLogprobs
|
||||
// For backend compatibility, set logprobs to the top_logprobs value
|
||||
logprobs = input.TopLogprobs
|
||||
} else {
|
||||
// Default to 1 if logprobs is true but top_logprobs not specified
|
||||
val := 1
|
||||
logprobs = &val
|
||||
topLogprobs = &val
|
||||
}
|
||||
}
|
||||
|
||||
// Extract logit_bias from request
|
||||
// According to OpenAI API: logit_bias is a map of token IDs (as strings) to bias values (-100 to 100)
|
||||
var logitBias map[string]float64
|
||||
if len(input.LogitBias) > 0 {
|
||||
logitBias = input.LogitBias
|
||||
}
|
||||
|
||||
predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, cl, o, nil, toolsJSON, toolChoiceJSON, logprobs, topLogprobs, logitBias)
|
||||
if err != nil {
|
||||
xlog.Error("model inference failed", "error", err)
|
||||
return "", err
|
||||
}
|
||||
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
xlog.Error("prediction failed", "error", err)
|
||||
return "", err
|
||||
}
|
||||
return backend.Finetune(*config, prompt, prediction.Response), nil
|
||||
return "", fmt.Errorf("no action received from LLM, without a message, computing a reply")
|
||||
}
|
||||
|
||||
@@ -800,13 +800,26 @@ func handleBackgroundNonStream(ctx context.Context, store *ResponseStore, respon
|
||||
default:
|
||||
}
|
||||
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("prediction failed: %w", err)
|
||||
const maxEmptyRetries = 5
|
||||
var prediction backend.LLMResponse
|
||||
var result string
|
||||
for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
|
||||
prediction, err = predFunc()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("prediction failed: %w", err)
|
||||
}
|
||||
result = backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
if result != "" || !shouldUseFn {
|
||||
break
|
||||
}
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil, ctx.Err()
|
||||
default:
|
||||
}
|
||||
xlog.Warn("Open Responses background: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
|
||||
}
|
||||
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
|
||||
// Parse tool calls if using functions (same logic as regular handler)
|
||||
var outputItems []schema.ORItemField
|
||||
var toolCalls []schema.ToolCall
|
||||
@@ -1475,13 +1488,21 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
|
||||
return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("model inference failed: %v", err), "")
|
||||
}
|
||||
|
||||
prediction, err := predFunc()
|
||||
if err != nil {
|
||||
xlog.Error("Open Responses prediction failed", "error", err)
|
||||
return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
|
||||
const maxEmptyRetries = 5
|
||||
var prediction backend.LLMResponse
|
||||
var result string
|
||||
for attempt := 0; attempt <= maxEmptyRetries; attempt++ {
|
||||
prediction, err = predFunc()
|
||||
if err != nil {
|
||||
xlog.Error("Open Responses prediction failed", "error", err)
|
||||
return sendOpenResponsesError(c, 500, "model_error", fmt.Sprintf("prediction failed: %v", err), "")
|
||||
}
|
||||
result = backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
if result != "" || !shouldUseFn {
|
||||
break
|
||||
}
|
||||
xlog.Warn("Open Responses: retrying prediction due to empty backend response", "attempt", attempt+1, "maxRetries", maxEmptyRetries)
|
||||
}
|
||||
|
||||
result := backend.Finetune(*cfg, predInput, prediction.Response)
|
||||
xlog.Debug("Open Responses - Raw model result", "result", result, "shouldUseFn", shouldUseFn)
|
||||
|
||||
// Detect if thinking token is already in prompt or template
|
||||
|
||||
@@ -1,14 +1,19 @@
|
||||
package routes
|
||||
|
||||
import "os"
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"math"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"path"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/google/uuid"
|
||||
"github.com/labstack/echo/v4"
|
||||
@@ -20,6 +25,7 @@ import (
|
||||
"github.com/mudler/LocalAI/core/p2p"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/vram"
|
||||
"github.com/mudler/LocalAI/pkg/xsysinfo"
|
||||
"github.com/mudler/xlog"
|
||||
)
|
||||
@@ -32,6 +38,25 @@ const (
|
||||
ascSortOrder = "asc"
|
||||
)
|
||||
|
||||
// getDirectorySize adds up the sizes of the entries located directly inside
// path. Subdirectories are neither descended into nor counted, and an entry
// whose file info cannot be read is skipped instead of failing the call.
// An error is returned only when the directory itself cannot be listed.
func getDirectorySize(path string) (int64, error) {
	entries, err := os.ReadDir(path)
	if err != nil {
		return 0, err
	}

	var sum int64
	for i := range entries {
		fi, infoErr := entries[i].Info()
		if infoErr != nil || fi.IsDir() {
			// Best effort: ignore unreadable entries and nested directories.
			continue
		}
		sum += fi.Size()
	}
	return sum, nil
}
|
||||
|
||||
// RegisterUIAPIRoutes registers JSON API routes for the web UI
|
||||
func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig, galleryService *services.GalleryService, opcache *services.OpCache, applicationInstance *application.Application) {
|
||||
|
||||
@@ -242,6 +267,22 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
modelsJSON := make([]map[string]interface{}, 0, len(models))
|
||||
seenIDs := make(map[string]bool)
|
||||
|
||||
weightExts := map[string]bool{".gguf": true, ".safetensors": true, ".bin": true, ".pt": true}
|
||||
hasWeightFiles := func(files []gallery.File) bool {
|
||||
for _, f := range files {
|
||||
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
|
||||
if weightExts[ext] {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
const estimateTimeout = 3 * time.Second
|
||||
const estimateConcurrency = 3
|
||||
sem := make(chan struct{}, estimateConcurrency)
|
||||
var wg sync.WaitGroup
|
||||
|
||||
for _, m := range models {
|
||||
modelID := m.ID()
|
||||
|
||||
@@ -265,7 +306,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
|
||||
_, trustRemoteCodeExists := m.Overrides["trust_remote_code"]
|
||||
|
||||
modelsJSON = append(modelsJSON, map[string]interface{}{
|
||||
obj := map[string]interface{}{
|
||||
"id": modelID,
|
||||
"name": m.Name,
|
||||
"description": m.Description,
|
||||
@@ -280,9 +321,48 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
"isDeletion": isDeletionOp,
|
||||
"trustRemoteCode": trustRemoteCodeExists,
|
||||
"additionalFiles": m.AdditionalFiles,
|
||||
})
|
||||
}
|
||||
|
||||
if hasWeightFiles(m.AdditionalFiles) {
|
||||
files := make([]gallery.File, len(m.AdditionalFiles))
|
||||
copy(files, m.AdditionalFiles)
|
||||
wg.Add(1)
|
||||
go func(files []gallery.File, out map[string]interface{}) {
|
||||
defer wg.Done()
|
||||
sem <- struct{}{}
|
||||
defer func() { <-sem }()
|
||||
inputs := make([]vram.FileInput, 0, len(files))
|
||||
for _, f := range files {
|
||||
ext := strings.ToLower(path.Ext(path.Base(f.URI)))
|
||||
if weightExts[ext] {
|
||||
inputs = append(inputs, vram.FileInput{URI: f.URI, Size: 0})
|
||||
}
|
||||
}
|
||||
if len(inputs) == 0 {
|
||||
return
|
||||
}
|
||||
ctx, cancel := context.WithTimeout(context.Background(), estimateTimeout)
|
||||
defer cancel()
|
||||
opts := vram.EstimateOptions{ContextLength: 8192}
|
||||
result, err := vram.Estimate(ctx, inputs, opts, vram.DefaultCachedSizeResolver(), vram.DefaultCachedGGUFReader())
|
||||
if err == nil {
|
||||
if result.SizeBytes > 0 {
|
||||
out["estimated_size_bytes"] = result.SizeBytes
|
||||
out["estimated_size_display"] = result.SizeDisplay
|
||||
}
|
||||
if result.VRAMBytes > 0 {
|
||||
out["estimated_vram_bytes"] = result.VRAMBytes
|
||||
out["estimated_vram_display"] = result.VRAMDisplay
|
||||
}
|
||||
}
|
||||
}(files, obj)
|
||||
}
|
||||
|
||||
modelsJSON = append(modelsJSON, obj)
|
||||
}
|
||||
|
||||
wg.Wait()
|
||||
|
||||
prevPage := pageNum - 1
|
||||
nextPage := pageNum + 1
|
||||
if prevPage < 1 {
|
||||
@@ -297,6 +377,8 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
installedModelsCount := len(modelConfigs) + len(modelsWithoutConfig)
|
||||
|
||||
ramInfo, _ := xsysinfo.GetSystemRAMInfo()
|
||||
|
||||
return c.JSON(200, map[string]interface{}{
|
||||
"models": modelsJSON,
|
||||
"repositories": appConfig.Galleries,
|
||||
@@ -305,6 +387,9 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
"taskTypes": taskTypes,
|
||||
"availableModels": totalModels,
|
||||
"installedModels": installedModelsCount,
|
||||
"ramTotal": ramInfo.Total,
|
||||
"ramUsed": ramInfo.Used,
|
||||
"ramUsagePercent": ramInfo.UsagePercent,
|
||||
"currentPage": pageNum,
|
||||
"totalPages": totalPages,
|
||||
"prevPage": prevPage,
|
||||
@@ -936,12 +1021,15 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
|
||||
watchdogInterval = appConfig.WatchDogInterval.String()
|
||||
}
|
||||
|
||||
storageSize, _ := getDirectorySize(appConfig.SystemState.Model.ModelsPath)
|
||||
|
||||
response := map[string]interface{}{
|
||||
"type": resourceInfo.Type, // "gpu" or "ram"
|
||||
"available": resourceInfo.Available,
|
||||
"gpus": resourceInfo.GPUs,
|
||||
"ram": resourceInfo.RAM,
|
||||
"aggregate": resourceInfo.Aggregate,
|
||||
"storage_size": storageSize,
|
||||
"reclaimer_enabled": appConfig.MemoryReclaimerEnabled,
|
||||
"reclaimer_threshold": appConfig.MemoryReclaimerThreshold,
|
||||
"watchdog_interval": watchdogInterval,
|
||||
|
||||
@@ -141,6 +141,15 @@
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
<!-- Models storage (disk usage) -->
|
||||
<template x-if="resourceData.storage_size != null">
|
||||
<div class="mt-3 pt-3 border-t border-[var(--color-primary-border)]/20">
|
||||
<div class="flex justify-between text-xs">
|
||||
<span class="text-[var(--color-text-secondary)]">Models storage</span>
|
||||
<span class="font-mono text-[var(--color-text-primary)]" x-text="formatBytes(resourceData.storage_size)"></span>
|
||||
</div>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
</template>
|
||||
</div>
|
||||
|
||||
@@ -59,6 +59,26 @@
|
||||
<!-- Alert Messages -->
|
||||
<div id="alertContainer" class="mb-6"></div>
|
||||
|
||||
<!-- Persistent estimate (stays visible so user can see size/VRAM even if alert is replaced) -->
|
||||
<div x-show="!isAdvancedMode && !isEditMode && lastEstimate && ((lastEstimate.sizeDisplay && lastEstimate.sizeDisplay !== '0 B') || (lastEstimate.vramDisplay && lastEstimate.vramDisplay !== '0 B'))"
|
||||
x-transition
|
||||
class="mb-6 p-4 rounded-xl border border-[var(--color-primary)]/30 bg-[var(--color-primary-light)]/30">
|
||||
<h3 class="text-sm font-semibold text-[var(--color-text-primary)] mb-2 flex items-center gap-2">
|
||||
<i class="fas fa-memory text-[var(--color-primary)]"></i>
|
||||
Estimated requirements
|
||||
</h3>
|
||||
<div class="flex flex-wrap gap-4 text-sm text-[var(--color-text-secondary)]">
|
||||
<span x-show="lastEstimate && lastEstimate.sizeDisplay && lastEstimate.sizeDisplay !== '0 B'">
|
||||
<i class="fas fa-download mr-1.5 text-[var(--color-primary)]"></i>
|
||||
Download size: <span class="font-medium text-[var(--color-text-primary)]" x-text="lastEstimate?.sizeDisplay"></span>
|
||||
</span>
|
||||
<span x-show="lastEstimate && lastEstimate.vramDisplay && lastEstimate.vramDisplay !== '0 B'">
|
||||
<i class="fas fa-microchip mr-1.5 text-[var(--color-primary)]"></i>
|
||||
VRAM: <span class="font-medium text-[var(--color-text-primary)]" x-text="lastEstimate?.vramDisplay"></span>
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Simple Import Mode -->
|
||||
<div x-show="!isAdvancedMode && !isEditMode"
|
||||
x-transition:enter="transition ease-out duration-200"
|
||||
@@ -731,6 +751,7 @@ function importModel() {
|
||||
jobPollInterval: null,
|
||||
yamlEditor: null,
|
||||
modelEditor: null,
|
||||
lastEstimate: null,
|
||||
|
||||
init() {
|
||||
// If in edit mode, always show advanced mode
|
||||
@@ -854,15 +875,36 @@ function importModel() {
|
||||
}
|
||||
|
||||
const result = await response.json();
|
||||
|
||||
|
||||
const hasSize = result.estimated_size_display && result.estimated_size_display !== '0 B';
|
||||
const hasVram = result.estimated_vram_display && result.estimated_vram_display !== '0 B';
|
||||
if (hasSize || hasVram) {
|
||||
this.lastEstimate = {
|
||||
sizeDisplay: result.estimated_size_display || '',
|
||||
vramDisplay: result.estimated_vram_display || '',
|
||||
sizeBytes: result.estimated_size_bytes || 0,
|
||||
vramBytes: result.estimated_vram_bytes || 0
|
||||
};
|
||||
} else {
|
||||
this.lastEstimate = null;
|
||||
}
|
||||
|
||||
let successMsg = 'Import started! Tracking progress...';
|
||||
if (hasSize || hasVram) {
|
||||
const parts = [];
|
||||
if (hasSize) parts.push('Size: ' + result.estimated_size_display);
|
||||
if (hasVram) parts.push('VRAM: ' + result.estimated_vram_display);
|
||||
successMsg += ' (' + parts.join(' · ') + ')';
|
||||
}
|
||||
|
||||
if (result.uuid) {
|
||||
this.currentJobId = result.uuid;
|
||||
this.showAlert('success', 'Import started! Tracking progress...');
|
||||
this.showAlert('success', successMsg);
|
||||
this.startJobPolling();
|
||||
} else if (result.ID) {
|
||||
// Fallback for different response format
|
||||
this.currentJobId = result.ID;
|
||||
this.showAlert('success', 'Import started! Tracking progress...');
|
||||
this.showAlert('success', successMsg);
|
||||
this.startJobPolling();
|
||||
} else {
|
||||
throw new Error('No job ID returned from server');
|
||||
|
||||
@@ -177,7 +177,7 @@
|
||||
</div>
|
||||
|
||||
<!-- Results Section -->
|
||||
<div id="search-results" class="transition-all duration-300">
|
||||
<div id="search-results" class="transition-all duration-300 relative">
|
||||
<div x-show="loading && models.length === 0" class="text-center py-12">
|
||||
<svg class="animate-spin h-12 w-12 text-[var(--color-primary)] mx-auto mb-4" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
|
||||
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
|
||||
@@ -191,6 +191,21 @@
|
||||
<p class="text-[var(--color-text-secondary)]">No models found matching your criteria</p>
|
||||
</div>
|
||||
|
||||
<!-- Loading overlay when switching pages (we have models but loading) -->
|
||||
<div x-show="loading && models.length > 0"
|
||||
x-transition:enter="transition ease-out duration-150"
|
||||
x-transition:enter-start="opacity-0"
|
||||
x-transition:enter-end="opacity-100"
|
||||
class="absolute inset-0 z-10 flex items-center justify-center rounded-2xl bg-[var(--color-bg-secondary)]/80 backdrop-blur-sm">
|
||||
<div class="flex flex-col items-center gap-3">
|
||||
<svg class="animate-spin h-12 w-12 text-[var(--color-primary)]" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
|
||||
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
|
||||
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
|
||||
</svg>
|
||||
<p class="text-sm text-[var(--color-text-secondary)]">Loading page...</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Table View -->
|
||||
<div x-show="models.length > 0" class="bg-[var(--color-bg-secondary)] rounded-2xl border border-[var(--color-border-subtle)] overflow-hidden shadow-xl backdrop-blur-sm">
|
||||
<div class="overflow-x-auto">
|
||||
@@ -209,26 +224,7 @@
|
||||
</div>
|
||||
</th>
|
||||
<th class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider">Description</th>
|
||||
<th @click="setSort('repository')"
|
||||
:class="sortBy === 'repository' ? 'bg-[var(--color-primary-light)]' : ''"
|
||||
class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
|
||||
<div class="flex items-center gap-2">
|
||||
<span>Repository</span>
|
||||
<i :class="sortBy === 'repository' ? (sortOrder === 'asc' ? 'fas fa-sort-up' : 'fas fa-sort-down') : 'fas fa-sort'"
|
||||
:class="sortBy === 'repository' ? 'text-[var(--color-primary)]' : 'text-[var(--color-text-secondary)]'"
|
||||
class="text-xs"></i>
|
||||
</div>
|
||||
</th>
|
||||
<th @click="setSort('license')"
|
||||
:class="sortBy === 'license' ? 'bg-[var(--color-primary-light)]' : ''"
|
||||
class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
|
||||
<div class="flex items-center gap-2">
|
||||
<span>License</span>
|
||||
<i :class="sortBy === 'license' ? (sortOrder === 'asc' ? 'fas fa-sort-up' : 'fas fa-sort-down') : 'fas fa-sort'"
|
||||
:class="sortBy === 'license' ? 'text-[var(--color-primary)]' : 'text-[var(--color-text-secondary)]'"
|
||||
class="text-xs"></i>
|
||||
</div>
|
||||
</th>
|
||||
<th class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider">Size / VRAM</th>
|
||||
<th @click="setSort('status')"
|
||||
:class="sortBy === 'status' ? 'bg-[var(--color-primary-light)]' : ''"
|
||||
class="px-6 py-4 text-left text-xs font-semibold text-[var(--color-text-primary)] uppercase tracking-wider cursor-pointer hover:bg-[var(--color-bg-primary)] transition-colors">
|
||||
@@ -275,21 +271,26 @@
|
||||
<div class="text-sm text-[var(--color-text-secondary)] max-w-xs truncate" x-text="model.description" :title="model.description"></div>
|
||||
</td>
|
||||
|
||||
<!-- Repository -->
|
||||
<!-- Size / VRAM -->
|
||||
<td class="px-6 py-4">
|
||||
<span class="inline-flex items-center text-xs px-2 py-1 rounded bg-[var(--color-primary-light)] text-[var(--color-text-primary)] border border-[var(--color-primary-border)]">
|
||||
<i class="fa-brands fa-git-alt mr-1"></i>
|
||||
<span x-text="model.gallery"></span>
|
||||
</span>
|
||||
</td>
|
||||
|
||||
<!-- License -->
|
||||
<td class="px-6 py-4">
|
||||
<span x-show="model.license" class="inline-flex items-center text-xs px-2 py-1 rounded bg-[var(--color-accent-light)] text-[var(--color-text-primary)] border border-[var(--color-accent)]/30">
|
||||
<i class="fas fa-book mr-1"></i>
|
||||
<span x-text="model.license"></span>
|
||||
</span>
|
||||
<span x-show="!model.license" class="text-xs text-[var(--color-text-secondary)]">-</span>
|
||||
<div class="flex flex-col gap-0.5">
|
||||
<template x-if="(model.estimated_size_display && model.estimated_size_display !== '0 B') || (model.estimated_vram_display && model.estimated_vram_display !== '0 B')">
|
||||
<div class="text-xs text-[var(--color-text-secondary)]">
|
||||
<span x-show="model.estimated_size_display && model.estimated_size_display !== '0 B'" x-text="'Size: ' + model.estimated_size_display"></span>
|
||||
<span x-show="(model.estimated_size_display && model.estimated_size_display !== '0 B') && (model.estimated_vram_display && model.estimated_vram_display !== '0 B')"> · </span>
|
||||
<span x-show="model.estimated_vram_display && model.estimated_vram_display !== '0 B'" x-text="'VRAM: ' + model.estimated_vram_display"></span>
|
||||
</div>
|
||||
</template>
|
||||
<template x-if="model.estimated_vram_bytes && totalMemory > 0">
|
||||
<span :title="(model.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits your GPU' : 'May not fit your GPU')"
|
||||
class="inline-flex items-center text-xs">
|
||||
<i class="fas fa-microchip mr-1"
|
||||
:class="model.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></i>
|
||||
<span x-text="model.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits' : 'May not fit'"></span>
|
||||
</span>
|
||||
</template>
|
||||
<span x-show="(!model.estimated_size_display || model.estimated_size_display === '0 B') && (!model.estimated_vram_display || model.estimated_vram_display === '0 B')" class="text-xs text-[var(--color-text-muted)]">-</span>
|
||||
</div>
|
||||
</td>
|
||||
|
||||
<!-- Status -->
|
||||
@@ -405,6 +406,36 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="text-base leading-relaxed text-[var(--color-text-secondary)] break-words max-w-full markdown-content" x-html="renderMarkdown(selectedModel?.description)"></div>
|
||||
<template x-if="(selectedModel?.estimated_size_display && selectedModel.estimated_size_display !== '0 B') || (selectedModel?.estimated_vram_display && selectedModel.estimated_vram_display !== '0 B')">
|
||||
<div class="space-y-1">
|
||||
<p x-show="selectedModel?.estimated_size_display && selectedModel.estimated_size_display !== '0 B'" class="text-sm text-[var(--color-text-secondary)]">
|
||||
<i class="fas fa-download mr-2 text-[var(--color-primary)]"></i>
|
||||
Estimated download size: <span x-text="selectedModel?.estimated_size_display" class="font-medium text-[var(--color-text-primary)]"></span>
|
||||
</p>
|
||||
<p x-show="selectedModel?.estimated_vram_display && selectedModel.estimated_vram_display !== '0 B'" class="text-sm text-[var(--color-text-secondary)]">
|
||||
<i class="fas fa-memory mr-2 text-[var(--color-primary)]"></i>
|
||||
Estimated VRAM: <span x-text="selectedModel?.estimated_vram_display" class="font-medium text-[var(--color-text-primary)]"></span>
|
||||
</p>
|
||||
<p x-show="selectedModel?.estimated_vram_bytes && totalMemory > 0" class="text-sm">
|
||||
<i class="fas fa-microchip mr-2"
|
||||
:class="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></i>
|
||||
<span x-text="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'Fits your GPU' : 'May not fit your GPU'"
|
||||
:class="selectedModel?.estimated_vram_bytes <= totalMemory * 0.95 ? 'text-[var(--color-success)]' : 'text-[var(--color-error)]'"></span>
|
||||
</p>
|
||||
</div>
|
||||
</template>
|
||||
<template x-if="selectedModel?.gallery || selectedModel?.license">
|
||||
<div class="space-y-1">
|
||||
<p x-show="selectedModel?.gallery" class="text-sm text-[var(--color-text-secondary)]">
|
||||
<i class="fa-brands fa-git-alt mr-2 text-[var(--color-primary)]"></i>
|
||||
Repository: <span x-text="selectedModel?.gallery" class="font-medium text-[var(--color-text-primary)]"></span>
|
||||
</p>
|
||||
<p x-show="selectedModel?.license" class="text-sm text-[var(--color-text-secondary)]">
|
||||
<i class="fas fa-book mr-2 text-[var(--color-primary)]"></i>
|
||||
License: <span x-text="selectedModel?.license" class="font-medium text-[var(--color-text-primary)]"></span>
|
||||
</p>
|
||||
</div>
|
||||
</template>
|
||||
<hr>
|
||||
<template x-if="selectedModel?.urls && selectedModel.urls.length > 0">
|
||||
<div>
|
||||
@@ -605,6 +636,10 @@ function modelsGallery() {
|
||||
totalPages: 1,
|
||||
availableModels: 0,
|
||||
installedModels: 0,
|
||||
ramTotal: 0,
|
||||
ramUsed: 0,
|
||||
ramUsagePercent: 0,
|
||||
totalMemory: 0,
|
||||
selectedModel: null,
|
||||
jobProgress: {},
|
||||
notifications: [],
|
||||
@@ -613,10 +648,21 @@ function modelsGallery() {
|
||||
|
||||
// Component start-up hook: kicks off the model list fetch and the system
// resource fetch, then starts periodic job-progress polling.
// NOTE(review): presumably invoked automatically by Alpine.js when the
// component is initialised — confirm against the template's x-data/x-init.
init() {
|
||||
this.fetchModels();
|
||||
this.fetchResources();
|
||||
// Poll for job progress every 600ms
|
||||
// The interval handle is not kept, so this polling is never cancelled here.
setInterval(() => this.pollJobs(), 600);
|
||||
},
|
||||
|
||||
async fetchResources() {
|
||||
try {
|
||||
const response = await fetch('/api/resources');
|
||||
if (response.ok) {
|
||||
const data = await response.json();
|
||||
this.totalMemory = data.aggregate?.total_memory || 0;
|
||||
}
|
||||
} catch (e) {}
|
||||
},
|
||||
|
||||
addNotification(message, type = 'error') {
|
||||
const id = Date.now();
|
||||
this.notifications.push({ id, message, type });
|
||||
@@ -650,6 +696,9 @@ function modelsGallery() {
|
||||
this.totalPages = data.totalPages || 1;
|
||||
this.availableModels = data.availableModels || 0;
|
||||
this.installedModels = data.installedModels || 0;
|
||||
this.ramTotal = data.ramTotal || 0;
|
||||
this.ramUsed = data.ramUsed || 0;
|
||||
this.ramUsagePercent = data.ramUsagePercent || 0;
|
||||
} catch (error) {
|
||||
console.error('Error fetching models:', error);
|
||||
} finally {
|
||||
@@ -826,6 +875,14 @@ function modelsGallery() {
|
||||
this.selectedModel = model;
|
||||
},
|
||||
|
||||
formatBytes(bytes) {
|
||||
if (bytes === 0) return "0 B";
|
||||
const k = 1024;
|
||||
const sizes = ["B", "KB", "MB", "GB", "TB"];
|
||||
const i = Math.floor(Math.log(bytes) / Math.log(k));
|
||||
return parseFloat((bytes / Math.pow(k, i)).toFixed(2)) + " " + sizes[i];
|
||||
},
|
||||
|
||||
// Clears the currently selected model.
// NOTE(review): the detail modal markup reads `selectedModel?...`, so resetting
// it to null presumably hides the modal — confirm against the template.
closeModal() {
|
||||
this.selectedModel = null;
|
||||
}
|
||||
|
||||
@@ -20,7 +20,7 @@
|
||||
<div class="flex items-center justify-between gap-2">
|
||||
<label class="text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide flex-shrink-0">Model</label>
|
||||
</div>
|
||||
<select x-data="{ link : '' }" x-model="link" x-init="$watch('link', value => window.location = link)"
|
||||
<select x-data="{ link : '{{if .Model}}video/{{.Model}}{{end}}' }" x-model="link" x-init="$watch('link', value => window.location = link)"
|
||||
id="model-select"
|
||||
class="input w-full p-1.5 text-xs"
|
||||
>
|
||||
|
||||
@@ -24,6 +24,11 @@ type BackendMonitorResponse struct {
|
||||
// GalleryResponse is the JSON payload that identifies a gallery job and,
// optionally, carries download-size and VRAM estimates for the model.
// All Estimated* fields use omitempty, so they are left out of the JSON when
// they hold their zero value.
type GalleryResponse struct {
|
||||
// ID is serialized under the "uuid" key.
ID string `json:"uuid"`
|
||||
// StatusURL is serialized under the "status" key.
StatusURL string `json:"status"`
|
||||
|
||||
// EstimatedVRAMBytes is the estimated VRAM requirement in bytes.
EstimatedVRAMBytes uint64 `json:"estimated_vram_bytes,omitempty"`
|
||||
// EstimatedVRAMDisplay is a display string for the VRAM estimate.
// NOTE(review): presumably a human-readable rendering of EstimatedVRAMBytes — confirm.
EstimatedVRAMDisplay string `json:"estimated_vram_display,omitempty"`
|
||||
// EstimatedSizeBytes is the estimated download size in bytes.
EstimatedSizeBytes uint64 `json:"estimated_size_bytes,omitempty"`
|
||||
// EstimatedSizeDisplay is a display string for the size estimate.
// NOTE(review): presumably a human-readable rendering of EstimatedSizeBytes — confirm.
EstimatedSizeDisplay string `json:"estimated_size_display,omitempty"`
|
||||
}
|
||||
|
||||
type VideoRequest struct {
|
||||
|
||||
@@ -27,6 +27,24 @@ services:
|
||||
# or an URL pointing to a YAML configuration file, for example:
|
||||
# - https://gist.githubusercontent.com/mudler/ad601a0488b497b69ec549150d9edd18/raw/a8a8869ef1bb7e3830bf5c0bae29a0cce991ff8d/phi-2.yaml
|
||||
- phi-2
|
||||
# For NVIDIA GPU support with CDI (recommended for NVIDIA Container Toolkit 1.14+):
|
||||
# Uncomment the following deploy section and use driver: nvidia.com/gpu
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia.com/gpu
|
||||
# count: all
|
||||
# capabilities: [gpu]
|
||||
#
|
||||
# For legacy NVIDIA driver (for older NVIDIA Container Toolkit):
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: 1
|
||||
# capabilities: [gpu]
|
||||
|
||||
volumes:
|
||||
models:
|
||||
|
||||
148
docs/content/advanced/reverse-proxy-tls.md
Normal file
148
docs/content/advanced/reverse-proxy-tls.md
Normal file
@@ -0,0 +1,148 @@
|
||||
---
|
||||
title: TLS Reverse Proxy Configuration
|
||||
description: Configure LocalAI behind a TLS termination reverse proxy (HAProxy, Apache, Nginx)
|
||||
weight: 100
|
||||
---
|
||||
|
||||
# TLS Reverse Proxy Configuration
|
||||
|
||||
When running LocalAI behind a TLS termination reverse proxy, the Web UI may fail to load static assets (CSS, JS) correctly because the application doesn't automatically detect that it's being served over HTTPS. This guide explains how to properly configure your reverse proxy to work with LocalAI.
|
||||
|
||||
## How It Works
|
||||
|
||||
LocalAI uses the `X-Forwarded-Proto` HTTP header to determine the protocol used by clients. When this header is set to `https`, LocalAI will generate HTTPS URLs for static assets in the Web UI.
|
||||
|
||||
## Required Headers
|
||||
|
||||
Your reverse proxy must forward these headers to LocalAI:
|
||||
|
||||
| Header | Purpose |
|
||||
|--------|---------|
|
||||
| `X-Forwarded-Proto` | Set to `https` when TLS is terminated at the proxy |
|
||||
| `X-Forwarded-Host` | The original host requested by the client |
|
||||
| `X-Forwarded-Prefix` | Any path prefix if LocalAI is served under a sub-path |
|
||||
|
||||
## HAProxy Configuration
|
||||
|
||||
```haproxy
|
||||
frontend https-in
|
||||
bind *:443 ssl crt /path/to/cert.pem
|
||||
mode http
|
||||
|
||||
# Set the X-Forwarded-Proto header
|
||||
http-request set-header X-Forwarded-Proto https
|
||||
|
||||
# Pass the original host
|
||||
http-request set-header X-Forwarded-Host %[hdr(host)]
|
||||
|
||||
# If serving under a sub-path, set the prefix
|
||||
# http-request set-header X-Forwarded-Prefix /localai
|
||||
|
||||
default_backend localai
|
||||
|
||||
backend localai
|
||||
mode http
|
||||
server localai1 127.0.0.1:8080 check
|
||||
```
|
||||
|
||||
## Apache Configuration
|
||||
|
||||
```apache
|
||||
<VirtualHost *:443>
|
||||
ServerName your-domain.com
|
||||
SSLEngine on
|
||||
SSLCertificateFile /path/to/cert.pem
|
||||
SSLCertificateKeyFile /path/to/key.pem
|
||||
|
||||
# Requires mod_ssl, mod_proxy, mod_proxy_http and mod_headers to be enabled
|
||||
ProxyRequests Off
|
||||
ProxyPreserveHost On
|
||||
|
||||
<Proxy *>
|
||||
Require all granted
|
||||
</Proxy>
|
||||
|
||||
# Set the X-Forwarded-Proto header
|
||||
RequestHeader set X-Forwarded-Proto "https"
|
||||
|
||||
# Set the X-Forwarded-Host header (optional, usually automatic)
|
||||
RequestHeader set X-Forwarded-Host "%{HTTP_HOST}s"
|
||||
|
||||
# If serving under a sub-path
|
||||
# RequestHeader set X-Forwarded-Prefix "/localai"
|
||||
|
||||
ProxyPass / http://127.0.0.1:8080/
|
||||
ProxyPassReverse / http://127.0.0.1:8080/
|
||||
</VirtualHost>
|
||||
```
|
||||
|
||||
## Nginx Configuration
|
||||
|
||||
```nginx
|
||||
server {
|
||||
listen 443 ssl;
|
||||
server_name your-domain.com;
|
||||
|
||||
ssl_certificate /path/to/cert.pem;
|
||||
ssl_certificate_key /path/to/key.pem;
|
||||
|
||||
location / {
    # Set the X-Forwarded-Proto header
    proxy_set_header X-Forwarded-Proto $scheme;

    # Pass the original host
    proxy_set_header X-Forwarded-Host $host;

    # If serving under a sub-path
    # proxy_set_header X-Forwarded-Prefix /localai;

    # Other proxy settings (proxy_pass is only valid inside a location block)
    proxy_pass http://127.0.0.1:8080;
    proxy_http_version 1.1;
    proxy_set_header Upgrade $http_upgrade;
    proxy_set_header Connection "upgrade";
    proxy_set_header Host $host;
    proxy_cache_bypass $http_upgrade;
}
|
||||
}
|
||||
```
|
||||
|
||||
## Serving Under a Sub-Path
|
||||
|
||||
If you serve LocalAI under a sub-path (e.g., `https://your-domain.com/localai`), you need to:
|
||||
|
||||
1. Configure your reverse proxy to set the `X-Forwarded-Prefix` header to that sub-path (for example, `/localai`)
|
||||
|
||||
Example with Nginx:
|
||||
|
||||
```nginx
|
||||
proxy_set_header X-Forwarded-Prefix /localai;
|
||||
```
|
||||
|
||||
## Testing Your Configuration
|
||||
|
||||
1. Start LocalAI: `local-ai run`
|
||||
2. Configure your reverse proxy as shown above
|
||||
3. Access the Web UI through the proxy
|
||||
4. Check the browser's developer console for any mixed content warnings or failed asset loads
|
||||
5. Verify that the HTML source contains `https://` URLs for static assets
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### Static Assets Not Loading
|
||||
|
||||
- Verify the `X-Forwarded-Proto` header is being forwarded
|
||||
- Check that the header value is exactly `https` (lowercase)
|
||||
- Inspect the network tab in your browser to see which requests are failing
|
||||
|
||||
### Mixed Content Warnings
|
||||
|
||||
- Ensure LocalAI is generating HTTPS URLs (check the BaseURL middleware is working)
|
||||
- Verify the `X-Forwarded-Proto` header is set before LocalAI processes the request
|
||||
|
||||
### Redirect Loops
|
||||
|
||||
- Check that your proxy is not adding duplicate headers
|
||||
- Verify `X-Forwarded-Proto` is not being set to both `http` and `https`
|
||||
|
||||
## Security Note
|
||||
|
||||
When using reverse proxies, ensure your proxy only accepts connections from trusted sources and properly validates SSL certificates. Never expose LocalAI directly to the internet without TLS termination.
|
||||
@@ -31,6 +31,15 @@ GPT and text generation models might have a license which is not permissive for
|
||||
|
||||
Navigate the WebUI interface in the "Models" section from the navbar at the top. Here you can find a list of models that can be installed, and you can install them by clicking the "Install" button.
|
||||
|
||||
## VRAM and download size estimates
|
||||
|
||||
When browsing the gallery or importing a model by URI, LocalAI can show **estimated download size** and **estimated VRAM** for models.
|
||||
|
||||
- **Where they appear**: In the model gallery table (Size / VRAM column), in the model detail modal, and after starting an import from URI (in the success message).
|
||||
- **How they are computed**: GGUF models use file size (HTTP HEAD or local stat) and optional GGUF metadata (HTTP Range) for KV cache and overhead; other formats use Hugging Face file sizes and optional config when available. If metadata is unavailable, a size-only heuristic is used.
|
||||
- **Hardware fit indicator**: When your system reports GPU or RAM capacity, the gallery shows whether the estimated VRAM fits (green) or may not fit (red). An estimate is considered to fit when it does not exceed 95% of the reported total memory, which leaves 5% headroom.
|
||||
- Estimates are best-effort and may be missing if the server does not support HEAD/Range or the request times out.
|
||||
|
||||
## Add other galleries
|
||||
|
||||
You can add other galleries by:
|
||||
|
||||
@@ -139,17 +139,16 @@ podman run -ti --name local-ai -p 8080:8080 --device gpu.intel.com/all localai/l
|
||||
|
||||
For a more manageable setup, especially with persistent volumes, use Docker Compose or Podman Compose:
|
||||
|
||||
### Using CDI (Container Device Interface) - Recommended for NVIDIA Container Toolkit 1.14+
|
||||
|
||||
The CDI approach is recommended for newer versions of the NVIDIA Container Toolkit (1.14 and later). It provides better compatibility and is the future-proof method:
|
||||
|
||||
```yaml
|
||||
version: "3.9"
|
||||
services:
|
||||
api:
|
||||
image: localai/localai:latest-aio-cpu
|
||||
# For GPU support, use one of:
|
||||
# image: localai/localai:latest-aio-gpu-nvidia-cuda-13
|
||||
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
|
||||
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
|
||||
# image: localai/localai:latest-aio-gpu-hipblas
|
||||
# image: localai/localai:latest-aio-gpu-intel
|
||||
image: localai/localai:latest-aio-gpu-nvidia-cuda-12
|
||||
# For CUDA 13, use: localai/localai:latest-aio-gpu-nvidia-cuda-13
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
|
||||
interval: 1m
|
||||
@@ -161,14 +160,15 @@ services:
|
||||
- DEBUG=false
|
||||
volumes:
|
||||
- ./models:/models:cached
|
||||
# For NVIDIA GPUs, uncomment:
|
||||
# deploy:
|
||||
# resources:
|
||||
# reservations:
|
||||
# devices:
|
||||
# - driver: nvidia
|
||||
# count: 1
|
||||
# capabilities: [gpu]
|
||||
# CDI driver configuration (recommended for NVIDIA Container Toolkit 1.14+)
|
||||
# This uses the nvidia.com/gpu resource API
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia.com/gpu
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
```
|
||||
|
||||
Save this as `compose.yaml` and run:
|
||||
@@ -179,6 +179,37 @@ docker compose up -d
|
||||
podman-compose up -d
|
||||
```
|
||||
|
||||
### Using Legacy NVIDIA Driver - For Older NVIDIA Container Toolkit
|
||||
|
||||
If you are using an older version of the NVIDIA Container Toolkit (before 1.14), or need backward compatibility, use the legacy approach:
|
||||
|
||||
```yaml
|
||||
version: "3.9"
|
||||
services:
|
||||
api:
|
||||
image: localai/localai:latest-aio-gpu-nvidia-cuda-12
|
||||
# For CUDA 13, use: localai/localai:latest-aio-gpu-nvidia-cuda-13
|
||||
healthcheck:
|
||||
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
|
||||
interval: 1m
|
||||
timeout: 20m
|
||||
retries: 5
|
||||
ports:
|
||||
- 8080:8080
|
||||
environment:
|
||||
- DEBUG=false
|
||||
volumes:
|
||||
- ./models:/models:cached
|
||||
# Legacy NVIDIA driver configuration (for older NVIDIA Container Toolkit)
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
```
|
||||
|
||||
## Persistent Storage
|
||||
|
||||
To persist models and configurations, mount a volume:
|
||||
@@ -244,6 +275,35 @@ After installation:
|
||||
- For NVIDIA: Install [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html)
|
||||
- For AMD: Ensure devices are accessible: `ls -la /dev/kfd /dev/dri`
|
||||
|
||||
### NVIDIA Container fails to start with "Auto-detected mode as 'legacy'" error
|
||||
|
||||
If you encounter this error:
|
||||
```
|
||||
Error response from daemon: failed to create task for container: failed to create shim task: OCI runtime create failed: runc create failed: unable to start container process: error during container init: error running prestart hook #0: exit status 1, stdout: , stderr: Auto-detected mode as 'legacy'
|
||||
nvidia-container-cli: requirement error: invalid expression
|
||||
```
|
||||
|
||||
This indicates a Docker/NVIDIA Container Toolkit configuration issue. The container runtime's prestart hook fails before LocalAI starts. This is **not** a LocalAI code bug.
|
||||
|
||||
**Solutions:**
|
||||
|
||||
1. **Use CDI mode (recommended)**: Update your docker-compose.yaml to use the CDI driver configuration:
|
||||
```yaml
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia.com/gpu
|
||||
count: all
|
||||
capabilities: [gpu]
|
||||
```
|
||||
|
||||
2. **Upgrade NVIDIA Container Toolkit**: Ensure you have version 1.14 or later, which has better CDI support.
|
||||
|
||||
3. **Check NVIDIA Container Toolkit configuration**: Run `nvidia-container-cli info` on the host to verify that the toolkit can see your driver and GPUs outside of containers.
|
||||
|
||||
4. **Verify Docker GPU access**: Test with `docker run --rm --gpus all nvidia/cuda:12.0.0-base-ubuntu22.04 nvidia-smi`
|
||||
|
||||
### Models not downloading
|
||||
|
||||
- Check internet connection
|
||||
|
||||
@@ -1,4 +1,166 @@
|
||||
---
|
||||
- name: "qwen3.5-397b-a17b"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF
|
||||
description: |
|
||||
AI model: qwen3.5-397b-a17b
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
|
||||
name: Qwen3.5-397B-A17B-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
mmproj: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
description: Imported from https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
|
||||
sha256: 1300b09fae0f87ee8dc10f2b17e0070eaf73a3561e8664a3fa307fcad50c55e3
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00001-of-00006.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00002-of-00006.gguf
|
||||
sha256: 2bc58495b9108480cd9f3ceea0c323ddcb9fceffe354e56b71d48ef01c35ef60
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00002-of-00006.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00003-of-00006.gguf
|
||||
sha256: 64954cb1376d1de1778ddad0c8231f4bbd15492627caf118a685ae475d3efa81
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00003-of-00006.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00004-of-00006.gguf
|
||||
sha256: 554485298f616b0ff59e1ec2982167d55bece87f682827c68a32acd0fd03425f
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00004-of-00006.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00005-of-00006.gguf
|
||||
sha256: 24d6f5668ea2c6eaddde5f08ea6325b495bc66be7217bb2de0a5c8b5eace1c51
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00005-of-00006.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-397B-A17B-Q4_K_M-00006-of-00006.gguf
|
||||
sha256: e36715e951da55d9e48b40aab61ba7829a7bfad5c6a155eb79aa13fe8b39347f
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/Q4_K_M/Qwen3.5-397B-A17B-Q4_K_M-00006-of-00006.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
sha256: e47df150363dd9d53b4ddf01e5477a6803f7fc2d2e0341064dcf39511ad5f110
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-397B-A17B-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "qwen3.5-27b"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/Qwen3.5-27B-GGUF
|
||||
description: |
|
||||
AI model: qwen3.5-27b
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.5-27B-Q4_K_M.gguf
|
||||
name: Qwen3.5-27B-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
mmproj: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
description: Imported from https://huggingface.co/unsloth/Qwen3.5-27B-GGUF
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.5-27B-Q4_K_M.gguf
|
||||
sha256: 728960e4dda52d4f2af5bee09b2cbe86addfa93220fe9324bfac9dc727605c17
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/Qwen3.5-27B-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
sha256: c4efc54971085f29eecd433a8fba3edd2890584dfa2fc978933d1dd193f174dd
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-27B-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "qwen3.5-122b-a10b"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF
|
||||
description: |
|
||||
AI model: qwen3.5-122b-a10b
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
|
||||
name: Qwen3.5-122B-A10B-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
mmproj: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
description: Imported from https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
|
||||
sha256: 914ac4aea369a78a16db389cd11293bd7ed4d2fe7960cdc7bc5140b21e5d8074
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00001-of-00003.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf
|
||||
sha256: 073b82aaccefa6b360d4220299e488dc8810ad76d286b282c44ec374534e41d4
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00002-of-00003.gguf
|
||||
- filename: llama-cpp/models/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf
|
||||
sha256: 0c9eed4a95f8fac03cb57e3fb63a49dcf400f958d86a387b98f0e9b4fbb54fd6
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/Q4_K_M/Qwen3.5-122B-A10B-Q4_K_M-00003-of-00003.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
sha256: ba889ce164a6cc7ffe34296851d0f2bbe139bd27deeb7fe3830d08bd776a28a6
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-122B-A10B-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "qwen3.5-35b-a3b"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF
|
||||
description: |
|
||||
AI model: qwen3.5-35b-a3b
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
|
||||
name: Qwen3.5-35B-A3B-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
mmproj: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
description: Imported from https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
|
||||
sha256: 223138866b87b12e68ffb43a1d45afb572921e9cd4c594e6a736df94c5130466
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/Qwen3.5-35B-A3B-UD-Q4_K_M.gguf
|
||||
- filename: llama-cpp/mmproj/mmproj-F32.gguf
|
||||
sha256: 40169fdbd92afb86ef298c8f535353c7cc1307e3493db4359454246bcfc92131
|
||||
uri: https://huggingface.co/unsloth/Qwen3.5-35B-A3B-GGUF/resolve/main/mmproj-F32.gguf
|
||||
- name: "qwen_qwen3-next-80b-a3b-thinking"
|
||||
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
|
||||
description: |
|
||||
AI model: qwen_qwen3-next-80b-a3b-thinking
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-cpp/models/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
|
||||
name: Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
|
||||
backend: llama-cpp
|
||||
template:
|
||||
use_tokenizer_template: true
|
||||
known_usecases:
|
||||
- chat
|
||||
function:
|
||||
grammar:
|
||||
disable: true
|
||||
description: Imported from https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF
|
||||
options:
|
||||
- use_jinja:true
|
||||
files:
|
||||
- filename: llama-cpp/models/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
|
||||
sha256: 83481c75cc6c0837ba9afa52b59b4cd3f85f55dd7aa6c60e27230ff329c81367
|
||||
uri: https://huggingface.co/bartowski/Qwen_Qwen3-Next-80B-A3B-Thinking-GGUF/resolve/main/Qwen_Qwen3-Next-80B-A3B-Thinking-Q4_K_M.gguf
|
||||
- &nanbeige4
|
||||
name: "nanbeige4.1-3b-q8"
|
||||
url: "github:mudler/LocalAI/gallery/nanbeige4.1.yaml@master"
|
||||
|
||||
@@ -275,6 +275,68 @@ func (uri URI) checkSeverSupportsRangeHeader() (bool, error) {
|
||||
return resp.Header.Get("Accept-Ranges") == "bytes", nil
|
||||
}
|
||||
|
||||
// ContentLength returns the size in bytes of the resource at the URI.
|
||||
// For file:// it uses os.Stat on the resolved path; for HTTP/HTTPS it uses HEAD
|
||||
// and optionally a Range request if Content-Length is missing.
|
||||
func (u URI) ContentLength(ctx context.Context) (int64, error) {
|
||||
urlStr := u.ResolveURL()
|
||||
if strings.HasPrefix(string(u), LocalPrefix) {
|
||||
info, err := os.Stat(urlStr)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return info.Size(), nil
|
||||
}
|
||||
if !u.LooksLikeHTTPURL() {
|
||||
return 0, fmt.Errorf("unsupported URI scheme for ContentLength: %s", string(u))
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, "HEAD", urlStr, nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
resp, err := http.DefaultClient.Do(req)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
if resp.StatusCode >= 400 {
|
||||
return 0, fmt.Errorf("HEAD %s: status %d", urlStr, resp.StatusCode)
|
||||
}
|
||||
if resp.ContentLength >= 0 {
|
||||
return resp.ContentLength, nil
|
||||
}
|
||||
if resp.Header.Get("Accept-Ranges") != "bytes" {
|
||||
return 0, fmt.Errorf("HEAD %s: no Content-Length and server does not support Range", urlStr)
|
||||
}
|
||||
req2, err := http.NewRequestWithContext(ctx, "GET", urlStr, nil)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
req2.Header.Set("Range", "bytes=0-0")
|
||||
resp2, err := http.DefaultClient.Do(req2)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
defer resp2.Body.Close()
|
||||
if resp2.StatusCode != http.StatusPartialContent && resp2.StatusCode != http.StatusOK {
|
||||
return 0, fmt.Errorf("Range request %s: status %d", urlStr, resp2.StatusCode)
|
||||
}
|
||||
cr := resp2.Header.Get("Content-Range")
|
||||
// Content-Range: bytes 0-0/12345
|
||||
if cr == "" {
|
||||
return 0, fmt.Errorf("Range request %s: no Content-Range header", urlStr)
|
||||
}
|
||||
parts := strings.Split(cr, "/")
|
||||
if len(parts) != 2 {
|
||||
return 0, fmt.Errorf("invalid Content-Range: %s", cr)
|
||||
}
|
||||
size, err := strconv.ParseInt(strings.TrimSpace(parts[1]), 10, 64)
|
||||
if err != nil || size < 0 {
|
||||
return 0, fmt.Errorf("invalid Content-Range total length: %s", parts[1])
|
||||
}
|
||||
return size, nil
|
||||
}
|
||||
|
||||
func (uri URI) DownloadFile(filePath, sha string, fileN, total int, downloadStatus func(string, string, string, float64)) error {
|
||||
return uri.DownloadFileWithContext(context.Background(), filePath, sha, fileN, total, downloadStatus)
|
||||
}
|
||||
|
||||
@@ -1,12 +1,15 @@
|
||||
package downloader_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/rand"
|
||||
"crypto/sha256"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"strconv"
|
||||
|
||||
@@ -48,6 +51,86 @@ var _ = Describe("Gallery API tests", func() {
|
||||
})
|
||||
})
|
||||
|
||||
var _ = Describe("ContentLength", func() {
|
||||
Context("local file", func() {
|
||||
It("returns file size for existing file", func() {
|
||||
dir, err := os.MkdirTemp("", "contentlength-*")
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
defer os.RemoveAll(dir)
|
||||
fpath := filepath.Join(dir, "model.gguf")
|
||||
err = os.WriteFile(fpath, make([]byte, 1234), 0644)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
uri := URI("file://" + fpath)
|
||||
ctx := context.Background()
|
||||
size, err := uri.ContentLength(ctx)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(size).To(Equal(int64(1234)))
|
||||
})
|
||||
It("returns error for missing file", func() {
|
||||
uri := URI("file:///nonexistent/path/model.gguf")
|
||||
ctx := context.Background()
|
||||
_, err := uri.ContentLength(ctx)
|
||||
Expect(err).To(HaveOccurred())
|
||||
})
|
||||
})
|
||||
Context("HTTP", func() {
|
||||
It("returns Content-Length when present", func() {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
Expect(r.Method).To(Equal("HEAD"))
|
||||
w.Header().Set("Content-Length", "1000")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
uri := URI(server.URL)
|
||||
ctx := context.Background()
|
||||
size, err := uri.ContentLength(ctx)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(size).To(Equal(int64(1000)))
|
||||
})
|
||||
It("returns error on 404", func() {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.WriteHeader(http.StatusNotFound)
|
||||
}))
|
||||
defer server.Close()
|
||||
uri := URI(server.URL)
|
||||
ctx := context.Background()
|
||||
_, err := uri.ContentLength(ctx)
|
||||
Expect(err).To(HaveOccurred())
|
||||
})
|
||||
It("uses Range when Content-Length missing and Accept-Ranges bytes", func() {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
if r.Method == "HEAD" {
|
||||
w.Header().Set("Accept-Ranges", "bytes")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
return
|
||||
}
|
||||
Expect(r.Header.Get("Range")).To(Equal("bytes=0-0"))
|
||||
w.Header().Set("Content-Range", "bytes 0-0/5000")
|
||||
w.WriteHeader(http.StatusPartialContent)
|
||||
}))
|
||||
defer server.Close()
|
||||
uri := URI(server.URL)
|
||||
ctx := context.Background()
|
||||
size, err := uri.ContentLength(ctx)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(size).To(Equal(int64(5000)))
|
||||
})
|
||||
It("respects context cancellation", func() {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Length", "1000")
|
||||
w.WriteHeader(http.StatusOK)
|
||||
}))
|
||||
defer server.Close()
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
uri := URI(server.URL)
|
||||
_, err := uri.ContentLength(ctx)
|
||||
Expect(err).To(HaveOccurred())
|
||||
Expect(errors.Is(err, context.Canceled)).To(BeTrue())
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
// RangeHeaderError is a test-local type holding a single message string.
// NOTE(review): presumably it implements error; its methods are outside this
// excerpt — confirm.
type RangeHeaderError struct {
	msg string // message text carried by the value
}
|
||||
|
||||
@@ -305,6 +305,16 @@ func AllSpace(s string) bool {
|
||||
return strings.TrimSpace(s) == ""
|
||||
}
|
||||
|
||||
// allSpaceOrEscapedNewlines reports whether s is blank once every literal
// two-character escape `\n` and `\r` (backslash followed by a letter, as found
// in escaped JSON or backtick strings) has been removed.
//
// XML tool-call prelude checks use it so that text with escaped newlines
// between tags is treated the same as text with real newlines. The `\n`
// sequences are removed first, then the `\r` sequences.
func allSpaceOrEscapedNewlines(s string) bool {
	stripped := s
	for _, escape := range [...]string{"\\n", "\\r"} {
		stripped = strings.ReplaceAll(stripped, escape, "")
	}
	return len(strings.TrimSpace(stripped)) == 0
}
|
||||
|
||||
// TryConsumeJSON attempts to consume a JSON value from the current position
|
||||
// Returns the parsed JSON (can be object, array, or any JSON type), whether it's partial,
|
||||
// and the jsonDumpMarker (non-empty if JSON was healed)
|
||||
@@ -721,7 +731,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
// No more scopes found, break
|
||||
break
|
||||
}
|
||||
if !AllSpace(tc.Prelude) {
|
||||
if !allSpaceOrEscapedNewlines(tc.Prelude) {
|
||||
// Non-whitespace before scope_start, stop parsing
|
||||
p.MoveTo(tc.Groups[0].Begin - len(tc.Prelude))
|
||||
break
|
||||
@@ -743,7 +753,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
break
|
||||
}
|
||||
|
||||
if !AllSpace(tc.Prelude) {
|
||||
if !allSpaceOrEscapedNewlines(tc.Prelude) {
|
||||
// Non-whitespace before tool_start, stop parsing
|
||||
p.MoveTo(tc.Groups[0].Begin - len(tc.Prelude))
|
||||
break
|
||||
@@ -845,7 +855,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
break
|
||||
}
|
||||
|
||||
if !AllSpace(keyStart.Prelude) {
|
||||
if !allSpaceOrEscapedNewlines(keyStart.Prelude) {
|
||||
// Non-whitespace before key_start, stop parsing parameters
|
||||
p.MoveTo(keyStart.Groups[0].Begin - len(keyStart.Prelude))
|
||||
break
|
||||
@@ -1009,7 +1019,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
// Rewind to json_end and check if val_end follows
|
||||
p.MoveTo(jsonEnd)
|
||||
valEndSize, valEnd := tryFindValEnd()
|
||||
if valEnd != nil && AllSpace(valEnd.Prelude) && jsonHealingMarker == "" {
|
||||
if valEnd != nil && allSpaceOrEscapedNewlines(valEnd.Prelude) && jsonHealingMarker == "" {
|
||||
// val_end follows JSON
|
||||
if len(valEnd.Groups) > 0 {
|
||||
matchedSize := valEnd.Groups[0].End - valEnd.Groups[0].Begin
|
||||
@@ -1105,7 +1115,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
return false, &ChatMsgPartialException{Message: "incomplete tool_call"}
|
||||
}
|
||||
|
||||
if !AllSpace(toolEnd.Prelude) {
|
||||
if !allSpaceOrEscapedNewlines(toolEnd.Prelude) {
|
||||
return returnError(errors.New("non-whitespace before tool_end"), recovery)
|
||||
}
|
||||
|
||||
@@ -1147,7 +1157,7 @@ func (p *ChatMsgParser) TryConsumeXMLToolCalls(format *XMLToolCallFormat) (bool,
|
||||
break
|
||||
}
|
||||
break
|
||||
} else if !AllSpace(tc.Prelude) {
|
||||
} else if !allSpaceOrEscapedNewlines(tc.Prelude) {
|
||||
// Non-whitespace before scope_end - this might be another scope_start
|
||||
// Check if it's actually another scope_start
|
||||
if format.ScopeStart != "" {
|
||||
|
||||
@@ -375,6 +375,20 @@ func getAllXMLFormats() []xmlFormatPreset {
|
||||
TrimRawArgVal: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "qwen3.5",
|
||||
format: &XMLToolCallFormat{
|
||||
ScopeStart: "<tool_call>",
|
||||
ToolStart: "<function=",
|
||||
ToolSep: ">",
|
||||
KeyStart: "<parameter=",
|
||||
KeyValSep: ">",
|
||||
ValEnd: "</parameter>",
|
||||
ToolEnd: "</function>",
|
||||
ScopeEnd: "</tool_call>",
|
||||
TrimRawArgVal: true,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "glm-4.5",
|
||||
format: &XMLToolCallFormat{
|
||||
@@ -483,9 +497,70 @@ func ParseXML(s string, format *XMLToolCallFormat) ([]FuncCallResults, error) {
|
||||
return parseXMLWithFormat(s, format)
|
||||
}
|
||||
|
||||
// getScopeOrToolStart returns the string to search for to start the tool-calls section
|
||||
// (ScopeStart if set, else ToolStart). Used to mimic llama.cpp's "content until <tool_call>" order.
|
||||
func getScopeOrToolStart(format *XMLToolCallFormat) string {
|
||||
if format == nil {
|
||||
return ""
|
||||
}
|
||||
if format.ScopeStart != "" {
|
||||
return format.ScopeStart
|
||||
}
|
||||
return format.ToolStart
|
||||
}
|
||||
|
||||
// tryParseXMLFromScopeStart finds the first occurrence of scopeStart (or format.ToolStart),
|
||||
// splits the input there, and parses only the suffix as XML tool calls. Returns (toolCalls, true)
|
||||
// if any tool calls were parsed, else (nil, false). This mimics llama.cpp's PEG order so that
|
||||
// reasoning or content before the tool block does not cause "whitespace only before scope" to fail.
|
||||
func tryParseXMLFromScopeStart(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, bool) {
|
||||
if format == nil {
|
||||
return nil, false
|
||||
}
|
||||
scopeStart := getScopeOrToolStart(format)
|
||||
if scopeStart == "" {
|
||||
return nil, false
|
||||
}
|
||||
idx := strings.Index(s, scopeStart)
|
||||
if idx < 0 {
|
||||
return nil, false
|
||||
}
|
||||
toolCallsPart := s[idx:]
|
||||
parser := NewChatMsgParser(toolCallsPart, isPartial)
|
||||
success, err := parser.TryConsumeXMLToolCalls(format)
|
||||
if err != nil {
|
||||
if _, ok := err.(*ChatMsgPartialException); ok && isPartial {
|
||||
return parser.ToolCalls(), len(parser.ToolCalls()) > 0
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
if success && len(parser.ToolCalls()) > 0 {
|
||||
return parser.ToolCalls(), true
|
||||
}
|
||||
return nil, false
|
||||
}
|
||||
|
||||
// ParseXMLIterative parses XML tool calls using the iterative parser
|
||||
// This provides better streaming and partial parsing support
|
||||
// This provides better streaming and partial parsing support.
|
||||
// Whether format is set or nil (in which case every preset is tried), it first tries "find scope/tool start, split, parse suffix"
|
||||
// first (llama.cpp PEG order) so that content before the tool block does not cause parse failure.
|
||||
func ParseXMLIterative(s string, format *XMLToolCallFormat, isPartial bool) ([]FuncCallResults, error) {
|
||||
// Try split-on-scope first so reasoning/content before tool block is skipped
|
||||
if format != nil {
|
||||
if results, ok := tryParseXMLFromScopeStart(s, format, isPartial); ok {
|
||||
return results, nil
|
||||
}
|
||||
} else {
|
||||
formats := getAllXMLFormats()
|
||||
for _, fmtPreset := range formats {
|
||||
if fmtPreset.format != nil {
|
||||
if results, ok := tryParseXMLFromScopeStart(s, fmtPreset.format, isPartial); ok {
|
||||
return results, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
parser := NewChatMsgParser(s, isPartial)
|
||||
|
||||
// Auto-detect format if not provided
|
||||
@@ -1621,16 +1696,54 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
|
||||
// but we've already parsed it, so we shouldn't try XML parsing on the same content
|
||||
skipXMLParsing := (len(functionConfig.JSONRegexMatch) > 0 || len(functionConfig.ResponseRegex) > 0) && len(results) > 0
|
||||
if len(results) == 0 && !skipXMLParsing {
|
||||
xmlResults, err := ParseXML(llmresult, xmlFormat)
|
||||
if err == nil && len(xmlResults) > 0 {
|
||||
xlog.Debug("Found XML tool calls", "count", len(xmlResults))
|
||||
results = append(results, xmlResults...)
|
||||
// Mimic llama.cpp PEG order: try "find scope/tool start, split, parse suffix" first so that
|
||||
// reasoning or content before the tool block (e.g. <think>...</think>) does not cause parse failure.
|
||||
if xmlFormat != nil {
|
||||
if xmlResults, ok := tryParseXMLFromScopeStart(llmresult, xmlFormat, false); ok {
|
||||
xlog.Debug("Found XML tool calls (split-on-scope)", "count", len(xmlResults))
|
||||
results = append(results, xmlResults...)
|
||||
}
|
||||
} else {
|
||||
formats := getAllXMLFormats()
|
||||
for _, fmtPreset := range formats {
|
||||
if fmtPreset.format != nil {
|
||||
if xmlResults, ok := tryParseXMLFromScopeStart(llmresult, fmtPreset.format, false); ok {
|
||||
xlog.Debug("Found XML tool calls (split-on-scope, auto-detect)", "format", fmtPreset.name, "count", len(xmlResults))
|
||||
results = append(results, xmlResults...)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(results) == 0 {
|
||||
xmlResults, err := ParseXML(llmresult, xmlFormat)
|
||||
if err == nil && len(xmlResults) > 0 {
|
||||
xlog.Debug("Found XML tool calls", "count", len(xmlResults))
|
||||
results = append(results, xmlResults...)
|
||||
}
|
||||
}
|
||||
} else if len(results) > 0 && !skipXMLParsing {
|
||||
// Even if we found JSON results, check for XML tool calls in the response
|
||||
// This handles mixed content scenarios (text + JSON + XML)
|
||||
// But skip if JSONRegexMatch or ResponseRegex was used (they already extracted the content)
|
||||
xmlResults, err := ParseXML(llmresult, xmlFormat)
|
||||
// Try split-on-scope first (llama.cpp order), then full ParseXML
|
||||
var xmlResults []FuncCallResults
|
||||
var err error
|
||||
if xmlFormat != nil {
|
||||
xmlResults, _ = tryParseXMLFromScopeStart(llmresult, xmlFormat, false)
|
||||
}
|
||||
if len(xmlResults) == 0 && xmlFormat == nil {
|
||||
formats := getAllXMLFormats()
|
||||
for _, fmtPreset := range formats {
|
||||
if fmtPreset.format != nil {
|
||||
xmlResults, _ = tryParseXMLFromScopeStart(llmresult, fmtPreset.format, false)
|
||||
if len(xmlResults) > 0 {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if len(xmlResults) == 0 {
|
||||
xmlResults, err = ParseXML(llmresult, xmlFormat)
|
||||
}
|
||||
if err == nil && len(xmlResults) > 0 {
|
||||
// Check if JSON is inside XML tags, if so, skip it
|
||||
for _, result := range xmlResults {
|
||||
|
||||
@@ -779,6 +779,42 @@ value
|
||||
Expect(results[0].Name).To(Equal("glob"))
|
||||
Expect(results[0].Arguments).To(Equal(`{"pattern":"**/package.json"}`))
|
||||
})
|
||||
It("should parse tool calls when reasoning (<think>) precedes tool block (Qwen3.5-style)", func() {
|
||||
input := `<think>
|
||||
I need to run a command.
|
||||
</think>
|
||||
<tool_call>
|
||||
<function=bash>
|
||||
<parameter=script>
|
||||
echo hello
|
||||
</parameter>
|
||||
</function>
|
||||
</tool_call>`
|
||||
cfg := FunctionsConfig{}
|
||||
results := ParseFunctionCall(input, cfg)
|
||||
Expect(results).To(HaveLen(1))
|
||||
Expect(results[0].Name).To(Equal("bash"))
|
||||
Expect(results[0].Arguments).To(ContainSubstring("echo hello"))
|
||||
})
|
||||
|
||||
// Same input as the auto-detect spec with the identical "(Qwen3.5-style)"
// description, but with the XML format preset selected explicitly; the
// description says so to keep the two specs distinguishable in reports.
It("should parse tool calls when reasoning (<think>) precedes tool block (explicit qwen3.5 preset)", func() {
	input := `<think>
I need to run a command.
</think>
<tool_call>
<function=bash>
<parameter=script>
echo hello
</parameter>
</function>
</tool_call>`
	cfg := FunctionsConfig{}
	cfg.XMLFormatPreset = "qwen3.5"
	results := ParseFunctionCall(input, cfg)
	Expect(results).To(HaveLen(1))
	Expect(results[0].Name).To(Equal("bash"))
	Expect(results[0].Arguments).To(ContainSubstring("echo hello"))
})
|
||||
|
||||
It("should parse XML tool calls alongside JSON tool calls", func() {
|
||||
input := `{"name": "add", "arguments": {"x": 5, "y": 3}}
|
||||
@@ -1690,6 +1726,24 @@ value
|
||||
// Arguments should contain partial flag
|
||||
Expect(results[0].Arguments).To(ContainSubstring("key"))
|
||||
})
|
||||
It("should return tool call when leading text precedes tool block (real newlines)", func() {
|
||||
input := "The memory reclaimer functionality already exists! Let me examine the watchdog to understand how it works and what might need to be implemented for \"auto-fit\" vs unloading.\n\n<tool_call>\n<function=bash>\n<parameter=script>\ncd /root/worktrees/LocalAI/task_8562 && cat core/application/watchdog.go\n</parameter>\n</function>\n</tool_call>"
|
||||
results, err := ParseXMLIterative(input, nil, true)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(results).NotTo(BeNil())
|
||||
Expect(results).To(HaveLen(1))
|
||||
Expect(results[0].Name).To(Equal("bash"))
|
||||
Expect(results[0].Arguments).To(ContainSubstring("task_8562"))
|
||||
})
|
||||
It("should return tool call when leading text precedes tool block (literal \\n between tags)", func() {
|
||||
input := `The memory reclaimer functionality already exists! Let me examine the watchdog to understand how it works and what might need to be implemented for "auto-fit" vs unloading.\n\n<tool_call>\n<function=bash>\n<parameter=script>\ncd /root/worktrees/LocalAI/task_8562 && cat core/application/watchdog.go\n</parameter>\n</function>\n</tool_call>`
|
||||
results, err := ParseXMLIterative(input, nil, false)
|
||||
Expect(err).NotTo(HaveOccurred())
|
||||
Expect(results).NotTo(BeNil())
|
||||
Expect(results).To(HaveLen(1))
|
||||
Expect(results[0].Name).To(Equal("bash"))
|
||||
Expect(results[0].Arguments).To(ContainSubstring("task_8562"))
|
||||
})
|
||||
})
|
||||
|
||||
Describe("ParseJSONIterative", func() {
|
||||
|
||||
@@ -50,9 +50,9 @@ var (
|
||||
)
|
||||
|
||||
func init() {
|
||||
_, err := os.Stat(filepath.Join("usr", "local", "cuda-13"))
|
||||
_, err := os.Stat(filepath.Join(string(os.PathSeparator), "usr", "local", "cuda-13"))
|
||||
cuda13DirExists = err == nil
|
||||
_, err = os.Stat(filepath.Join("usr", "local", "cuda-12"))
|
||||
_, err = os.Stat(filepath.Join(string(os.PathSeparator), "usr", "local", "cuda-12"))
|
||||
cuda12DirExists = err == nil
|
||||
}
|
||||
|
||||
|
||||
96
pkg/vram/cache.go
Normal file
96
pkg/vram/cache.go
Normal file
@@ -0,0 +1,96 @@
|
||||
package vram
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sync"
|
||||
"time"
|
||||
)
|
||||
|
||||
// defaultEstimateCacheTTL is the TTL used by the package-level default cached
// size resolver and GGUF reader.
const defaultEstimateCacheTTL = 15 * time.Minute
|
||||
|
||||
type sizeCacheEntry struct {
|
||||
size int64
|
||||
err error
|
||||
until time.Time
|
||||
}
|
||||
|
||||
type cachedSizeResolver struct {
|
||||
underlying SizeResolver
|
||||
ttl time.Duration
|
||||
mu sync.Mutex
|
||||
cache map[string]sizeCacheEntry
|
||||
}
|
||||
|
||||
func (c *cachedSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
|
||||
c.mu.Lock()
|
||||
e, ok := c.cache[uri]
|
||||
c.mu.Unlock()
|
||||
if ok && time.Now().Before(e.until) {
|
||||
return e.size, e.err
|
||||
}
|
||||
size, err := c.underlying.ContentLength(ctx, uri)
|
||||
c.mu.Lock()
|
||||
if c.cache == nil {
|
||||
c.cache = make(map[string]sizeCacheEntry)
|
||||
}
|
||||
c.cache[uri] = sizeCacheEntry{size: size, err: err, until: time.Now().Add(c.ttl)}
|
||||
c.mu.Unlock()
|
||||
return size, err
|
||||
}
|
||||
|
||||
type ggufCacheEntry struct {
|
||||
meta *GGUFMeta
|
||||
err error
|
||||
until time.Time
|
||||
}
|
||||
|
||||
type cachedGGUFReader struct {
|
||||
underlying GGUFMetadataReader
|
||||
ttl time.Duration
|
||||
mu sync.Mutex
|
||||
cache map[string]ggufCacheEntry
|
||||
}
|
||||
|
||||
func (c *cachedGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
|
||||
c.mu.Lock()
|
||||
e, ok := c.cache[uri]
|
||||
c.mu.Unlock()
|
||||
if ok && time.Now().Before(e.until) {
|
||||
return e.meta, e.err
|
||||
}
|
||||
meta, err := c.underlying.ReadMetadata(ctx, uri)
|
||||
c.mu.Lock()
|
||||
if c.cache == nil {
|
||||
c.cache = make(map[string]ggufCacheEntry)
|
||||
}
|
||||
c.cache[uri] = ggufCacheEntry{meta: meta, err: err, until: time.Now().Add(c.ttl)}
|
||||
c.mu.Unlock()
|
||||
return meta, err
|
||||
}
|
||||
|
||||
// CachedSizeResolver returns a SizeResolver that caches ContentLength results by URI for the given TTL.
|
||||
func CachedSizeResolver(underlying SizeResolver, ttl time.Duration) SizeResolver {
|
||||
return &cachedSizeResolver{underlying: underlying, ttl: ttl, cache: make(map[string]sizeCacheEntry)}
|
||||
}
|
||||
|
||||
// CachedGGUFReader returns a GGUFMetadataReader that caches ReadMetadata results by URI for the given TTL.
|
||||
func CachedGGUFReader(underlying GGUFMetadataReader, ttl time.Duration) GGUFMetadataReader {
|
||||
return &cachedGGUFReader{underlying: underlying, ttl: ttl, cache: make(map[string]ggufCacheEntry)}
|
||||
}
|
||||
|
||||
// DefaultCachedSizeResolver returns the package-wide cached SizeResolver, built
// from the default implementation with the default TTL (15 min).
// Every call returns the same instance, so its cache is shared and repeated
// size lookups for the same URI are avoided across requests.
func DefaultCachedSizeResolver() SizeResolver {
	return defaultCachedSizeResolver
}
|
||||
|
||||
// DefaultCachedGGUFReader returns the package-wide cached GGUFMetadataReader,
// built from the default implementation with the default TTL (15 min).
// Every call returns the same instance, so its cache is shared and repeated
// GGUF metadata fetches for the same URI are avoided across requests.
func DefaultCachedGGUFReader() GGUFMetadataReader {
	return defaultCachedGGUFReader
}
|
||||
|
||||
// Shared singletons handed out by DefaultCachedSizeResolver and
// DefaultCachedGGUFReader; both wrap the default implementations and use
// defaultEstimateCacheTTL.
var (
	defaultCachedSizeResolver = CachedSizeResolver(defaultSizeResolver{}, defaultEstimateCacheTTL)
	defaultCachedGGUFReader   = CachedGGUFReader(defaultGGUFReader{}, defaultEstimateCacheTTL)
)
|
||||
152
pkg/vram/estimate.go
Normal file
152
pkg/vram/estimate.go
Normal file
@@ -0,0 +1,152 @@
|
||||
package vram
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"path"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
)
|
||||
|
||||
// weightExts is the set of (lower-case) file extensions treated as model weights.
var weightExts = map[string]bool{
	".gguf": true, ".safetensors": true, ".bin": true, ".pt": true,
}

// weightFileExt returns the lower-cased extension of the last slash-separated
// element of nameOrURI.
func weightFileExt(nameOrURI string) string {
	return strings.ToLower(path.Ext(path.Base(nameOrURI)))
}

// stripURIQuery drops everything from the first '?' or '#' onwards, i.e. a URL
// query string and/or fragment.
func stripURIQuery(nameOrURI string) string {
	if i := strings.IndexAny(nameOrURI, "?#"); i >= 0 {
		return nameOrURI[:i]
	}
	return nameOrURI
}

// isWeightFile reports whether nameOrURI names a model weight file, judged by
// its extension (case-insensitive). The raw string is checked first; if that
// does not match, the check is repeated with any query string or fragment
// removed, so download URLs such as ".../model.gguf?download=true" are
// recognised too.
func isWeightFile(nameOrURI string) bool {
	if weightExts[weightFileExt(nameOrURI)] {
		return true
	}
	return weightExts[weightFileExt(stripURIQuery(nameOrURI))]
}

// isGGUF reports whether nameOrURI names a .gguf file, using the same matching
// rules as isWeightFile.
func isGGUF(nameOrURI string) bool {
	const ggufExt = ".gguf"
	if weightFileExt(nameOrURI) == ggufExt {
		return true
	}
	return weightFileExt(stripURIQuery(nameOrURI)) == ggufExt
}
|
||||
|
||||
func Estimate(ctx context.Context, files []FileInput, opts EstimateOptions, sizeResolver SizeResolver, ggufReader GGUFMetadataReader) (EstimateResult, error) {
|
||||
if opts.ContextLength == 0 {
|
||||
opts.ContextLength = 8192
|
||||
}
|
||||
if opts.KVQuantBits == 0 {
|
||||
opts.KVQuantBits = 16
|
||||
}
|
||||
|
||||
var sizeBytes uint64
|
||||
var ggufSize uint64
|
||||
var firstGGUFURI string
|
||||
for i := range files {
|
||||
f := &files[i]
|
||||
if !isWeightFile(f.URI) {
|
||||
continue
|
||||
}
|
||||
sz := f.Size
|
||||
if sz <= 0 && sizeResolver != nil {
|
||||
var err error
|
||||
sz, err = sizeResolver.ContentLength(ctx, f.URI)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
}
|
||||
sizeBytes += uint64(sz)
|
||||
if isGGUF(f.URI) {
|
||||
ggufSize += uint64(sz)
|
||||
if firstGGUFURI == "" {
|
||||
firstGGUFURI = f.URI
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
sizeDisplay := FormatBytes(sizeBytes)
|
||||
|
||||
var vramBytes uint64
|
||||
if ggufSize > 0 {
|
||||
var meta *GGUFMeta
|
||||
if ggufReader != nil && firstGGUFURI != "" {
|
||||
meta, _ = ggufReader.ReadMetadata(ctx, firstGGUFURI)
|
||||
}
|
||||
if meta != nil && (meta.BlockCount > 0 || meta.EmbeddingLength > 0) {
|
||||
nLayers := meta.BlockCount
|
||||
if nLayers == 0 {
|
||||
nLayers = 32
|
||||
}
|
||||
dModel := meta.EmbeddingLength
|
||||
if dModel == 0 {
|
||||
dModel = 4096
|
||||
}
|
||||
headCountKV := meta.HeadCountKV
|
||||
if headCountKV == 0 {
|
||||
headCountKV = meta.HeadCount
|
||||
}
|
||||
if headCountKV == 0 {
|
||||
headCountKV = 8
|
||||
}
|
||||
gpuLayers := opts.GPULayers
|
||||
if gpuLayers <= 0 {
|
||||
gpuLayers = int(nLayers)
|
||||
}
|
||||
ctxLen := opts.ContextLength
|
||||
bKV := uint32(opts.KVQuantBits / 8)
|
||||
if bKV == 0 {
|
||||
bKV = 4
|
||||
}
|
||||
M_model := ggufSize
|
||||
M_KV := uint64(bKV) * uint64(dModel) * uint64(nLayers) * uint64(ctxLen)
|
||||
if headCountKV > 0 && meta.HeadCount > 0 {
|
||||
M_KV = uint64(bKV) * uint64(dModel) * uint64(headCountKV) * uint64(ctxLen)
|
||||
}
|
||||
P := M_model * 2
|
||||
M_overhead := uint64(0.02*float64(P) + 0.15*1e9)
|
||||
vramBytes = M_model + M_KV + M_overhead
|
||||
if nLayers > 0 && gpuLayers < int(nLayers) {
|
||||
layerRatio := float64(gpuLayers) / float64(nLayers)
|
||||
vramBytes = uint64(layerRatio*float64(M_model)) + M_KV + M_overhead
|
||||
}
|
||||
} else {
|
||||
vramBytes = sizeOnlyVRAM(ggufSize, opts.ContextLength)
|
||||
}
|
||||
} else if sizeBytes > 0 {
|
||||
vramBytes = sizeOnlyVRAM(sizeBytes, opts.ContextLength)
|
||||
}
|
||||
|
||||
return EstimateResult{
|
||||
SizeBytes: sizeBytes,
|
||||
SizeDisplay: sizeDisplay,
|
||||
VRAMBytes: vramBytes,
|
||||
VRAMDisplay: FormatBytes(vramBytes),
|
||||
}, nil
|
||||
}
|
||||
|
||||
// sizeOnlyVRAM is the metadata-free heuristic: the on-disk size plus 2 KiB
// (2 * 1024 bytes) per context token. If the addition overflows uint64 the
// on-disk size is returned unchanged.
func sizeOnlyVRAM(sizeOnDisk uint64, ctxLen uint32) uint64 {
	const bytesPerTokenUnit = 1024
	total := sizeOnDisk + bytesPerTokenUnit*uint64(ctxLen)*2
	if total >= sizeOnDisk {
		return total
	}
	// Wrapped around: fall back to the size itself.
	return sizeOnDisk
}
|
||||
|
||||
// FormatBytes renders n as a human-readable size using decimal (power-of-1000)
// units: values below 1000 are printed as "<n> B", larger ones with one
// decimal place and a K/M/G/T/P/E prefix, e.g. "2.5 GB".
func FormatBytes(n uint64) string {
	const (
		unit     = 1000
		prefixes = "KMGTPE"
	)
	if n < unit {
		return fmt.Sprintf("%d B", n)
	}

	divisor := uint64(unit)
	idx := 0
	remaining := n / unit
	for remaining >= unit {
		divisor *= unit
		idx++
		remaining /= unit
	}
	return fmt.Sprintf("%.1f %cB", float64(n)/float64(divisor), prefixes[idx])
}
|
||||
|
||||
// defaultSizeResolver is the stateless SizeResolver backed by the downloader
// package.
type defaultSizeResolver struct{}

// ContentLength converts uri to a downloader.URI and delegates to its
// ContentLength method, passing ctx through.
func (defaultSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
	return downloader.URI(uri).ContentLength(ctx)
}
|
||||
|
||||
// DefaultSizeResolver returns a new, uncached defaultSizeResolver.
func DefaultSizeResolver() SizeResolver {
	return defaultSizeResolver{}
}
|
||||
|
||||
// DefaultGGUFReader returns a new, uncached defaultGGUFReader.
func DefaultGGUFReader() GGUFMetadataReader {
	return defaultGGUFReader{}
}
|
||||
137
pkg/vram/estimate_test.go
Normal file
137
pkg/vram/estimate_test.go
Normal file
@@ -0,0 +1,137 @@
|
||||
package vram_test
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
. "github.com/mudler/LocalAI/pkg/vram"
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
type fakeSizeResolver map[string]int64
|
||||
|
||||
func (f fakeSizeResolver) ContentLength(ctx context.Context, uri string) (int64, error) {
|
||||
if n, ok := f[uri]; ok {
|
||||
return int64(n), nil
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// fakeGGUFReader is a test double for GGUFMetadataReader: a lookup table from
// URI to metadata.
type fakeGGUFReader map[string]*GGUFMeta

// ReadMetadata returns the metadata registered for uri (nil for an unknown
// URI) and never an error; ctx is ignored.
func (f fakeGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
	return f[uri], nil
}
|
||||
|
||||
// Specs for Estimate. Sizes come either from FileInput.Size or from the fake
// size resolver; metadata, where used, comes from the fake GGUF reader.
var _ = Describe("Estimate", func() {
	ctx := context.Background()

	Describe("empty or non-GGUF inputs", func() {
		It("returns zero size and vram for nil files", func() {
			opts := EstimateOptions{ContextLength: 8192}
			res, err := Estimate(ctx, nil, opts, nil, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(0)))
			Expect(res.VRAMBytes).To(Equal(uint64(0)))
			Expect(res.SizeDisplay).To(Equal("0 B"))
		})

		It("counts only .gguf files and ignores other extensions", func() {
			// readme.txt is not a weight file, so only the .gguf size is counted.
			files := []FileInput{
				{URI: "http://a/model.gguf", Size: 1_000_000_000},
				{URI: "http://a/readme.txt", Size: 100},
			}
			opts := EstimateOptions{ContextLength: 8192}
			res, err := Estimate(ctx, files, opts, nil, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(1_000_000_000)))
		})

		It("sums size for multiple non-GGUF weight files (e.g. safetensors)", func() {
			files := []FileInput{
				{URI: "http://hf.co/model/model.safetensors", Size: 2_000_000_000},
				{URI: "http://hf.co/model/model2.safetensors", Size: 3_000_000_000},
			}
			opts := EstimateOptions{ContextLength: 8192}
			res, err := Estimate(ctx, files, opts, nil, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(5_000_000_000)))
		})
	})

	Describe("GGUF size and resolver", func() {
		It("uses size resolver when file size is not set", func() {
			// Size is left at zero, so Estimate must ask the resolver.
			sizes := fakeSizeResolver{"http://example.com/model.gguf": 1_500_000_000}
			opts := EstimateOptions{ContextLength: 8192}
			files := []FileInput{{URI: "http://example.com/model.gguf"}}

			res, err := Estimate(ctx, files, opts, sizes, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(1_500_000_000)))
			Expect(res.VRAMBytes).To(BeNumerically(">=", res.SizeBytes))
			Expect(res.SizeDisplay).To(Equal("1.5 GB"))
		})

		It("uses size-only VRAM formula when metadata is missing and size is large", func() {
			// No GGUF reader is supplied, so no metadata is available.
			sizes := fakeSizeResolver{"http://a/model.gguf": 10_000_000_000}
			opts := EstimateOptions{ContextLength: 8192}
			files := []FileInput{{URI: "http://a/model.gguf"}}

			res, err := Estimate(ctx, files, opts, sizes, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.VRAMBytes).To(BeNumerically(">", 10_000_000_000))
		})

		It("sums size for multiple GGUF shards", func() {
			files := []FileInput{
				{URI: "http://a/shard1.gguf", Size: 10_000_000_000},
				{URI: "http://a/shard2.gguf", Size: 5_000_000_000},
			}
			opts := EstimateOptions{ContextLength: 8192}

			res, err := Estimate(ctx, files, opts, nil, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
		})

		It("formats size display correctly", func() {
			files := []FileInput{{URI: "http://a/model.gguf", Size: 2_500_000_000}}
			opts := EstimateOptions{ContextLength: 8192}

			res, err := Estimate(ctx, files, opts, nil, nil)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeDisplay).To(Equal("2.5 GB"))
		})
	})

	Describe("GGUF with metadata reader", func() {
		It("uses metadata for VRAM when reader returns meta and partial offload", func() {
			// GPULayers (20) is below BlockCount (32): the partial-offload path.
			meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096}
			reader := fakeGGUFReader{"http://a/model.gguf": meta}
			opts := EstimateOptions{ContextLength: 8192, GPULayers: 20}
			files := []FileInput{{URI: "http://a/model.gguf", Size: 8_000_000_000}}

			res, err := Estimate(ctx, files, opts, nil, reader)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.VRAMBytes).To(BeNumerically(">", 0))
		})

		It("uses metadata head counts for KV and yields vram > size", func() {
			files := []FileInput{{URI: "http://a/model.gguf", Size: 15_000_000_000}}
			meta := &GGUFMeta{BlockCount: 32, EmbeddingLength: 4096, HeadCount: 32, HeadCountKV: 8}
			reader := fakeGGUFReader{"http://a/model.gguf": meta}
			opts := EstimateOptions{ContextLength: 8192}

			res, err := Estimate(ctx, files, opts, nil, reader)
			Expect(err).ToNot(HaveOccurred())
			Expect(res.SizeBytes).To(Equal(uint64(15_000_000_000)))
			Expect(res.VRAMBytes).To(BeNumerically(">", res.SizeBytes))
		})
	})
})
|
||||
|
||||
// Spec for FormatBytes: decimal units with one decimal place.
var _ = Describe("FormatBytes", func() {
	It("formats 2.5e9 as 2.5 GB", func() {
		Expect(FormatBytes(2_500_000_000)).To(Equal("2.5 GB"))
	})
})
|
||||
46
pkg/vram/gguf_reader.go
Normal file
46
pkg/vram/gguf_reader.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package vram
|
||||
|
||||
import (
|
||||
"context"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/downloader"
|
||||
gguf "github.com/gpustack/gguf-parser-go"
|
||||
)
|
||||
|
||||
// defaultGGUFReader is a stateless type whose ReadMetadata method parses GGUF
// files through the gguf-parser-go package.
type defaultGGUFReader struct{}
|
||||
|
||||
func (defaultGGUFReader) ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error) {
|
||||
u := downloader.URI(uri)
|
||||
urlStr := u.ResolveURL()
|
||||
|
||||
if strings.HasPrefix(uri, downloader.LocalPrefix) {
|
||||
f, err := gguf.ParseGGUFFile(urlStr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return ggufFileToMeta(f), nil
|
||||
}
|
||||
if !u.LooksLikeHTTPURL() {
|
||||
return nil, nil
|
||||
}
|
||||
f, err := gguf.ParseGGUFFileRemote(ctx, urlStr)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return ggufFileToMeta(f), nil
|
||||
}
|
||||
|
||||
func ggufFileToMeta(f *gguf.GGUFFile) *GGUFMeta {
|
||||
arch := f.Architecture()
|
||||
meta := &GGUFMeta{
|
||||
BlockCount: uint32(arch.BlockCount),
|
||||
EmbeddingLength: uint32(arch.EmbeddingLength),
|
||||
HeadCount: uint32(arch.AttentionHeadCount),
|
||||
HeadCountKV: uint32(arch.AttentionHeadCountKV),
|
||||
}
|
||||
if meta.HeadCountKV == 0 {
|
||||
meta.HeadCountKV = meta.HeadCount
|
||||
}
|
||||
return meta
|
||||
}
|
||||
42
pkg/vram/types.go
Normal file
42
pkg/vram/types.go
Normal file
@@ -0,0 +1,42 @@
|
||||
package vram
|
||||
|
||||
import "context"
|
||||
|
||||
// FileInput describes one model file passed to the estimator.
type FileInput struct {
	// URI identifies the file.
	URI string
	// Size is the file size in bytes when it is already known; it is optional.
	Size int64
}
|
||||
|
||||
// SizeResolver looks up how large the content behind a URI is.
type SizeResolver interface {
	// ContentLength reports the content length of uri in bytes.
	ContentLength(ctx context.Context, uri string) (int64, error)
}
|
||||
|
||||
// GGUFMeta is the subset of parsed GGUF metadata that VRAM estimation consumes.
type GGUFMeta struct {
	// BlockCount is the architecture's block count.
	BlockCount uint32
	// EmbeddingLength is the architecture's embedding length.
	EmbeddingLength uint32
	// HeadCount is the architecture's attention head count.
	HeadCount uint32
	// HeadCountKV is the architecture's key/value attention head count.
	HeadCountKV uint32
}
|
||||
|
||||
// GGUFMetadataReader reads GGUF metadata from a URI (e.g. via HTTP Range).
|
||||
type GGUFMetadataReader interface {
|
||||
ReadMetadata(ctx context.Context, uri string) (*GGUFMeta, error)
|
||||
}
|
||||
|
||||
// EstimateOptions carries the tunables for a VRAM/size estimation.
type EstimateOptions struct {
	// ContextLength is the context length to estimate for.
	// NOTE(review): presumably measured in tokens — confirm against Estimate.
	ContextLength uint32
	// GPULayers is the GPU layer setting.
	// NOTE(review): presumably the number of layers offloaded to the GPU;
	// the meaning of the zero value is defined by Estimate — confirm.
	GPULayers int
	// KVQuantBits is the KV quantization setting.
	// NOTE(review): presumably the bit width of KV cache entries — confirm.
	KVQuantBits int
}
|
||||
|
||||
// EstimateResult reports the estimated download size and VRAM requirement,
// each as a raw byte count plus a display string.
type EstimateResult struct {
	// SizeBytes is the estimated download size in bytes.
	SizeBytes uint64
	// SizeDisplay is the display string for the size estimate.
	SizeDisplay string
	// VRAMBytes is the estimated VRAM requirement in bytes.
	VRAMBytes uint64
	// VRAMDisplay is the display string for the VRAM estimate.
	VRAMDisplay string
}
|
||||
13
pkg/vram/vram_suite_test.go
Normal file
13
pkg/vram/vram_suite_test.go
Normal file
@@ -0,0 +1,13 @@
|
||||
package vram_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
. "github.com/onsi/ginkgo/v2"
|
||||
. "github.com/onsi/gomega"
|
||||
)
|
||||
|
||||
func TestVram(t *testing.T) {
|
||||
RegisterFailHandler(Fail)
|
||||
RunSpecs(t, "Vram test suite")
|
||||
}
|
||||
@@ -2183,6 +2183,18 @@ const docTemplate = `{
|
||||
"schema.GalleryResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"estimated_size_bytes": {
|
||||
"type": "integer"
|
||||
},
|
||||
"estimated_size_display": {
|
||||
"type": "string"
|
||||
},
|
||||
"estimated_vram_bytes": {
|
||||
"type": "integer"
|
||||
},
|
||||
"estimated_vram_display": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
|
||||
@@ -2176,6 +2176,18 @@
|
||||
"schema.GalleryResponse": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"estimated_size_bytes": {
|
||||
"type": "integer"
|
||||
},
|
||||
"estimated_size_display": {
|
||||
"type": "string"
|
||||
},
|
||||
"estimated_vram_bytes": {
|
||||
"type": "integer"
|
||||
},
|
||||
"estimated_vram_display": {
|
||||
"type": "string"
|
||||
},
|
||||
"status": {
|
||||
"type": "string"
|
||||
},
|
||||
|
||||
@@ -444,6 +444,14 @@ definitions:
|
||||
type: object
|
||||
schema.GalleryResponse:
|
||||
properties:
|
||||
estimated_size_bytes:
|
||||
type: integer
|
||||
estimated_size_display:
|
||||
type: string
|
||||
estimated_vram_bytes:
|
||||
type: integer
|
||||
estimated_vram_display:
|
||||
type: string
|
||||
status:
|
||||
type: string
|
||||
uuid:
|
||||
|
||||
Reference in New Issue
Block a user