Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
27d7ada8dd feat(l4t): add support for extras images
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 11:53:07 +01:00
178 changed files with 3867 additions and 245724 deletions

View File

@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model_id": "facebook/musicgen-small",
"text": "Exciting 80s Newscast Interstitial",
"duration_seconds": 8
}
}

View File

@@ -0,0 +1,17 @@
meta {
name: backend monitor
type: http
seq: 4
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
body: json
auth: none
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -0,0 +1,21 @@
meta {
name: backend-shutdown
type: http
seq: 3
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}"
}
}

View File

@@ -0,0 +1,5 @@
{
"version": "1",
"name": "LocalAI Test Requests",
"type": "collection"
}

View File

@@ -0,0 +1,6 @@
vars {
HOST: localhost
PORT: 8080
DEFAULT_MODEL: gpt-3.5-turbo
PROTOCOL: http://
}

View File

@@ -0,0 +1,11 @@
meta {
name: get models list
type: http
seq: 2
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
body: none
auth: none
}

View File

@@ -0,0 +1,25 @@
meta {
name: Generate image
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"prompt": "<positive prompt>|<negative prompt>",
"model": "model-name",
"step": 51,
"size": "1024x1024",
"image": ""
}
}

View File

@@ -0,0 +1,24 @@
meta {
name: -completions
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"prompt": "function downloadFile(string url, string outputPath) {",
"max_tokens": 256,
"temperature": 0.5
}
}

View File

@@ -0,0 +1,23 @@
meta {
name: -edits
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "What day of the wek is it?",
"instruction": "Fix the spelling mistakes"
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: -embeddings
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}

View File

@@ -0,0 +1,30 @@
meta {
name: chat completion -simple- 1 message-
type: http
seq: 4
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [
{
"role": "user",
"content": "How could one use friction to cook an egg?"
}
],
"max_tokens": 256,
"temperature": 0.2,
"grammar": ""
}
}

View File

@@ -0,0 +1,29 @@
meta {
name: chat-completions -long-
type: http
seq: 5
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
{"role": "user", "content": "How could one use electricity to cook an egg?"},
{"role": "assistant",
"content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
},
{"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
"max_tokens": 1024,
"temperature": 0.5
}
}

View File

@@ -0,0 +1,25 @@
meta {
name: chat-completions -stream-
type: http
seq: 6
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
"max_tokens": 256,
"temperature": 0.9,
"stream": true
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: add model gallery
type: http
seq: 10
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
"name": "test"
}
}

View File

@@ -0,0 +1,21 @@
meta {
name: delete model gallery
type: http
seq: 11
}
delete {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"name": "test"
}
}

View File

@@ -0,0 +1,11 @@
meta {
name: list MODELS in galleries
type: http
seq: 7
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
body: none
auth: none
}

View File

@@ -0,0 +1,11 @@
meta {
name: list model GALLERIES
type: http
seq: 8
}
get {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -0,0 +1,11 @@
meta {
name: model delete
type: http
seq: 7
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
body: none
auth: none
}

View File

@@ -0,0 +1,21 @@
meta {
name: model gallery apply -gist-
type: http
seq: 12
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
}
}

View File

@@ -0,0 +1,22 @@
meta {
name: model gallery apply
type: http
seq: 9
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
"name": "codellama7b"
}
}

View File

Binary file not shown.

View File

@@ -0,0 +1,16 @@
meta {
name: transcribe
type: http
seq: 1
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
body: multipartForm
auth: none
}
body:multipart-form {
file: @file(transcription/gb1.ogg)
model: whisper-1
}

View File

@@ -0,0 +1,22 @@
meta {
name: -tts
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"model": "{{DEFAULT_MODEL}}",
"input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
}
}

View File

@@ -0,0 +1,23 @@
meta {
name: musicgen
type: http
seq: 2
}
post {
url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
body: json
auth: none
}
headers {
Content-Type: application/json
}
body:json {
{
"backend": "transformers",
"model": "facebook/musicgen-small",
"input": "80s Synths playing Jazz"
}
}

2
.github/labeler.yml vendored
View File

@@ -1,4 +1,4 @@
enhancement:
enhancements:
- head-branch: ['^feature', 'feature']
dependencies:

View File

@@ -9,7 +9,7 @@ jobs:
fail-fast: false
matrix:
include:
- repository: "ggml-org/llama.cpp"
- repository: "ggerganov/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "ggerganov/whisper.cpp"

View File

@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build-api
- name: rm
uses: appleboy/ssh-action@v1.2.2
uses: appleboy/ssh-action@v1.2.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}
@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.2.2
uses: appleboy/ssh-action@v1.2.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}

View File

@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
on:
workflow_dispatch:
schedule:
# daily at midnight
- cron: '0 0 * * *'
push:
branches:
- master
concurrency:
group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
matrix:
include:
- grpc-base-image: ubuntu:22.04
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
platforms: 'linux/amd64,linux/arm64'
runs-on: ${{matrix.runs-on}}
steps:

View File

@@ -310,11 +310,6 @@ jobs:
tags: ${{ steps.meta_aio_dockerhub.outputs.tags }}
labels: ${{ steps.meta_aio_dockerhub.outputs.labels }}
- name: Cleanup
run: |
docker builder prune -f
docker system prune --force --volumes --all
- name: Latest tag
# run this on branches, when it is a tag and there is a latest-image defined
if: github.event_name != 'pull_request' && inputs.latest-image != '' && github.ref_type == 'tag'

View File

@@ -354,12 +354,14 @@ FROM requirements-drivers
ARG FFMPEG
ARG BUILD_TYPE
ARG BUILD_PLATFORM
ARG TARGETARCH
ARG IMAGE_TYPE=extras
ARG EXTRA_BACKENDS
ARG MAKEFLAGS
ENV BUILD_TYPE=${BUILD_TYPE}
ENV BUILD_PLATFORM=${BUILD_PLATFORM}
ENV REBUILD=false
ENV HEALTHCHECK_ENDPOINT=http://localhost:8080/readyz
ENV MAKEFLAGS=${MAKEFLAGS}

View File

@@ -1,6 +1,6 @@
MIT License
Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal

View File

@@ -6,7 +6,9 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=10f2e81809bbb69ecfe64fc8b4686285f84b0c07
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=5598f475be3e31430fbe17ebb85654ec90dc201e
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -22,7 +24,7 @@ BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=19d876ee300a055629926ff836489901f734f2b7
STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -149,6 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -185,6 +188,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -218,6 +222,19 @@ endif
all: help
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -293,17 +310,19 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -311,6 +330,7 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) build
@@ -414,7 +434,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -429,7 +449,8 @@ test: prepare test-models/testmodel.ggml grpcs
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
@@ -458,6 +479,10 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -735,6 +760,13 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/

View File

@@ -212,7 +212,7 @@ A huge thank you to our generous sponsors who support this project covering CI e
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img height="200" src="https://github.com/user-attachments/assets/72eab1dd-8b93-4fc0-9ade-84db49f24962">
<img height="200" src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img height="200" src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -129,7 +129,7 @@ detect_gpu
detect_gpu_size
PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
check_vars

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -1,8 +0,0 @@
backend: silero-vad
name: silero-vad
parameters:
model: silero-vad.onnx
download_files:
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808

View File

@@ -165,6 +165,7 @@ message Reply {
message GrammarTrigger {
string word = 1;
bool at_start = 2;
}
message ModelOptions {
@@ -228,11 +229,6 @@ message ModelOptions {
int32 MaxModelLen = 54;
int32 TensorParallelSize = 55;
string LoadFormat = 58;
bool DisableLogStatus = 66;
string DType = 67;
int32 LimitImagePerPrompt = 68;
int32 LimitVideoPerPrompt = 69;
int32 LimitAudioPerPrompt = 70;
string MMProj = 41;

View File

@@ -469,7 +469,7 @@ struct llama_server_context
bool has_eos_token = true;
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_triggers;
std::vector<common_grammar_trigger> grammar_trigger_words;
int32_t n_ctx; // total context for all clients / slots
@@ -709,7 +709,7 @@ struct llama_server_context
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
slot->sparams.grammar_triggers = grammar_triggers;
slot->sparams.grammar_trigger_words = grammar_trigger_words;
slot->sparams.grammar_lazy = grammar_lazy;
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
@@ -1155,14 +1155,6 @@ struct llama_server_context
slot.has_next_token = false;
}
if (slot.n_past >= slot.n_ctx) {
slot.truncated = true;
slot.stopped_limit = true;
slot.has_next_token = false;
LOG_VERBOSE("stopped due to running out of context capacity", {});
}
if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
{
slot.stopped_eos = true;
@@ -1350,7 +1342,7 @@ struct llama_server_context
queue_results.send(res);
}
void send_embedding(llama_client_slot &slot, const llama_batch & batch)
void send_embedding(llama_client_slot &slot)
{
task_result res;
res.id = slot.task_id;
@@ -1372,38 +1364,10 @@ struct llama_server_context
else
{
const float *data = llama_get_embeddings(ctx);
std::vector<float> embd_res(n_embd, 0.0f);
std::vector<std::vector<float>> embedding;
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
}
if (embd == NULL) {
LOG("failed to get embeddings");
continue;
}
// normalize only when there is pooling
// TODO: configurable
if (llama_pooling_type(ctx) != LLAMA_POOLING_TYPE_NONE) {
common_embd_normalize(embd, embd_res.data(), n_embd, 2);
embedding.push_back(embd_res);
} else {
embedding.push_back({ embd, embd + n_embd });
}
}
// OAI compat
std::vector<float> embedding(data, data + n_embd);
res.result_json = json
{
{"embedding", embedding[0] },
{"embedding", embedding },
};
}
queue_results.send(res);
@@ -1663,17 +1627,17 @@ struct llama_server_context
{
if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
{
// this check is redundant (for good)
// we should never get here, because generation should already stopped in process_token()
// START LOCALAI changes
// Temporary disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
// See: https://github.com/mudler/LocalAI/issues/1333
// Context is exhausted, release the slot
slot.release();
send_final_response(slot);
slot.has_next_token = false;
LOG_ERROR("context is exhausted, release the slot", {});
slot.cache_tokens.clear();
slot.n_past = 0;
slot.truncated = false;
slot.has_next_token = true;
LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
continue;
// END LOCALAI changes
@@ -2024,7 +1988,7 @@ struct llama_server_context
// prompt evaluated for embedding
if (slot.embedding)
{
send_embedding(slot, batch_view);
send_embedding(slot);
slot.release();
slot.i_batch = -1;
continue;
@@ -2421,12 +2385,12 @@ static void params_parse(const backend::ModelOptions* request,
llama.grammar_lazy = true;
for (int i = 0; i < request->grammartriggers_size(); i++) {
common_grammar_trigger trigger;
trigger.type = COMMON_GRAMMAR_TRIGGER_TYPE_WORD;
trigger.value = request->grammartriggers(i).word();
// trigger.at_start = request->grammartriggers(i).at_start();
llama.grammar_triggers.push_back(trigger);
trigger.word = request->grammartriggers(i).word();
trigger.at_start = request->grammartriggers(i).at_start();
llama.grammar_trigger_words.push_back(trigger);
LOG_INFO("grammar trigger", {
{ "word", trigger.value },
{ "word", trigger.word },
{ "at_start", trigger.at_start }
});
}
}

View File

@@ -1,13 +1,13 @@
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 7f892beb..0517e529 100644
index 3cd0d2fa..6c5e811a 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2766,7 +2766,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
int patch_offset = ctx->has_class_embedding ? 1 : 0;
@@ -2608,7 +2608,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
int* patches_data = (int*)malloc(ggml_nbytes(patches));
for (int i = 0; i < num_patches; i++) {
- patches_data[i] = i + patch_offset;
+ patches_data[i] = i + 1;
- patches_data[i] = i + 1;
+ patches_data[i] = i;
}
ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
free(patches_data);

View File

@@ -1,7 +1,5 @@
#!/bin/bash
set -e
## Patches
## Apply patches from the `patches` directory
for patch in $(ls patches); do

View File

@@ -35,8 +35,6 @@ const char* sample_method_str[] = {
"ipndm",
"ipndm_v",
"lcm",
"ddim_trailing",
"tcd",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
@@ -175,7 +173,6 @@ int gen_image(char *text, char *negativeText, int width, int height, int steps,
-1, //clip_skip
cfg_scale, // sfg_scale
3.5f,
0, // eta
width,
height,
sample_method,

View File

@@ -0,0 +1,204 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -0,0 +1,19 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,2 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi
transformers

View File

@@ -0,0 +1,5 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
torchaudio
transformers
accelerate

View File

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi

View File

@@ -132,11 +132,16 @@ function installRequirements() {
declare -a requirementFiles=(
"${EDIR}/requirements-install.txt"
"${EDIR}/requirements.txt"
"${EDIR}/requirements-${BUILD_TYPE}.txt"
)
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
if [ -n "${BUILD_PLATFORM}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}.txt")
else
requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}.txt")
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}.txt")
fi
fi
# if BUILD_TYPE is empty, we are a CPU build, so we should try to install the CPU requirements
@@ -146,8 +151,14 @@ function installRequirements() {
requirementFiles+=("${EDIR}/requirements-after.txt")
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
if [ -n "${BUILD_PLATFORM}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PLATFORM}-after.txt")
else
if [ "x${BUILD_TYPE}" != "x${BUILD_PROFILE}" ]; then
requirementFiles+=("${EDIR}/requirements-${BUILD_PROFILE}-after.txt")
else
requirementFiles+=("${EDIR}/requirements-${BUILD_TYPE}-after.txt")
fi
fi
for reqFile in ${requirementFiles[@]}; do

View File

@@ -1,3 +1,3 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
grpcio-tools

View File

@@ -1,4 +1,4 @@
transformers==4.48.3
transformers
accelerate
torch==2.4.1
coqui-tts

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118
torchaudio==2.4.1+cu118
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -1,5 +1,5 @@
torch==2.4.1
torchaudio==2.4.1
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -1,6 +1,6 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0
torchaudio==2.4.1+rocm6.0
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -5,6 +5,6 @@ torchaudio==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools
transformers==4.48.3
transformers
accelerate
coqui-tts

View File

@@ -0,0 +1,6 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
torchaudio
transformers
accelerate
coqui-tts

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi
packaging==24.1

View File

@@ -159,18 +159,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
torchType = torch.float16
variant = "fp16"
options = request.Options
# empty dict
self.options = {}
# The options are a list of strings in this form optname:optvalue
# We are storing all the options in a dict so we can use it later when
# generating the images
for opt in options:
key, value = opt.split(":")
self.options[key] = value
local = False
modelFile = request.Model
@@ -453,9 +441,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
# create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
kwargs = {key: options.get(key) for key in keys if key in options}
# populate kwargs from self.options.
kwargs.update(self.options)
# Set seed
if request.seed > 0:
kwargs["generator"] = torch.Generator(device=self.device).manual_seed(

View File

@@ -0,0 +1,10 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
diffusers
opencv-python
transformers
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -1,5 +1,5 @@
setuptools
grpcio==1.71.0
grpcio==1.70.0
pillow
protobuf
certifi

View File

@@ -0,0 +1,4 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
transformers
accelerate

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi
wheel

View File

@@ -0,0 +1,9 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
faster-whisper
opencv-python
accelerate
compel
peft
sentencepiece
optimum-quanto

View File

@@ -1,3 +1,3 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
grpcio-tools

View File

@@ -0,0 +1,3 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
transformers

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
phonemizer
scipy

View File

@@ -0,0 +1,5 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
transformers
accelerate
torch
rerankers[transformers]

View File

@@ -1,3 +1,3 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi

View File

@@ -0,0 +1,9 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
torch
accelerate
llvmlite==0.43.0
numba==0.60.0
transformers
bitsandbytes
outetts
sentence-transformers==3.4.1

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi
setuptools

View File

@@ -109,17 +109,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
engine_args.swap_space = request.SwapSpace
if request.MaxModelLen != 0:
engine_args.max_model_len = request.MaxModelLen
if request.DisableLogStatus:
engine_args.disable_log_status = request.DisableLogStatus
if request.DType != "":
engine_args.dtype = request.DType
if request.LimitImagePerPrompt != 0 or request.LimitVideoPerPrompt != 0 or request.LimitAudioPerPrompt != 0:
# limit-mm-per-prompt defaults to 1 per modality, based on vLLM docs
engine_args.limit_mm_per_prompt = {
"image": max(request.LimitImagePerPrompt, 1),
"video": max(request.LimitVideoPerPrompt, 1),
"audio": max(request.LimitAudioPerPrompt, 1)
}
try:
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
@@ -280,7 +269,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def load_image(self, image_path: str):
"""
Load an image from the given file path or base64 encoded data.
Args:
image_path (str): The path to the image file or base64 encoded data.
@@ -299,7 +288,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def load_video(self, video_path: str):
"""
Load a video from the given file path.
Args:
video_path (str): The path to the image file.
@@ -346,4 +335,4 @@ if __name__ == "__main__":
)
args = parser.parse_args()
asyncio.run(serve(args.addr))
asyncio.run(serve(args.addr))

View File

@@ -1 +0,0 @@
vllm

View File

@@ -1,3 +1,4 @@
accelerate
torch==2.4.1
transformers
transformers
vllm

View File

@@ -2,4 +2,5 @@
accelerate
torch==2.4.1+cu118
transformers
bitsandbytes
bitsandbytes
vllm

View File

@@ -1,4 +1,5 @@
accelerate
torch==2.4.1
transformers
bitsandbytes
bitsandbytes
vllm

View File

@@ -2,4 +2,5 @@
accelerate
torch==2.4.1+rocm6.0
transformers
bitsandbytes
bitsandbytes
vllm

View File

@@ -6,4 +6,5 @@ transformers
optimum[openvino]
setuptools
bitsandbytes
oneccl_bind_pt==2.3.100+xpu
oneccl_bind_pt==2.3.100+xpu
vllm

View File

@@ -0,0 +1,7 @@
--index-url https://pypi.jetson-ai-lab.dev/jp6/cu126/
accelerate
torch
vllm
transformers
bitsandbytes
flash-attn

View File

@@ -1,4 +1,4 @@
grpcio==1.71.0
grpcio==1.70.0
protobuf
certifi
setuptools

View File

@@ -145,7 +145,13 @@ func New(opts ...config.AppOption) (*Application, error) {
if options.LoadToMemory != nil {
for _, m := range options.LoadToMemory {
cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
cfg, err := application.BackendLoader().LoadBackendConfigFileByName(m, options.ModelPath,
config.LoadOptionDebug(options.Debug),
config.LoadOptionThreads(options.Threads),
config.LoadOptionContextSize(options.ContextSize),
config.LoadOptionF16(options.F16),
config.ModelPath(options.ModelPath),
)
if err != nil {
return nil, err
}

View File

@@ -33,7 +33,7 @@ type TokenUsage struct {
TimingTokenGeneration float64
}
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
// Check if the modelFile exists, if it doesn't try to load it from the gallery
@@ -48,7 +48,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
}
}
opts := ModelOptions(*c, o)
opts := ModelOptions(c, o)
inferenceModel, err := loader.Load(opts...)
if err != nil {
return nil, err
@@ -84,7 +84,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
fn := func() (LLMResponse, error) {
opts := gRPCPredictOpts(*c, loader.ModelPath)
opts := gRPCPredictOpts(c, loader.ModelPath)
opts.Prompt = s
opts.Messages = protoMessages
opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
@@ -116,11 +116,6 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
}
if tokenCallback != nil {
if c.TemplateConfig.ReplyPrefix != "" {
tokenCallback(c.TemplateConfig.ReplyPrefix, tokenUsage)
}
ss := ""
var partialRune []byte
@@ -170,13 +165,8 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
tokenUsage.TimingTokenGeneration = reply.TimingTokenGeneration
tokenUsage.TimingPromptProcessing = reply.TimingPromptProcessing
response := string(reply.Message)
if c.TemplateConfig.ReplyPrefix != "" {
response = c.TemplateConfig.ReplyPrefix + response
}
return LLMResponse{
Response: response,
Response: string(reply.Message),
Usage: tokenUsage,
}, err
}

View File

@@ -122,6 +122,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word,
AtStart: t.AtStart,
})
}
@@ -158,12 +159,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
SwapSpace: int32(c.SwapSpace),
MaxModelLen: int32(c.MaxModelLen),
TensorParallelSize: int32(c.TensorParallelSize),
DisableLogStatus: c.DisableLogStatus,
DType: c.DType,
// LimitMMPerPrompt vLLM
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: c.FlashAttention,
CacheTypeKey: c.CacheTypeK,

View File

@@ -9,10 +9,10 @@ import (
model "github.com/mudler/LocalAI/pkg/model"
)
func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig)
rerankModel, err := loader.Load(opts...)
func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
rerankModel, err := loader.Load(opts...)
if err != nil {
return nil, err
}

View File

@@ -13,6 +13,7 @@ import (
)
func SoundGeneration(
modelFile string,
text string,
duration *float32,
temperature *float32,
@@ -24,9 +25,8 @@ func SoundGeneration(
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig)
opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
soundGenModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
@@ -44,7 +44,7 @@ func SoundGeneration(
res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
Text: text,
Model: backendConfig.Model,
Model: modelFile,
Dst: filePath,
Sample: doSample,
Duration: duration,

View File

@@ -4,17 +4,19 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc"
"github.com/mudler/LocalAI/pkg/model"
model "github.com/mudler/LocalAI/pkg/model"
)
func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
modelFile := backendConfig.Model
var inferenceModel grpc.Backend
var err error
opts := ModelOptions(backendConfig, appConfig)
inferenceModel, err = loader.Load(opts...)
opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
inferenceModel, err = loader.Load(opts...)
if err != nil {
return schema.TokenizeResponse{}, err
}

View File

@@ -47,7 +47,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
tks = append(tks, int(t))
}
tr.Segments = append(tr.Segments,
schema.TranscriptionSegment{
schema.Segment{
Text: s.Text,
Id: int(s.Id),
Start: time.Duration(s.Start),

View File

@@ -14,22 +14,28 @@ import (
)
func ModelTTS(
backend,
text,
modelFile,
voice,
language string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
ttsModel, err := loader.Load(opts...)
bb := backend
if bb == "" {
bb = model.PiperBackend
}
opts := ModelOptions(backendConfig, appConfig, model.WithBackendString(bb), model.WithModel(modelFile))
ttsModel, err := loader.Load(opts...)
if err != nil {
return "", nil, err
}
if ttsModel == nil {
return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
return "", nil, fmt.Errorf("could not load piper model")
}
if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
@@ -39,21 +45,22 @@ func ModelTTS(
fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
filePath := filepath.Join(appConfig.AudioDir, fileName)
// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
// This should be addressed in a follow up PR soon.
// Copying it over nearly verbatim, as TTS backends are not functional without this.
// If the model file is not empty, we pass it joined with the model path
modelPath := ""
// Checking first that it exists and is not outside ModelPath
// TODO: we should actually first check if the modelFile is looking like
// a FS path
mp := filepath.Join(loader.ModelPath, backendConfig.Model)
if _, err := os.Stat(mp); err == nil {
if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
return "", nil, err
if modelFile != "" {
// If the model file is not empty, we pass it joined with the model path
// Checking first that it exists and is not outside ModelPath
// TODO: we should actually first check if the modelFile is looking like
// a FS path
mp := filepath.Join(loader.ModelPath, modelFile)
if _, err := os.Stat(mp); err == nil {
if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
return "", nil, err
}
modelPath = mp
} else {
modelPath = modelFile
}
modelPath = mp
} else {
modelPath = backendConfig.Model // skip this step if it fails?????
}
res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{

View File

@@ -1,38 +0,0 @@
package backend
import (
"context"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/model"
)
func VAD(request *schema.VADRequest,
ctx context.Context,
ml *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig) (*schema.VADResponse, error) {
opts := ModelOptions(backendConfig, appConfig)
vadModel, err := ml.Load(opts...)
if err != nil {
return nil, err
}
req := proto.VADRequest{
Audio: request.Audio,
}
resp, err := vadModel.VAD(ctx, &req)
if err != nil {
return nil, err
}
segments := []schema.VADSegment{}
for _, s := range resp.Segments {
segments = append(segments, schema.VADSegment{Start: s.Start, End: s.End})
}
return &schema.VADResponse{
Segments: segments,
}, nil
}

View File

@@ -86,14 +86,13 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
options := config.BackendConfig{}
options.SetDefaults()
options.Backend = t.Backend
options.Model = t.Model
var inputFile *string
if t.InputFile != "" {
inputFile = &t.InputFile
}
filePath, _, err := backend.SoundGeneration(text,
filePath, _, err := backend.SoundGeneration(t.Model, text,
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)

View File

@@ -52,10 +52,8 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
options := config.BackendConfig{}
options.SetDefaults()
options.Backend = t.Backend
options.Model = t.Model
filePath, _, err := backend.ModelTTS(text, t.Voice, t.Language, ml, opts, options)
filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
if err != nil {
return err
}

View File

@@ -130,28 +130,25 @@ type LLMConfig struct {
TrimSpace []string `yaml:"trimspace"`
TrimSuffix []string `yaml:"trimsuffix"`
ContextSize *int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraAdapters []string `yaml:"lora_adapters"`
LoraScales []float32 `yaml:"lora_scales"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
LoadFormat string `yaml:"load_format"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
DisableLogStatus bool `yaml:"disable_log_stats"` // vLLM
DType string `yaml:"dtype"` // vLLM
LimitMMPerPrompt LimitMMPerPrompt `yaml:"limit_mm_per_prompt"` // vLLM
MMProj string `yaml:"mmproj"`
ContextSize *int `yaml:"context_size"`
NUMA bool `yaml:"numa"`
LoraAdapter string `yaml:"lora_adapter"`
LoraBase string `yaml:"lora_base"`
LoraAdapters []string `yaml:"lora_adapters"`
LoraScales []float32 `yaml:"lora_scales"`
LoraScale float32 `yaml:"lora_scale"`
NoMulMatQ bool `yaml:"no_mulmatq"`
DraftModel string `yaml:"draft_model"`
NDraft int32 `yaml:"n_draft"`
Quantization string `yaml:"quantization"`
LoadFormat string `yaml:"load_format"`
GPUMemoryUtilization float32 `yaml:"gpu_memory_utilization"` // vLLM
TrustRemoteCode bool `yaml:"trust_remote_code"` // vLLM
EnforceEager bool `yaml:"enforce_eager"` // vLLM
SwapSpace int `yaml:"swap_space"` // vLLM
MaxModelLen int `yaml:"max_model_len"` // vLLM
TensorParallelSize int `yaml:"tensor_parallel_size"` // vLLM
MMProj string `yaml:"mmproj"`
FlashAttention bool `yaml:"flash_attention"`
NoKVOffloading bool `yaml:"no_kv_offloading"`
@@ -169,13 +166,6 @@ type LLMConfig struct {
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
}
// LimitMMPerPrompt is a struct that holds the configuration for the limit-mm-per-prompt config in vLLM
type LimitMMPerPrompt struct {
LimitImagePerPrompt int `yaml:"image"`
LimitVideoPerPrompt int `yaml:"video"`
LimitAudioPerPrompt int `yaml:"audio"`
}
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
@@ -213,8 +203,6 @@ type TemplateConfig struct {
Multimodal string `yaml:"multimodal"`
JinjaTemplate bool `yaml:"jinja_template"`
ReplyPrefix string `yaml:"reply_prefix"`
}
func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
@@ -224,15 +212,7 @@ func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
return err
}
*c = BackendConfig(aux)
c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
// Make sure the usecases are valid, we rewrite with what we identified
c.KnownUsecaseStrings = []string{}
for k, usecase := range GetAllBackendConfigUsecases() {
if c.HasUsecases(usecase) {
c.KnownUsecaseStrings = append(c.KnownUsecaseStrings, k)
}
}
return nil
}
@@ -307,8 +287,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
// https://github.com/mudler/LocalAI/issues/2780
defaultMirostat := 0
defaultMirostat := 2
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0
@@ -457,21 +436,19 @@ func (c *BackendConfig) HasTemplate() bool {
type BackendConfigUsecases int
const (
FLAG_ANY BackendConfigUsecases = 0b00000000000
FLAG_CHAT BackendConfigUsecases = 0b00000000001
FLAG_COMPLETION BackendConfigUsecases = 0b00000000010
FLAG_EDIT BackendConfigUsecases = 0b00000000100
FLAG_EMBEDDINGS BackendConfigUsecases = 0b00000001000
FLAG_RERANK BackendConfigUsecases = 0b00000010000
FLAG_IMAGE BackendConfigUsecases = 0b00000100000
FLAG_TRANSCRIPT BackendConfigUsecases = 0b00001000000
FLAG_TTS BackendConfigUsecases = 0b00010000000
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
FLAG_TOKENIZE BackendConfigUsecases = 0b01000000000
FLAG_VAD BackendConfigUsecases = 0b10000000000
FLAG_ANY BackendConfigUsecases = 0b000000000
FLAG_CHAT BackendConfigUsecases = 0b000000001
FLAG_COMPLETION BackendConfigUsecases = 0b000000010
FLAG_EDIT BackendConfigUsecases = 0b000000100
FLAG_EMBEDDINGS BackendConfigUsecases = 0b000001000
FLAG_RERANK BackendConfigUsecases = 0b000010000
FLAG_IMAGE BackendConfigUsecases = 0b000100000
FLAG_TRANSCRIPT BackendConfigUsecases = 0b001000000
FLAG_TTS BackendConfigUsecases = 0b010000000
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
// Common Subsets
FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
FLAG_LLM BackendConfigUsecases = FLAG_CHAT & FLAG_COMPLETION & FLAG_EDIT
)
func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
@@ -486,16 +463,10 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
"FLAG_TTS": FLAG_TTS,
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
"FLAG_TOKENIZE": FLAG_TOKENIZE,
"FLAG_VAD": FLAG_VAD,
"FLAG_LLM": FLAG_LLM,
}
}
func stringToFlag(s string) string {
return "FLAG_" + strings.ToUpper(s)
}
func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
if len(input) == 0 {
return nil
@@ -503,7 +474,7 @@ func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
result := FLAG_ANY
flags := GetAllBackendConfigUsecases()
for _, str := range input {
flag, exists := flags[stringToFlag(str)]
flag, exists := flags["FLAG_"+strings.ToUpper(str)]
if exists {
result |= flag
}
@@ -577,18 +548,5 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
}
}
if (u & FLAG_TOKENIZE) == FLAG_TOKENIZE {
tokenizeCapableBackends := []string{"llama.cpp", "rwkv"}
if !slices.Contains(tokenizeCapableBackends, c.Backend) {
return false
}
}
if (u & FLAG_VAD) == FLAG_VAD {
if c.Backend != "silero-vad" {
return false
}
}
return true
}

View File

@@ -81,10 +81,10 @@ func readMultipleBackendConfigsFromFile(file string, opts ...ConfigLoaderOption)
c := &[]*BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("readMultipleBackendConfigsFromFile cannot read config file %q: %w", file, err)
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("readMultipleBackendConfigsFromFile cannot unmarshal config file %q: %w", file, err)
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
for _, cc := range *c {
@@ -101,10 +101,10 @@ func readBackendConfigFromFile(file string, opts ...ConfigLoaderOption) (*Backen
c := &BackendConfig{}
f, err := os.ReadFile(file)
if err != nil {
return nil, fmt.Errorf("readBackendConfigFromFile cannot read config file %q: %w", file, err)
return nil, fmt.Errorf("cannot read config file: %w", err)
}
if err := yaml.Unmarshal(f, c); err != nil {
return nil, fmt.Errorf("readBackendConfigFromFile cannot unmarshal config file %q: %w", file, err)
return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
}
c.SetDefaults(opts...)
@@ -117,9 +117,7 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
// Load a config file if present after the model name
cfg := &BackendConfig{
PredictionOptions: schema.PredictionOptions{
BasicModelRequest: schema.BasicModelRequest{
Model: modelName,
},
Model: modelName,
},
}
@@ -147,15 +145,6 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
return cfg, nil
}
func (bcl *BackendConfigLoader) LoadBackendConfigFileByNameDefaultOptions(modelName string, appConfig *ApplicationConfig) (*BackendConfig, error) {
return bcl.LoadBackendConfigFileByName(modelName, appConfig.ModelPath,
LoadOptionDebug(appConfig.Debug),
LoadOptionThreads(appConfig.Threads),
LoadOptionContextSize(appConfig.ContextSize),
LoadOptionF16(appConfig.F16),
ModelPath(appConfig.ModelPath))
}
// This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
func (bcl *BackendConfigLoader) LoadMultipleBackendConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
bcl.Lock()
@@ -178,7 +167,7 @@ func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoa
defer bcl.Unlock()
c, err := readBackendConfigFromFile(file, opts...)
if err != nil {
return fmt.Errorf("LoadBackendConfig cannot read config file %q: %w", file, err)
return fmt.Errorf("cannot read config file: %w", err)
}
if c.Validate() {
@@ -335,10 +324,9 @@ func (bcl *BackendConfigLoader) Preload(modelPath string) error {
func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
bcl.Lock()
defer bcl.Unlock()
entries, err := os.ReadDir(path)
if err != nil {
return fmt.Errorf("LoadBackendConfigsFromPath cannot read directory '%s': %w", path, err)
return fmt.Errorf("cannot read directory '%s': %w", path, err)
}
files := make([]fs.FileInfo, 0, len(entries))
for _, entry := range entries {
@@ -356,13 +344,13 @@ func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...
}
c, err := readBackendConfigFromFile(filepath.Join(path, file.Name()), opts...)
if err != nil {
log.Error().Err(err).Str("File Name", file.Name()).Msgf("LoadBackendConfigsFromPath cannot read config file")
log.Error().Err(err).Msgf("cannot read config file: %s", file.Name())
continue
}
if c.Validate() {
bcl.configs[c.Name] = *c
} else {
log.Error().Err(err).Str("Name", c.Name).Msgf("config is not valid")
log.Error().Err(err).Msgf("config is not valid")
}
}

View File

@@ -161,11 +161,10 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
}
// We try to guess only if we don't have a template defined already
guessPath := filepath.Join(modelPath, cfg.ModelFileName())
f, err := gguf.ParseGGUFFile(guessPath)
f, err := gguf.ParseGGUFFile(filepath.Join(modelPath, cfg.ModelFileName()))
if err != nil {
// Only valid for gguf files
log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
log.Debug().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
return
}

View File

@@ -29,8 +29,6 @@ func InstallModelFromGallery(galleries []config.Gallery, name string, basePath s
if err != nil {
return err
}
config.Description = model.Description
config.License = model.License
} else if len(model.ConfigFile) > 0 {
// TODO: is this worse than using the override method with a blank cfg yaml?
reYamlConfig, err := yaml.Marshal(model.ConfigFile)
@@ -116,7 +114,7 @@ func FindModel(models []*GalleryModel, name string, basePath string) *GalleryMod
// List available models
// Models galleries are a list of yaml files that are hosted on a remote server (for example github).
// Each yaml file contains a list of models that can be downloaded and optionally overrides to define a new model setting.
func AvailableGalleryModels(galleries []config.Gallery, basePath string) (GalleryModels, error) {
func AvailableGalleryModels(galleries []config.Gallery, basePath string) ([]*GalleryModel, error) {
var models []*GalleryModel
// Get models from galleries

View File

@@ -48,10 +48,8 @@ var _ = Describe("Model test", func() {
defer os.RemoveAll(tempdir)
gallery := []GalleryModel{{
Metadata: Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
Name: "bert",
URL: bertEmbeddingsURL,
}}
out, err := yaml.Marshal(gallery)
Expect(err).ToNot(HaveOccurred())

View File

@@ -11,14 +11,6 @@ import (
// It is used to install the model by resolving the URL and downloading the files.
// The other fields are used to override the configuration of the model.
type GalleryModel struct {
Metadata `json:",inline" yaml:",inline"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
}
type Metadata struct {
URL string `json:"url,omitempty" yaml:"url,omitempty"`
Name string `json:"name,omitempty" yaml:"name,omitempty"`
Description string `json:"description,omitempty" yaml:"description,omitempty"`
@@ -26,6 +18,10 @@ type Metadata struct {
URLs []string `json:"urls,omitempty" yaml:"urls,omitempty"`
Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
Tags []string `json:"tags,omitempty" yaml:"tags,omitempty"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
// AdditionalFiles are used to add additional files to the model
AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
// Gallery is a reference to the gallery which contains the model
@@ -62,15 +58,3 @@ func (gm GalleryModels) FindByName(name string) *GalleryModel {
}
return nil
}
func (gm GalleryModels) Paginate(pageNum int, itemsNum int) GalleryModels {
start := (pageNum - 1) * itemsNum
end := start + itemsNum
if start > len(gm) {
start = len(gm)
}
if end > len(gm) {
end = len(gm)
}
return gm[start:end]
}

View File

@@ -9,11 +9,7 @@ import (
var _ = Describe("Gallery API tests", func() {
Context("requests", func() {
It("parses github with a branch", func() {
req := GalleryModel{
Metadata: Metadata{
URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
},
}
req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
e, err := GetGalleryConfigFromURL(req.URL, "")
Expect(err).ToNot(HaveOccurred())
Expect(e.Name).To(Equal("gpt4all-j"))

View File

@@ -130,6 +130,7 @@ func API(application *application.Application) (*fiber.App, error) {
return metricsService.Shutdown()
})
}
}
// Health Checks should always be exempt from auth, so register these first
routes.HealthRoutes(router)
@@ -139,28 +140,6 @@ func API(application *application.Application) (*fiber.App, error) {
return nil, fmt.Errorf("failed to create key auth config: %w", err)
}
httpFS := http.FS(embedDirStatic)
router.Use(favicon.New(favicon.Config{
URL: "/favicon.ico",
FileSystem: httpFS,
File: "static/favicon.ico",
}))
router.Use("/static", filesystem.New(filesystem.Config{
Root: httpFS,
PathPrefix: "static",
Browse: true,
}))
if application.ApplicationConfig().ImageDir != "" {
router.Static("/generated-images", application.ApplicationConfig().ImageDir)
}
if application.ApplicationConfig().AudioDir != "" {
router.Static("/generated-audio", application.ApplicationConfig().AudioDir)
}
// Auth is applied to _all_ endpoints. No exceptions. Filtering out endpoints to bypass is the role of the Filter property of the KeyAuth Configuration
router.Use(v2keyauth.New(*kaConfig))
@@ -188,15 +167,27 @@ func API(application *application.Application) (*fiber.App, error) {
galleryService := services.NewGalleryService(application.ApplicationConfig())
galleryService.Start(application.ApplicationConfig().Context, application.BackendLoader())
requestExtractor := middleware.NewRequestExtractor(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
routes.RegisterElevenLabsRoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
routes.RegisterLocalAIRoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
routes.RegisterOpenAIRoutes(router, requestExtractor, application)
routes.RegisterElevenLabsRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
routes.RegisterLocalAIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
routes.RegisterOpenAIRoutes(router, application)
if !application.ApplicationConfig().DisableWebUI {
routes.RegisterUIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
}
routes.RegisterJINARoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
routes.RegisterJINARoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
httpFS := http.FS(embedDirStatic)
router.Use(favicon.New(favicon.Config{
URL: "/favicon.ico",
FileSystem: httpFS,
File: "static/favicon.ico",
}))
router.Use("/static", filesystem.New(filesystem.Config{
Root: httpFS,
PathPrefix: "static",
Browse: true,
}))
// Define a custom 404 handler
// Note: keep this at the bottom!

View File

@@ -299,18 +299,14 @@ var _ = Describe("API test", func() {
g := []gallery.GalleryModel{
{
Metadata: gallery.Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
Name: "bert",
URL: bertEmbeddingsURL,
},
{
Metadata: gallery.Metadata{
Name: "bert2",
URL: bertEmbeddingsURL,
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
},
Overrides: map[string]interface{}{"foo": "bar"},
Name: "bert2",
URL: bertEmbeddingsURL,
Overrides: map[string]interface{}{"foo": "bar"},
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
},
}
out, err := yaml.Marshal(g)
@@ -526,6 +522,77 @@ var _ = Describe("API test", func() {
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
})
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
uuid := response["uuid"].(string)
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
By("testing completion")
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
By("testing functions")
resp2, err := client.CreateChatCompletion(
context.TODO(),
openai.ChatCompletionRequest{
Model: "openllama_3b",
Messages: []openai.ChatCompletionMessage{
{
Role: "user",
Content: "What is the weather like in San Francisco (celsius)?",
},
},
Functions: []openai.FunctionDefinition{
openai.FunctionDefinition{
Name: "get_current_weather",
Description: "Get the current weather",
Parameters: jsonschema.Definition{
Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{
"location": {
Type: jsonschema.String,
Description: "The city and state, e.g. San Francisco, CA",
},
"unit": {
Type: jsonschema.String,
Enum: []string{"celcius", "fahrenheit"},
},
},
Required: []string{"location"},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")

47
core/http/ctx/fiber.go Normal file
View File

@@ -0,0 +1,47 @@
package fiberContext
import (
"fmt"
"strings"
"github.com/gofiber/fiber/v2"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/pkg/model"
"github.com/rs/zerolog/log"
)
// ModelFromContext returns the model from the context
// If no model is specified, it will take the first available
// Takes a model string as input which should be the one received from the user request.
// It returns the model name resolved from the context and an error if any.
func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
if ctx.Params("model") != "" {
modelInput = ctx.Params("model")
}
if ctx.Query("model") != "" {
modelInput = ctx.Query("model")
}
// Set model from bearer token, if available
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
// If no model was specified, take the first available
if modelInput == "" && !bearerExists && firstModel {
models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
if len(models) > 0 {
modelInput = models[0]
log.Debug().Msgf("No model specified, using: %s", modelInput)
} else {
log.Debug().Msgf("No model specified, returning error")
return "", fmt.Errorf("no model specified")
}
}
// If a model is found in bearer token takes precedence
if bearerExists {
log.Debug().Msgf("Using model from bearer token: %s", bearer)
modelInput = bearer
}
return modelInput, nil
}

Some files were not shown because too many files have changed in this diff Show More