Mirror of https://github.com/mudler/LocalAI.git (synced 2026-02-03 03:02:38 -05:00)

Compare commits (111 commits)

Commits compared (SHA1):
a1634b219a, 6257e2f510, 65ca754166, a0f0505f0d, be6c4e6061, 1996e6f4c9, 671cd42917, 568a01bf5c, 164abb8c9f, ed2946feac,
bdd351b372, ad5e7d376a, 6e78d8cd9d, 614125f268, f41965bfb5, 85a3cc8d8f, ea8675d473, 08a54c1812, 8c7439b96e, a9e42a76fa,
1a3b3d3e67, 759d35e6b5, 825e85bcc5, 62165d556c, 78459889d8, 0fdc6a92f6, 8586a0167a, f1d16a45c5, 2023627d7f, d5e1958a1f,
f9c58a01d3, 4500650000, 5674e671d0, 0f44c3f69c, f9069daf03, 5f58841a3a, 287200e687, b653883c0a, 6b8a402353, d9b63fae7c,
377cdcabbf, 92a7f40141, e06daf437a, d19bea4af2, fbca9f82fd, 04f284d202, cfd6112256, debc0974a6, 03bbbea039, 55af0b1c68,
c8bfb72104, 1b8a663001, a9abfa2b61, 092bb0bd6b, e28e80857b, 905473c739, aa0564a1c6, 2553de0187, 408dfe62ee, 648ffdf449,
04c0841ca9, 43144c4743, a778668bcd, 4b131a7090, d06a052d54, b5115903bf, afaff175d0, 4686877c6d, e5586e8781, 3acd767ac4,
5488fc3bc1, 0965c6cd68, db704199dc, 2cc3b7128e, 88b99d30bb, 307a835199, f84b55d1ef, 139209353f, a30058b80f, 53f406dc35,
2649407f44, 0a8f627cce, 76d4e88e0c, d4d2a76f8f, 7d306c6431, 44bdacac61, 6bd6e2bdeb, 2908ff3f6b, f19277b8e2, 32de75c683,
164a9e972f, d747f2c89b, 58662db48e, 078942fc9f, 6dfee99575, ad62156d54, 1689740269, 50a3b54e34, e94a50e9db, 4e0f3cc980,
2a8cbad122, 453c45d022, 4550abbfce, f2ba1cfb01, 8c4196faf3, b0f4556c0f, fa5c98549a, 3d12d2037c, d6522e69ca, ef1507d000,
a3d69872e3

.github/ci/modelslist.go (vendored, 7 changed lines)

@@ -6,6 +6,7 @@ import (
"io/ioutil"
"os"

"github.com/microcosm-cc/bluemonday"
"gopkg.in/yaml.v3"
)

@@ -279,6 +280,12 @@ func main() {
return
}

// Ensure that all arbitrary text content is sanitized before display
for i, m := range models {
models[i].Name = bluemonday.StrictPolicy().Sanitize(m.Name)
models[i].Description = bluemonday.StrictPolicy().Sanitize(m.Description)
}

// render the template
data := struct {
Models []*GalleryModel

.github/workflows/deploy-explorer.yaml (vendored, 4 changed lines)

@@ -33,7 +33,7 @@ jobs:
run: |
CGO_ENABLED=0 make build-api
- name: rm
uses: appleboy/ssh-action@v1.0.3
uses: appleboy/ssh-action@v1.1.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}

@@ -53,7 +53,7 @@ jobs:
rm: true
target: ./local-ai
- name: restarting
uses: appleboy/ssh-action@v1.0.3
uses: appleboy/ssh-action@v1.1.0
with:
host: ${{ secrets.EXPLORER_SSH_HOST }}
username: ${{ secrets.EXPLORER_SSH_USERNAME }}

.github/workflows/secscan.yaml (vendored, 2 changed lines)

@@ -18,7 +18,7 @@ jobs:
if: ${{ github.actor != 'dependabot[bot]' }}
- name: Run Gosec Security Scanner
if: ${{ github.actor != 'dependabot[bot]' }}
uses: securego/gosec@v2.21.0
uses: securego/gosec@v2.21.4
with:
# we let the report trigger content trigger a failure using the GitHub Security features.
args: '-no-fail -fmt sarif -out results.sarif ./...'

.github/workflows/test.yml (vendored, 11 changed lines)

@@ -178,13 +178,22 @@ jobs:
uses: actions/checkout@v4
with:
submodules: true
- name: Dependencies
run: |
# Install protoc
curl -L -s https://github.com/protocolbuffers/protobuf/releases/download/v26.1/protoc-26.1-linux-x86_64.zip -o protoc.zip && \
unzip -j -d /usr/local/bin protoc.zip bin/protoc && \
rm protoc.zip
go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
PATH="$PATH:$HOME/go/bin" make protogen-go
- name: Build images
run: |
docker build --build-arg FFMPEG=true --build-arg IMAGE_TYPE=extras --build-arg EXTRA_BACKENDS=rerankers --build-arg MAKEFLAGS="--jobs=5 --output-sync=target" -t local-ai:tests -f Dockerfile .
BASE_IMAGE=local-ai:tests DOCKER_AIO_IMAGE=local-ai-aio:test make docker-aio
- name: Test
run: |
LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
PATH="$PATH:$HOME/go/bin" LOCALAI_MODELS_DIR=$PWD/models LOCALAI_IMAGE_TAG=test LOCALAI_IMAGE=local-ai-aio \
make run-e2e-aio
- name: Setup tmate session if tests fail
if: ${{ failure() }}

@@ -15,8 +15,6 @@ Thank you for your interest in contributing to LocalAI! We appreciate your time
- [Documentation](#documentation)
- [Community and Communication](#community-and-communication)

## Getting Started

### Prerequisites

@@ -54,7 +52,7 @@ If you find a bug, have a feature request, or encounter any issues, please check

## Coding Guidelines

- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like []`golangci-lint`](https://golangci-lint.run) can help you here.
- No specific coding guidelines at the moment. Please make sure the code can be tested. The most popular lint tools like [`golangci-lint`](https://golangci-lint.run) can help you here.

## Testing

@@ -84,5 +82,3 @@ We are welcome the contribution of the documents, please open new PR or create a
- You can reach out via the Github issue tracker.
- Open a new discussion at [Discussion](https://github.com/go-skynet/LocalAI/discussions)
- Join the Discord channel [Discord](https://discord.gg/uJAeKSAGDy)

---

Dockerfile (36 changed lines)

@@ -9,6 +9,8 @@ FROM ${BASE_IMAGE} AS requirements-core
USER root

ARG GO_VERSION=1.22.6
ARG CMAKE_VERSION=3.26.4
ARG CMAKE_FROM_SOURCE=false
ARG TARGETARCH
ARG TARGETVARIANT

@@ -21,13 +23,25 @@ RUN apt-get update && \
build-essential \
ccache \
ca-certificates \
cmake \
curl \
curl libssl-dev \
git \
unzip upx-ucl && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT

# Install Go
RUN curl -L -s https://go.dev/dl/go${GO_VERSION}.linux-${TARGETARCH}.tar.gz | tar -C /usr/local -xz
ENV PATH=$PATH:/root/go/bin:/usr/local/go/bin

@@ -188,6 +202,8 @@ FROM ${GRPC_BASE_IMAGE} AS grpc
# This is a bit of a hack, but it's required in order to be able to effectively cache this layer in CI
ARG GRPC_MAKEFLAGS="-j4 -Otarget"
ARG GRPC_VERSION=v1.65.0
ARG CMAKE_FROM_SOURCE=false
ARG CMAKE_VERSION=3.26.4

ENV MAKEFLAGS=${GRPC_MAKEFLAGS}

@@ -196,12 +212,24 @@ WORKDIR /build
RUN apt-get update && \
apt-get install -y --no-install-recommends \
ca-certificates \
build-essential \
cmake \
build-essential curl libssl-dev \
git && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# Install CMake (the version in 22.04 is too old)
RUN <<EOT bash
if [ "${CMAKE_FROM_SOURCE}}" = "true" ]; then
curl -L -s https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}.tar.gz -o cmake.tar.gz && tar xvf cmake.tar.gz && cd cmake-${CMAKE_VERSION} && ./configure && make && make install
else
apt-get update && \
apt-get install -y \
cmake && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
fi
EOT

# We install GRPC to a different prefix here so that we can copy in only the build artifacts later
# saves several hundred MB on the final docker image size vs copying in the entire GRPC source tree
# and running make install in the target container

Makefile (20 changed lines)

@@ -8,7 +8,7 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=70392f1f81470607ba3afef04aa56c9f65587664
CPPLLAMA_VERSION?=96776405a17034dcfd53d3ddf5d142d34bdbb657

# go-rwkv version
RWKV_REPO?=https://github.com/donomii/go-rwkv.cpp

@@ -16,7 +16,7 @@ RWKV_VERSION?=661e7ae26d442f5cfebd2a0881b44e8c55949ec6

# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=0d2e2aed80109e8696791083bde3b58e190b7812
WHISPER_CPP_VERSION?=fdbfb460ed546452a5d53611bba66d10d842e719

# bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp

@@ -468,15 +468,15 @@ run-e2e-image:
ls -liah $(abspath ./tests/e2e-fixtures)
docker run -p 5390:8080 -e MODELS_PATH=/models -e THREADS=1 -e DEBUG=true -d --rm -v $(TEST_DIR):/models --gpus all --name e2e-tests-$(RANDOM) localai-tests

run-e2e-aio:
run-e2e-aio: protogen-go
@echo 'Running e2e AIO tests'
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e-aio
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e-aio

test-e2e:
@echo 'Running e2e tests'
BUILD_TYPE=$(BUILD_TYPE) \
LOCALAI_API=http://$(E2E_BRIDGE_IP):5390/v1 \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts 5 -v -r ./tests/e2e
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) -v -r ./tests/e2e

teardown-e2e:
rm -rf $(TEST_DIR) || true

@@ -484,24 +484,24 @@ teardown-e2e:

test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts 5 -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts 5 -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-tts: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts 1 -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="tts" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-stablediffusion: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts 1 -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stablediffusion" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)

test-stores: backend-assets/grpc/local-store
mkdir -p tests/integration/backend-assets/grpc
cp -f backend-assets/grpc/local-store tests/integration/backend-assets/grpc/
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts 1 -v -r tests/integration
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="stores" --flake-attempts $(TEST_FLAKES) -v -r tests/integration

test-container:
docker build --target requirements -t local-ai-test-container .

README.md (10 changed lines)

@@ -68,9 +68,7 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu

[💻 Getting started](https://localai.io/basics/getting_started/index.html)

## 🔥🔥 Hot topics / Roadmap

[Roadmap](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)
## 📰 Latest project news

- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)
- July 2024: 🔥🔥 🆕 P2P Dashboard, LocalAI Federated mode and AI Swarms: https://github.com/mudler/LocalAI/pull/2723

@@ -83,8 +81,12 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
- May 2024: Chat, TTS, and Image generation in the WebUI: https://github.com/mudler/LocalAI/pull/2222
- April 2024: Reranker API: https://github.com/mudler/LocalAI/pull/2121

Hot topics (looking for contributors):
Roadmap items: [List of issues](https://github.com/mudler/LocalAI/issues?q=is%3Aissue+is%3Aopen+label%3Aroadmap)

## 🔥🔥 Hot topics (looking for help):

- Multimodal with vLLM and Video understanding: https://github.com/mudler/LocalAI/pull/3729
- Realtime API https://github.com/mudler/LocalAI/issues/3714
- 🔥🔥 Distributed, P2P Global community pools: https://github.com/mudler/LocalAI/issues/3113
- WebUI improvements: https://github.com/mudler/LocalAI/issues/2156
- Backends v2: https://github.com/mudler/LocalAI/issues/1126

@@ -26,6 +26,19 @@ service Backend {
rpc StoresFind(StoresFindOptions) returns (StoresFindResult) {}

rpc Rerank(RerankRequest) returns (RerankResult) {}

rpc GetMetrics(MetricsRequest) returns (MetricsResponse);
}

// Define the empty request
message MetricsRequest {}

message MetricsResponse {
int32 slot_id = 1;
string prompt_json_for_slot = 2; // Stores the prompt as a JSON string.
float tokens_per_second = 3;
int32 tokens_generated = 4;
int32 prompt_tokens_processed = 5;
}

message RerankRequest {

@@ -136,6 +149,7 @@ message PredictOptions {
repeated Message Messages = 44;
repeated string Videos = 45;
repeated string Audios = 46;
string CorrelationId = 47;
}

// The response message containing the result

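The GetMetrics RPC and MetricsResponse message added above expose per-slot throughput data (slot id, prompt JSON, tokens per second, tokens generated, prompt tokens processed). Below is a minimal Go sketch of how a client could call it over gRPC; the proto import path is the one used elsewhere in this changeset, while the NewBackendClient constructor and the getter names are assumptions based on standard protoc-generated bindings, not something shown in the diff.

	package main

	import (
		"context"
		"fmt"
		"log"
		"time"

		"google.golang.org/grpc"
		"google.golang.org/grpc/credentials/insecure"

		// Assumed location of the generated bindings for backend.proto
		// (import path taken from this changeset).
		pb "github.com/mudler/LocalAI/pkg/grpc/proto"
	)

	func main() {
		// Address of a running backend gRPC server (example value).
		conn, err := grpc.Dial("127.0.0.1:50051", grpc.WithTransportCredentials(insecure.NewCredentials()))
		if err != nil {
			log.Fatalf("dial: %v", err)
		}
		defer conn.Close()

		ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
		defer cancel()

		// GetMetrics takes the empty MetricsRequest and returns the active slot's stats.
		res, err := pb.NewBackendClient(conn).GetMetrics(ctx, &pb.MetricsRequest{})
		if err != nil {
			log.Fatalf("GetMetrics: %v", err)
		}

		fmt.Printf("slot %d: %.2f tok/s, %d tokens generated, %d prompt tokens processed\n",
			res.GetSlotId(), res.GetTokensPerSecond(), res.GetTokensGenerated(), res.GetPromptTokensProcessed())
	}
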
@@ -113,7 +113,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
std::string ret;
for (; begin != end; ++begin)
{
ret += llama_token_to_piece(ctx, *begin);
ret += common_token_to_piece(ctx, *begin);
}
return ret;
}

@@ -121,7 +121,7 @@ static std::string tokens_to_str(llama_context *ctx, Iter begin, Iter end)
// format incomplete utf-8 multibyte character for output
static std::string tokens_to_output_formatted_string(const llama_context *ctx, const llama_token token)
{
std::string out = token == -1 ? "" : llama_token_to_piece(ctx, token);
std::string out = token == -1 ? "" : common_token_to_piece(ctx, token);
// if the size is 1 and first bit is 1, meaning it's a partial character
// (size > 1 meaning it's already a known token)
if (out.size() == 1 && (out[0] & 0x80) == 0x80)

@@ -203,8 +203,8 @@ struct llama_client_slot
std::string stopping_word;

// sampling
struct gpt_sampler_params sparams;
gpt_sampler *ctx_sampling = nullptr;
struct common_sampler_params sparams;
common_sampler *ctx_sampling = nullptr;

int32_t ga_i = 0; // group-attention state
int32_t ga_n = 1; // group-attention factor

@@ -257,7 +257,7 @@ struct llama_client_slot
images.clear();
}

bool has_budget(gpt_params &global_params) {
bool has_budget(common_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1)
{
return true; // limitless

@@ -398,7 +398,7 @@ struct llama_server_context

clip_ctx *clp_ctx = nullptr;

gpt_params params;
common_params params;

llama_batch batch;

@@ -441,7 +441,7 @@ struct llama_server_context
}
}

bool load_model(const gpt_params &params_)
bool load_model(const common_params &params_)
{
params = params_;
if (!params.mmproj.empty()) {

@@ -458,9 +458,9 @@ struct llama_server_context
}
}

llama_init_result llama_init = llama_init_from_gpt_params(params);
model = llama_init.model;
ctx = llama_init.context;
common_init_result common_init = common_init_from_params(params);
model = common_init.model;
ctx = common_init.context;
if (model == nullptr)
{
LOG_ERR("unable to load model: %s", params.model.c_str());

@@ -495,6 +495,16 @@ struct llama_server_context
}
}

llama_client_slot* get_active_slot() {
for (llama_client_slot& slot : slots) {
// Check if the slot is currently processing
if (slot.is_processing()) {
return &slot; // Return the active slot
}
}
return nullptr; // No active slot found
}

void initialize() {
// create slots
all_slots_are_idle = true;

@@ -568,12 +578,12 @@ struct llama_server_context
std::vector<llama_token> p;
if (first)
{
p = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
p = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
first = false;
}
else
{
p = ::llama_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
p = common_tokenize(ctx, s, false, TMP_FORCE_SPECIAL);
}
prompt_tokens.insert(prompt_tokens.end(), p.begin(), p.end());
}

@@ -590,7 +600,7 @@ struct llama_server_context
else
{
auto s = json_prompt.template get<std::string>();
prompt_tokens = ::llama_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
prompt_tokens = common_tokenize(ctx, s, add_bos, TMP_FORCE_SPECIAL);
}

return prompt_tokens;

@@ -619,7 +629,7 @@ struct llama_server_context

bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
gpt_sampler_params default_sparams;
common_sampler_params default_sparams;

slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);

@@ -759,7 +769,7 @@ struct llama_server_context
}
else if (el[0].is_string())
{
auto toks = llama_tokenize(model, el[0].get<std::string>(), false);
auto toks = common_tokenize(model, el[0].get<std::string>(), false);
for (auto tok : toks)
{
slot->sparams.logit_bias.push_back({tok, bias});

@@ -791,7 +801,7 @@ struct llama_server_context
sampler_names.emplace_back(name);
}
}
slot->sparams.samplers = gpt_sampler_types_from_names(sampler_names, false);
slot->sparams.samplers = common_sampler_types_from_names(sampler_names, false);
}
else
{

@@ -875,9 +885,9 @@ struct llama_server_context

if (slot->ctx_sampling != nullptr)
{
gpt_sampler_free(slot->ctx_sampling);
common_sampler_free(slot->ctx_sampling);
}
slot->ctx_sampling = gpt_sampler_init(model, slot->sparams);
slot->ctx_sampling = common_sampler_init(model, slot->sparams);
//llama_set_rng_seed(ctx, slot->params.seed);
slot->command = LOAD_PROMPT;

@@ -904,13 +914,13 @@ struct llama_server_context
system_tokens.clear();

if (!system_prompt.empty()) {
system_tokens = ::llama_tokenize(ctx, system_prompt, add_bos_token);
system_tokens = common_tokenize(ctx, system_prompt, add_bos_token);

llama_batch_clear(batch);
common_batch_clear(batch);

for (int i = 0; i < (int)system_tokens.size(); ++i)
{
llama_batch_add(batch, system_tokens[i], i, { 0 }, false);
common_batch_add(batch, system_tokens[i], i, { 0 }, false);
}

for (int32_t i = 0; i < (int32_t) batch.n_tokens; i += params.n_batch)

@@ -999,7 +1009,7 @@ struct llama_server_context

bool process_token(completion_token_output &result, llama_client_slot &slot) {
// remember which tokens were sampled - used for repetition penalties during sampling
const std::string token_str = llama_token_to_piece(ctx, result.tok);
const std::string token_str = common_token_to_piece(ctx, result.tok);
slot.sampled = result.tok;

// search stop word and delete it

@@ -1150,7 +1160,7 @@ struct llama_server_context
samplers.reserve(slot.sparams.samplers.size());
for (const auto & sampler : slot.sparams.samplers)
{
samplers.emplace_back(gpt_sampler_type_to_str(sampler));
samplers.emplace_back(common_sampler_type_to_str(sampler));
}

return json {

@@ -1206,7 +1216,7 @@ struct llama_server_context
if (slot.sparams.n_probs > 0)
{
std::vector<completion_token_output> probs_output = {};
const std::vector<llama_token> to_send_toks = llama_tokenize(ctx, tkn.text_to_send, false);
const std::vector<llama_token> to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
size_t probs_pos = std::min(slot.sent_token_probs_index, slot.generated_token_probs.size());
size_t probs_stop_pos = std::min(slot.sent_token_probs_index + to_send_toks.size(), slot.generated_token_probs.size());
if (probs_pos < probs_stop_pos)

@@ -1258,7 +1268,7 @@ struct llama_server_context
std::vector<completion_token_output> probs = {};
if (!slot.params.stream && slot.stopped_word)
{
const std::vector<llama_token> stop_word_toks = llama_tokenize(ctx, slot.stopping_word, false);
const std::vector<llama_token> stop_word_toks = common_tokenize(ctx, slot.stopping_word, false);
probs = std::vector<completion_token_output>(slot.generated_token_probs.begin(), slot.generated_token_probs.end() - stop_word_toks.size());
}
else

@@ -1398,7 +1408,7 @@ struct llama_server_context
}
image_idx++;

llama_batch_clear(batch);
common_batch_clear(batch);

// append prefix of next image
const auto json_prompt = (image_idx >= (int) slot.images.size()) ?

@@ -1408,7 +1418,7 @@ struct llama_server_context
std::vector<llama_token> append_tokens = tokenize(json_prompt, false); // has next image
for (int i = 0; i < (int) append_tokens.size(); ++i)
{
llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
common_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true);
slot.n_past += 1;
}
}

@@ -1540,7 +1550,7 @@ struct llama_server_context
update_system_prompt();
}

llama_batch_clear(batch);
common_batch_clear(batch);

if (all_slots_are_idle)
{

@@ -1618,7 +1628,7 @@ struct llama_server_context

// TODO: we always have to take into account the "system_tokens"
// this is not great and needs to be improved somehow
llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
common_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true);
slot.n_past += 1;
}

@@ -1712,7 +1722,7 @@ struct llama_server_context

if (!slot.params.cache_prompt)
{
gpt_sampler_reset(slot.ctx_sampling);
common_sampler_reset(slot.ctx_sampling);

slot.n_past = 0;
slot.n_past_se = 0;

@@ -1724,7 +1734,7 @@ struct llama_server_context
// push the prompt into the sampling context (do not apply grammar)
for (auto &token : prompt_tokens)
{
gpt_sampler_accept(slot.ctx_sampling, token, false);
common_sampler_accept(slot.ctx_sampling, token, false);
}

slot.n_past = common_part(slot.cache_tokens, prompt_tokens);

@@ -1816,7 +1826,7 @@ struct llama_server_context
ga_i += ga_w/ga_n;
}
}
llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
common_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false);
slot_npast++;
}

@@ -1933,9 +1943,9 @@ struct llama_server_context
}

completion_token_output result;
const llama_token id = gpt_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);

gpt_sampler_accept(slot.ctx_sampling, id, true);
common_sampler_accept(slot.ctx_sampling, id, true);

slot.n_decoded += 1;
if (slot.n_decoded == 1)

@@ -1946,7 +1956,7 @@ struct llama_server_context
}

result.tok = id;
const auto * cur_p = gpt_sampler_get_candidates(slot.ctx_sampling);
const auto * cur_p = common_sampler_get_candidates(slot.ctx_sampling);

for (size_t i = 0; i < (size_t) slot.sparams.n_probs; ++i) {
result.probs.push_back({

@@ -1999,7 +2009,7 @@ static json format_partial_response(
struct token_translator
{
llama_context * ctx;
std::string operator()(llama_token tok) const { return llama_token_to_piece(ctx, tok); }
std::string operator()(llama_token tok) const { return common_token_to_piece(ctx, tok); }
std::string operator()(const completion_token_output &cto) const { return (*this)(cto.tok); }
};

@@ -2106,6 +2116,9 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
data["ignore_eos"] = predict->ignoreeos();
data["embeddings"] = predict->embeddings();

// Add the correlationid to json data
data["correlation_id"] = predict->correlationid();

// for each image in the request, add the image data
//
for (int i = 0; i < predict->images_size(); i++) {

@@ -2190,7 +2203,7 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, llama
// }

static void params_parse(const backend::ModelOptions* request,
gpt_params & params) {
common_params & params) {

// this is comparable to: https://github.com/ggerganov/llama.cpp/blob/d9b33fe95bd257b36c84ee5769cc048230067d6f/examples/server/server.cpp#L1809

@@ -2298,7 +2311,7 @@ public:

grpc::Status LoadModel(ServerContext* context, const backend::ModelOptions* request, backend::Result* result) {
// Implement LoadModel RPC
gpt_params params;
common_params params;
params_parse(request, params);

llama_backend_init();

@@ -2344,6 +2357,11 @@ public:
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);
reply.set_prompt_tokens(tokens_evaluated);

// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});

// Send the reply
writer->Write(reply);

@@ -2367,6 +2385,12 @@ public:
std::string completion_text;
task_result result = llama.queue_results.recv(task_id);
if (!result.error && result.stop) {

// Log Request Correlation Id
LOG_VERBOSE("correlation:", {
{ "id", data["correlation_id"] }
});

completion_text = result.result_json.value("content", "");
int32_t tokens_predicted = result.result_json.value("tokens_predicted", 0);
int32_t tokens_evaluated = result.result_json.value("tokens_evaluated", 0);

@@ -2406,6 +2430,31 @@ public:

return grpc::Status::OK;
}

grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
llama_client_slot* active_slot = llama.get_active_slot();

if (active_slot != nullptr) {
// Calculate the tokens per second using existing logic
double tokens_per_second = 1e3 / active_slot->t_token_generation * active_slot->n_decoded;

// Populate the response with metrics
response->set_slot_id(active_slot->id);
response->set_prompt_json_for_slot(active_slot->prompt.dump());
response->set_tokens_per_second(tokens_per_second);
response->set_tokens_generated(active_slot->n_decoded);
response->set_prompt_tokens_processed(active_slot->num_prompt_tokens_processed);
} else {
// Handle case when no active slot exists
response->set_slot_id(0);
response->set_prompt_json_for_slot("");
response->set_tokens_per_second(0);
response->set_tokens_generated(0);
response->set_prompt_tokens_processed(0);
}

return grpc::Status::OK;
}
};

void RunServer(const std::string& server_address) {

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
transformers

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi

@@ -1,2 +1,2 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf

@@ -1,4 +1,4 @@
coqui-tts
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(10)
time.sleep(30)

def tearDown(self) -> None:
"""

@@ -1,5 +1,5 @@
setuptools
grpcio==1.66.1
grpcio==1.66.2
pillow
protobuf
certifi

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
wheel

@@ -1,3 +1,3 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi

@@ -2,7 +2,7 @@
intel-extension-for-pytorch
torch
optimum[openvino]
grpcio==1.66.1
grpcio==1.66.2
protobuf
librosa==0.9.1
faster-whisper==1.0.3

@@ -18,6 +18,6 @@ python-dotenv
pypinyin==0.50.0
cn2an==0.5.22
jieba==0.42.1
gradio==4.38.1
gradio==4.44.1
langid==1.1.6
git+https://github.com/myshell-ai/MeloTTS.git

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
librosa
faster-whisper

@@ -19,7 +19,7 @@ class TestBackendServicer(unittest.TestCase):
This method sets up the gRPC service by starting the server
"""
self.service = subprocess.Popen(["python3", "backend.py", "--addr", "localhost:50051"])
time.sleep(10)
time.sleep(30)

def tearDown(self) -> None:
"""

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
llvmlite==0.43.0

@@ -1,3 +1,3 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
datasets

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
scipy==1.14.0
certifi

@@ -72,7 +72,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
Returns:
A Result object that contains the result of the LoadModel operation.
"""

model_name = request.Model

# Check to see if the Model exists in the filesystem already.
if os.path.exists(request.ModelFile):
model_name = request.ModelFile

compute = torch.float16
if request.F16Memory == True:

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

@@ -1,3 +1,3 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi

@@ -5,6 +5,8 @@ import argparse
import signal
import sys
import os
from typing import List
from PIL import Image

import backend_pb2
import backend_pb2_grpc

@@ -15,6 +17,8 @@ from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.sampling_params import SamplingParams
from vllm.utils import random_uuid
from vllm.transformers_utils.tokenizer import get_tokenizer
from vllm.multimodal.utils import fetch_image
from vllm.assets.video import VideoAsset

_ONE_DAY_IN_SECONDS = 60 * 60 * 24

@@ -105,6 +109,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
try:
self.llm = AsyncLLMEngine.from_engine_args(engine_args)
except Exception as err:
print(f"Unexpected {err=}, {type(err)=}", file=sys.stderr)
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

try:

@@ -117,7 +122,7 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
)
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")

print("Model loaded successfully", file=sys.stderr)
return backend_pb2.Result(message="Model loaded successfully", success=True)

async def Predict(self, request, context):

@@ -196,15 +201,33 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if request.Seed != 0:
sampling_params.seed = request.Seed

# Extract image paths and process images
prompt = request.Prompt

# If tokenizer template is enabled and messages are provided instead of prompt apply the tokenizer template

image_paths = request.Images
image_data = [self.load_image(img_path) for img_path in image_paths]

videos_path = request.Videos
video_data = [self.load_video(video_path) for video_path in videos_path]

# If tokenizer template is enabled and messages are provided instead of prompt, apply the tokenizer template
if not request.Prompt and request.UseTokenizerTemplate and request.Messages:
prompt = self.tokenizer.apply_chat_template(request.Messages, tokenize=False, add_generation_prompt=True)

# Generate text
# Generate text using the LLM engine
request_id = random_uuid()
outputs = self.llm.generate(prompt, sampling_params, request_id)
print(f"Generating text with request_id: {request_id}", file=sys.stderr)
outputs = self.llm.generate(
{
"prompt": prompt,
"multi_modal_data": {
"image": image_data if image_data else None,
"video": video_data if video_data else None,
} if image_data or video_data else None,
},
sampling_params=sampling_params,
request_id=request_id,
)

# Stream the results
generated_text = ""

@@ -227,9 +250,49 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
if streaming:
return

# Remove the image files from /tmp folder
for img_path in image_paths:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)

# Sending the final generated text
yield backend_pb2.Reply(message=bytes(generated_text, encoding='utf-8'))

def load_image(self, image_path: str):
"""
Load an image from the given file path.

Args:
image_path (str): The path to the image file.

Returns:
Image: The loaded image.
"""
try:
return Image.open(image_path)
except Exception as e:
print(f"Error loading image {image_path}: {e}", file=sys.stderr)
return self.load_video(image_path)

def load_video(self, video_path: str):
"""
Load a video from the given file path.

Args:
video_path (str): The path to the image file.

Returns:
Video: The loaded video.
"""
try:
video = VideoAsset(name=video_path).np_ndarrays
return video
except Exception as e:
print(f"Error loading video {image_path}: {e}", file=sys.stderr)
return None

async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))

@@ -13,4 +13,20 @@ if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi

installRequirements
# We don't embed this into the images as it is a large dependency and not always needed.
# Besides, the speed inference are not actually usable in the current state for production use-cases.
if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
ensureVenv
# https://docs.vllm.ai/en/v0.6.1/getting_started/cpu-installation.html
if [ ! -d vllm ]; then
git clone https://github.com/vllm-project/vllm
fi
pushd vllm
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.66.2 protobuf bitsandbytes
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=cpu python setup.py install
popd
rm -rf vllm
else
installRequirements
fi

@@ -1,4 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/cu118
accelerate
torch
transformers
transformers
bitsandbytes

@@ -1,3 +1,4 @@
accelerate
torch
transformers
transformers
bitsandbytes

@@ -1,4 +1,5 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
accelerate
torch
transformers
transformers
bitsandbytes

@@ -4,4 +4,5 @@ accelerate
torch
transformers
optimum[openvino]
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
setuptools==75.1.0 # https://github.com/mudler/LocalAI/issues/2406
bitsandbytes

@@ -1,4 +1,4 @@
grpcio==1.66.1
grpcio==1.66.2
protobuf
certifi
setuptools

@@ -10,20 +10,11 @@ import (
)

func ModelEmbedding(s string, tokens []int, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() ([]float32, error), error) {
modelFile := backendConfig.Model

grpcOpts := GRPCModelOpts(backendConfig)

var inferenceModel interface{}
var err error

opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(*backendConfig.Threads)),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
})
opts := ModelOptions(backendConfig, appConfig, []model.Option{})

if backendConfig.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)

@@ -8,19 +8,8 @@ import (
)

func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (func() error, error) {
threads := backendConfig.Threads
if *threads == 0 && appConfig.Threads != 0 {
threads = &appConfig.Threads
}
gRPCOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(backendConfig.Backend),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithThreads(uint32(*threads)),
model.WithContext(appConfig.Context),
model.WithModel(backendConfig.Model),
model.WithLoadGRPCLoadModelOpts(gRPCOpts),
})

opts := ModelOptions(backendConfig, appConfig, []model.Option{})

inferenceModel, err := loader.BackendLoader(
opts...,

@@ -33,22 +33,11 @@ type TokenUsage struct {

func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
modelFile := c.Model
threads := c.Threads
if *threads == 0 && o.Threads != 0 {
threads = &o.Threads
}
grpcOpts := GRPCModelOpts(c)

var inferenceModel grpc.Backend
var err error

opts := modelOpts(c, o, []model.Option{
model.WithLoadGRPCLoadModelOpts(grpcOpts),
model.WithThreads(uint32(*threads)), // some models uses this to allocate threads during startup
model.WithAssetDir(o.AssetsDestination),
model.WithModel(modelFile),
model.WithContext(o.Context),
})
opts := ModelOptions(c, o, []model.Option{})

if c.Backend != "" {
opts = append(opts, model.WithBackendString(c.Backend))

@@ -11,32 +11,65 @@ import (
"github.com/rs/zerolog/log"
)

func modelOpts(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
func ModelOptions(c config.BackendConfig, so *config.ApplicationConfig, opts []model.Option) []model.Option {
name := c.Name
if name == "" {
name = c.Model
}

defOpts := []model.Option{
model.WithBackendString(c.Backend),
model.WithModel(c.Model),
model.WithAssetDir(so.AssetsDestination),
model.WithContext(so.Context),
model.WithModelID(name),
}

threads := 1

if c.Threads != nil {
threads = *c.Threads
}

if so.Threads != 0 {
threads = so.Threads
}

c.Threads = &threads

grpcOpts := grpcModelOpts(c)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))

if so.SingleBackend {
opts = append(opts, model.WithSingleActiveBackend())
defOpts = append(defOpts, model.WithSingleActiveBackend())
}

if so.ParallelBackendRequests {
opts = append(opts, model.EnableParallelRequests)
defOpts = append(defOpts, model.EnableParallelRequests)
}

if c.GRPC.Attempts != 0 {
opts = append(opts, model.WithGRPCAttempts(c.GRPC.Attempts))
defOpts = append(defOpts, model.WithGRPCAttempts(c.GRPC.Attempts))
}

if c.GRPC.AttemptsSleepTime != 0 {
opts = append(opts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
defOpts = append(defOpts, model.WithGRPCAttemptsDelay(c.GRPC.AttemptsSleepTime))
}

for k, v := range so.ExternalGRPCBackends {
opts = append(opts, model.WithExternalBackend(k, v))
defOpts = append(defOpts, model.WithExternalBackend(k, v))
}

return opts
return append(defOpts, opts...)
}

func getSeed(c config.BackendConfig) int32 {
seed := int32(*c.Seed)
var seed int32 = config.RAND_SEED

if c.Seed != nil {
seed = int32(*c.Seed)
}

if seed == config.RAND_SEED {
seed = rand.Int31()
}

@@ -44,11 +77,47 @@ func getSeed(c config.BackendConfig) int32 {
return seed
}

func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
}

f16 := false
if c.F16 != nil {
f16 = *c.F16
}

embeddings := false
if c.Embeddings != nil {
embeddings = *c.Embeddings
}

lowVRAM := false
if c.LowVRAM != nil {
lowVRAM = *c.LowVRAM
}

mmap := false
if c.MMap != nil {
mmap = *c.MMap
}

ctxSize := 1024
if c.ContextSize != nil {
ctxSize = *c.ContextSize
}

mmlock := false
if c.MMlock != nil {
mmlock = *c.MMlock
}

nGPULayers := 9999999
if c.NGPULayers != nil {
nGPULayers = *c.NGPULayers
}

return &pb.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,

@@ -56,14 +125,14 @@ func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
F16Memory: *c.F16,
F16Memory: f16,
LoraBase: c.LoraBase,
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
ContextSize: int32(*c.ContextSize),
ContextSize: int32(ctxSize),
Seed: getSeed(c),
NBatch: int32(b),
NoMulMatQ: c.NoMulMatQ,

@@ -85,16 +154,16 @@ func GRPCModelOpts(c config.BackendConfig) *pb.ModelOptions {
YarnBetaSlow: c.YarnBetaSlow,
NGQA: c.NGQA,
RMSNormEps: c.RMSNormEps,
MLock: *c.MMlock,
MLock: mmlock,
RopeFreqBase: c.RopeFreqBase,
RopeScaling: c.RopeScaling,
Type: c.ModelType,
RopeFreqScale: c.RopeFreqScale,
NUMA: c.NUMA,
Embeddings: *c.Embeddings,
LowVRAM: *c.LowVRAM,
NGPULayers: int32(*c.NGPULayers),
MMap: *c.MMap,
Embeddings: embeddings,
LowVRAM: lowVRAM,
NGPULayers: int32(nGPULayers),
MMap: mmap,
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,

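The options.go refactor above exports ModelOptions and folds the shared defaults (backend string, model, asset dir, context, model ID, thread resolution and the gRPC model options) into the helper itself, so call sites pass only per-request overrides. A hedged sketch of such a caller follows; the surrounding function and variable names are illustrative, while ModelOptions, BackendLoader, grpc.Backend and model.WhisperBackend all appear in this changeset.

	package example

	import (
		"github.com/mudler/LocalAI/core/backend"
		"github.com/mudler/LocalAI/core/config"
		"github.com/mudler/LocalAI/pkg/grpc"
		model "github.com/mudler/LocalAI/pkg/model"
	)

	// Illustrative caller: backend.ModelOptions fills in the shared defaults,
	// and the call site appends only what differs for this request.
	func loadForTranscription(cfg config.BackendConfig, appCfg *config.ApplicationConfig, loader *model.ModelLoader) (grpc.Backend, error) {
		if cfg.Backend == "" {
			cfg.Backend = model.WhisperBackend // same fallback ModelTranscription uses below
		}

		// An empty override slice keeps just the defaults; extra model.Option
		// values (for example model.WithModel(...)) could be appended here.
		opts := backend.ModelOptions(cfg, appCfg, []model.Option{})

		return loader.BackendLoader(opts...)
	}
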
@@ -9,21 +9,9 @@ import (
model "github.com/mudler/LocalAI/pkg/model"
)

func Rerank(backend, modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
bb := backend
if bb == "" {
return nil, fmt.Errorf("backend is required")
}
func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {

grpcOpts := GRPCModelOpts(backendConfig)

opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})
rerankModel, err := loader.BackendLoader(opts...)
if err != nil {
return nil, err

@@ -13,7 +13,6 @@ import (
)

func SoundGeneration(
backend string,
modelFile string,
text string,
duration *float32,

@@ -25,18 +24,8 @@ func SoundGeneration(
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig,
) (string, *proto.Result, error) {
if backend == "" {
return "", nil, fmt.Errorf("backend is a required parameter")
}

grpcOpts := GRPCModelOpts(backendConfig)
opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(backend),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
opts := ModelOptions(backendConfig, appConfig, []model.Option{model.WithModel(modelFile)})

soundGenModel, err := loader.BackendLoader(opts...)
if err != nil {

core/backend/token_metrics.go (new file, 33 lines)

@@ -0,0 +1,33 @@
package backend

import (
"context"
"fmt"

"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/pkg/grpc/proto"
model "github.com/mudler/LocalAI/pkg/model"
)

func TokenMetrics(
modelFile string,
loader *model.ModelLoader,
appConfig *config.ApplicationConfig,
backendConfig config.BackendConfig) (*proto.MetricsResponse, error) {

opts := ModelOptions(backendConfig, appConfig, []model.Option{
model.WithModel(modelFile),
})
model, err := loader.BackendLoader(opts...)
if err != nil {
return nil, err
}

if model == nil {
return nil, fmt.Errorf("could not loadmodel model")
}

res, err := model.GetTokenMetrics(context.Background(), &proto.MetricsRequest{})

return res, err
}

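A brief, hypothetical wiring of the new TokenMetrics helper from application code; everything except the TokenMetrics signature is illustrative, and the MetricsResponse field names are assumed to follow the generated proto naming.

	package example

	import (
		"github.com/mudler/LocalAI/core/backend"
		"github.com/mudler/LocalAI/core/config"
		model "github.com/mudler/LocalAI/pkg/model"
		"github.com/rs/zerolog/log"
	)

	// Hypothetical caller of backend.TokenMetrics.
	func logTokenMetrics(modelFile string, loader *model.ModelLoader, appCfg *config.ApplicationConfig, cfg config.BackendConfig) error {
		m, err := backend.TokenMetrics(modelFile, loader, appCfg, cfg)
		if err != nil {
			return err
		}
		// Field names assumed from the proto definition (slot_id, tokens_per_second, ...).
		log.Info().
			Int32("slot", m.SlotId).
			Float32("tokens_per_second", m.TokensPerSecond).
			Int32("tokens_generated", m.TokensGenerated).
			Msg("backend metrics")
		return nil
	}
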
core/backend/tokenize.go (new file, 44 lines)

@@ -0,0 +1,44 @@
package backend

import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/grpc"
model "github.com/mudler/LocalAI/pkg/model"
)

func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {

modelFile := backendConfig.Model

var inferenceModel grpc.Backend
var err error

opts := ModelOptions(backendConfig, appConfig, []model.Option{
model.WithModel(modelFile),
})

if backendConfig.Backend == "" {
inferenceModel, err = loader.GreedyLoader(opts...)
} else {
opts = append(opts, model.WithBackendString(backendConfig.Backend))
inferenceModel, err = loader.BackendLoader(opts...)
}
if err != nil {
return schema.TokenizeResponse{}, err
}

predictOptions := gRPCPredictOpts(backendConfig, loader.ModelPath)
predictOptions.Prompt = s

// tokenize the string
resp, err := inferenceModel.TokenizeString(appConfig.Context, predictOptions)
if err != nil {
return schema.TokenizeResponse{}, err
}

return schema.TokenizeResponse{
Tokens: resp.Tokens,
}, nil

}

@@ -14,13 +14,11 @@ import (
)

func ModelTranscription(audio, language string, translate bool, ml *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (*schema.TranscriptionResult, error) {

opts := modelOpts(backendConfig, appConfig, []model.Option{
model.WithBackendString(model.WhisperBackend),
model.WithModel(backendConfig.Model),
model.WithContext(appConfig.Context),
model.WithThreads(uint32(*backendConfig.Threads)),
model.WithAssetDir(appConfig.AssetsDestination),
})
if backendConfig.Backend == "" {
backendConfig.Backend = model.WhisperBackend
}

opts := ModelOptions(backendConfig, appConfig, []model.Option{})

transcriptionModel, err := ml.BackendLoader(opts...)
if err != nil {

@@ -28,14 +28,9 @@ func ModelTTS(
bb = model.PiperBackend
}

grpcOpts := GRPCModelOpts(backendConfig)

opts := modelOpts(config.BackendConfig{}, appConfig, []model.Option{
opts := ModelOptions(config.BackendConfig{}, appConfig, []model.Option{
model.WithBackendString(bb),
model.WithModel(modelFile),
model.WithContext(appConfig.Context),
model.WithAssetDir(appConfig.AssetsDestination),
model.WithLoadGRPCLoadModelOpts(grpcOpts),
})
ttsModel, err := loader.BackendLoader(opts...)
if err != nil {

@@ -85,13 +85,14 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {

options := config.BackendConfig{}
options.SetDefaults()
options.Backend = t.Backend

var inputFile *string
if t.InputFile != "" {
inputFile = &t.InputFile
}

filePath, _, err := backend.SoundGeneration(t.Backend, t.Model, text,
filePath, _, err := backend.SoundGeneration(t.Model, text,
parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)

@@ -15,8 +15,9 @@ import (
)

type UtilCMD struct {
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
GGUFInfo GGUFInfoCMD `cmd:"" name:"gguf-info" help:"Get information about a GGUF file"`
HFScan HFScanCMD `cmd:"" name:"hf-scan" help:"Checks installed models for known security issues. WARNING: this is a best-effort feature and may not catch everything!"`
UsecaseHeuristic UsecaseHeuristicCMD `cmd:"" name:"usecase-heuristic" help:"Checks a specific model config and prints what usecase LocalAI will offer for it."`
}

type GGUFInfoCMD struct {

@@ -30,6 +31,11 @@ type HFScanCMD struct {
ToScan []string `arg:""`
}

type UsecaseHeuristicCMD struct {
ConfigName string `name:"The config file to check"`
ModelsPath string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
}

func (u *GGUFInfoCMD) Run(ctx *cliContext.Context) error {
if u.Args == nil || len(u.Args) == 0 {
return fmt.Errorf("no GGUF file provided")

@@ -99,3 +105,31 @@ func (hfscmd *HFScanCMD) Run(ctx *cliContext.Context) error {
return nil
}
}

func (uhcmd *UsecaseHeuristicCMD) Run(ctx *cliContext.Context) error {
if len(uhcmd.ConfigName) == 0 {
log.Error().Msg("ConfigName is a required parameter")
return fmt.Errorf("config name is a required parameter")
}
if len(uhcmd.ModelsPath) == 0 {
log.Error().Msg("ModelsPath is a required parameter")
return fmt.Errorf("model path is a required parameter")
}
bcl := config.NewBackendConfigLoader(uhcmd.ModelsPath)
err := bcl.LoadBackendConfig(uhcmd.ConfigName)
if err != nil {
log.Error().Err(err).Str("ConfigName", uhcmd.ConfigName).Msg("error while loading backend")
return err
}
bc, exists := bcl.GetBackendConfig(uhcmd.ConfigName)
if !exists {
log.Error().Str("ConfigName", uhcmd.ConfigName).Msg("ConfigName not found")
}
for name, uc := range config.GetAllBackendConfigUsecases() {
if bc.HasUsecases(uc) {
log.Info().Str("Usecase", name)
}
}
log.Info().Msg("---")
return nil
}

@@ -3,11 +3,13 @@ package config
import (
"os"
"regexp"
"slices"
"strings"

"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/functions"
"gopkg.in/yaml.v3"
)

const (

@@ -27,13 +29,15 @@ type BackendConfig struct {
schema.PredictionOptions `yaml:"parameters"`
Name string `yaml:"name"`

F16 *bool `yaml:"f16"`
Threads *int `yaml:"threads"`
Debug *bool `yaml:"debug"`
Roles map[string]string `yaml:"roles"`
Embeddings *bool `yaml:"embeddings"`
Backend string `yaml:"backend"`
TemplateConfig TemplateConfig `yaml:"template"`
F16 *bool `yaml:"f16"`
Threads *int `yaml:"threads"`
Debug *bool `yaml:"debug"`
Roles map[string]string `yaml:"roles"`
Embeddings *bool `yaml:"embeddings"`
Backend string `yaml:"backend"`
TemplateConfig TemplateConfig `yaml:"template"`
KnownUsecaseStrings []string `yaml:"known_usecases"`
KnownUsecases *BackendConfigUsecases `yaml:"-"`

PromptStrings, InputStrings []string `yaml:"-"`
InputToken [][]int `yaml:"-"`

@@ -192,6 +196,21 @@ type TemplateConfig struct {
// JoinChatMessagesByCharacter is a string that will be used to join chat messages together.
// It defaults to \n
JoinChatMessagesByCharacter *string `yaml:"join_chat_messages_by_character"`

Video string `yaml:"video"`
Image string `yaml:"image"`
Audio string `yaml:"audio"`
}

func (c *BackendConfig) UnmarshalYAML(value *yaml.Node) error {
type BCAlias BackendConfig
var aux BCAlias
if err := value.Decode(&aux); err != nil {
return err
}
*c = BackendConfig(aux)
c.KnownUsecases = GetUsecasesFromYAML(c.KnownUsecaseStrings)
return nil
}

|
||||
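The UnmarshalYAML hook above is what turns the free-form known_usecases strings in a model YAML file into the KnownUsecases bitmask. A minimal sketch of exercising it directly, assuming only the types in this diff and the yaml.v3 dependency this file already imports (the model name and backend are hypothetical):

package config_test

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
	"gopkg.in/yaml.v3"
)

func Example_knownUsecases() {
	raw := []byte(`
name: "foo"
backend: "llama-cpp"
known_usecases:
  - chat
  - COMPLETION
`)
	var bc config.BackendConfig
	// yaml.Unmarshal triggers the UnmarshalYAML hook, which fills KnownUsecases
	// from the known_usecases strings (matching is case-insensitive).
	if err := yaml.Unmarshal(raw, &bc); err != nil {
		panic(err)
	}
	fmt.Println(bc.KnownUsecases != nil, bc.HasUsecases(config.FLAG_CHAT))
	// Output: true true
}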
func (c *BackendConfig) SetFunctionCallString(s string) {
|
||||
@@ -410,3 +429,121 @@ func (c *BackendConfig) Validate() bool {
|
||||
func (c *BackendConfig) HasTemplate() bool {
|
||||
return c.TemplateConfig.Completion != "" || c.TemplateConfig.Edit != "" || c.TemplateConfig.Chat != "" || c.TemplateConfig.ChatMessage != ""
|
||||
}
|
||||
|
||||
type BackendConfigUsecases int
|
||||
|
||||
const (
|
||||
FLAG_ANY BackendConfigUsecases = 0b000000000
|
||||
FLAG_CHAT BackendConfigUsecases = 0b000000001
|
||||
FLAG_COMPLETION BackendConfigUsecases = 0b000000010
|
||||
FLAG_EDIT BackendConfigUsecases = 0b000000100
|
||||
FLAG_EMBEDDINGS BackendConfigUsecases = 0b000001000
|
||||
FLAG_RERANK BackendConfigUsecases = 0b000010000
|
||||
FLAG_IMAGE BackendConfigUsecases = 0b000100000
|
||||
FLAG_TRANSCRIPT BackendConfigUsecases = 0b001000000
|
||||
FLAG_TTS BackendConfigUsecases = 0b010000000
|
||||
FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
|
||||
|
||||
// Common Subsets
|
||||
FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT // union of the three LLM flags
|
||||
)
|
||||
|
||||
func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
|
||||
return map[string]BackendConfigUsecases{
|
||||
"FLAG_ANY": FLAG_ANY,
|
||||
"FLAG_CHAT": FLAG_CHAT,
|
||||
"FLAG_COMPLETION": FLAG_COMPLETION,
|
||||
"FLAG_EDIT": FLAG_EDIT,
|
||||
"FLAG_EMBEDDINGS": FLAG_EMBEDDINGS,
|
||||
"FLAG_RERANK": FLAG_RERANK,
|
||||
"FLAG_IMAGE": FLAG_IMAGE,
|
||||
"FLAG_TRANSCRIPT": FLAG_TRANSCRIPT,
|
||||
"FLAG_TTS": FLAG_TTS,
|
||||
"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
|
||||
"FLAG_LLM": FLAG_LLM,
|
||||
}
|
||||
}
|
||||
|
||||
func GetUsecasesFromYAML(input []string) *BackendConfigUsecases {
|
||||
if len(input) == 0 {
|
||||
return nil
|
||||
}
|
||||
result := FLAG_ANY
|
||||
flags := GetAllBackendConfigUsecases()
|
||||
for _, str := range input {
|
||||
flag, exists := flags["FLAG_"+strings.ToUpper(str)]
|
||||
if exists {
|
||||
result |= flag
|
||||
}
|
||||
}
|
||||
return &result
|
||||
}
|
||||
|
||||
// HasUsecases examines a BackendConfig and determines which endpoints have a chance of success.
|
||||
func (c *BackendConfig) HasUsecases(u BackendConfigUsecases) bool {
|
||||
if (c.KnownUsecases != nil) && ((u & *c.KnownUsecases) == u) {
|
||||
return true
|
||||
}
|
||||
return c.GuessUsecases(u)
|
||||
}
|
||||
|
||||
// GuessUsecases is a **heuristic-based** function: the backend in question may not be loaded yet, and the config may not record what it is useful for.
// Ideally this function should check properties of the config (such as templates) rather than relying on the direct backend-name checks in the lower half.
// That would avoid the maintenance burden of updating this list for each new backend, but for some services a name check is currently the best option available.
func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
|
||||
if (u & FLAG_CHAT) == FLAG_CHAT {
|
||||
if c.TemplateConfig.Chat == "" && c.TemplateConfig.ChatMessage == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_COMPLETION) == FLAG_COMPLETION {
|
||||
if c.TemplateConfig.Completion == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_EDIT) == FLAG_EDIT {
|
||||
if c.TemplateConfig.Edit == "" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_EMBEDDINGS) == FLAG_EMBEDDINGS {
|
||||
if c.Embeddings == nil || !*c.Embeddings {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_IMAGE) == FLAG_IMAGE {
|
||||
imageBackends := []string{"diffusers", "tinydream", "stablediffusion"}
|
||||
if !slices.Contains(imageBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
|
||||
if c.Backend == "diffusers" && c.Diffusers.PipelineType == "" {
|
||||
return false
|
||||
}
|
||||
|
||||
}
|
||||
if (u & FLAG_RERANK) == FLAG_RERANK {
|
||||
if c.Backend != "rerankers" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_TRANSCRIPT) == FLAG_TRANSCRIPT {
|
||||
if c.Backend != "whisper" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
if (u & FLAG_TTS) == FLAG_TTS {
|
||||
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
|
||||
if !slices.Contains(ttsBackends, c.Backend) {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
if (u & FLAG_SOUND_GENERATION) == FLAG_SOUND_GENERATION {
|
||||
if c.Backend != "transformers-musicgen" {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
}
|
||||
|
||||
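Putting the pieces above together: HasUsecases first consults the explicit known_usecases bitmask and only falls back to the GuessUsecases heuristics when nothing was declared. A small illustrative sketch (the config values are hypothetical and mirror the unit tests further down in this diff):

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	// Hypothetical config: a piper voice that also ships a chat template.
	cfg := config.BackendConfig{
		Name:           "piper-voice",
		Backend:        "piper",
		TemplateConfig: config.TemplateConfig{Chat: "chat"},
	}

	fmt.Println(cfg.HasUsecases(config.FLAG_TTS))   // true: guessed from the "piper" backend
	fmt.Println(cfg.HasUsecases(config.FLAG_CHAT))  // true: guessed from the chat template
	fmt.Println(cfg.HasUsecases(config.FLAG_IMAGE)) // false: not an image-capable backend

	// An explicit known_usecases declaration short-circuits the guessing.
	known := config.FLAG_TTS | config.FLAG_CHAT
	cfg.KnownUsecases = &known
	fmt.Println(cfg.HasUsecases(config.FLAG_TTS | config.FLAG_CHAT)) // true
}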
core/config/backend_config_filter.go (new file)
@@ -0,0 +1,35 @@
package config
|
||||
|
||||
import "regexp"
|
||||
|
||||
type BackendConfigFilterFn func(string, *BackendConfig) bool
|
||||
|
||||
func NoFilterFn(_ string, _ *BackendConfig) bool { return true }
|
||||
|
||||
func BuildNameFilterFn(filter string) (BackendConfigFilterFn, error) {
|
||||
if filter == "" {
|
||||
return NoFilterFn, nil
|
||||
}
|
||||
rxp, err := regexp.Compile(filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return func(name string, config *BackendConfig) bool {
|
||||
if config != nil {
|
||||
return rxp.MatchString(config.Name)
|
||||
}
|
||||
return rxp.MatchString(name)
|
||||
}, nil
|
||||
}
|
||||
|
||||
func BuildUsecaseFilterFn(usecases BackendConfigUsecases) BackendConfigFilterFn {
|
||||
if usecases == FLAG_ANY {
|
||||
return NoFilterFn
|
||||
}
|
||||
return func(name string, config *BackendConfig) bool {
|
||||
if config == nil {
|
||||
return false // TODO: Potentially make this a param, for now, no known usecase to include
|
||||
}
|
||||
return config.HasUsecases(usecases)
|
||||
}
|
||||
}
|
||||
@@ -201,6 +201,26 @@ func (bcl *BackendConfigLoader) GetAllBackendConfigs() []BackendConfig {
|
||||
return res
|
||||
}
|
||||
|
||||
func (bcl *BackendConfigLoader) GetBackendConfigsByFilter(filter BackendConfigFilterFn) []BackendConfig {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
var res []BackendConfig
|
||||
|
||||
if filter == nil {
|
||||
filter = NoFilterFn
|
||||
}
|
||||
|
||||
for n, v := range bcl.configs {
|
||||
if filter(n, &v) {
|
||||
res = append(res, v)
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: I don't think this one needs to Sort on name... but we'll see what breaks.
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
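The filter type and GetBackendConfigsByFilter above give callers one way to slice the loaded configurations by name or by capability. A hedged sketch of wiring them together (the models path is hypothetical, and in a running server the configurations are already loaded during startup):

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	bcl := config.NewBackendConfigLoader("/models") // hypothetical models path

	// Capability filter: only configs that can serve both chat and completion.
	llm := config.BuildUsecaseFilterFn(config.FLAG_CHAT | config.FLAG_COMPLETION)
	for _, c := range bcl.GetBackendConfigsByFilter(llm) {
		fmt.Println("LLM-capable:", c.Name)
	}

	// Name filter: a regular expression matched against the config name.
	byName, err := config.BuildNameFilterFn("^llama-3")
	if err != nil {
		panic(err)
	}
	for _, c := range bcl.GetBackendConfigsByFilter(byName) {
		fmt.Println("matched:", c.Name)
	}
}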
func (bcl *BackendConfigLoader) RemoveBackendConfig(m string) {
|
||||
bcl.Lock()
|
||||
defer bcl.Unlock()
|
||||
|
||||
@@ -19,12 +19,17 @@ var _ = Describe("Test cases for config related functions", func() {
|
||||
`backend: "../foo-bar"
|
||||
name: "foo"
|
||||
parameters:
|
||||
model: "foo-bar"`)
|
||||
model: "foo-bar"
|
||||
known_usecases:
|
||||
- chat
|
||||
- COMPLETION
|
||||
`)
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
config, err := readBackendConfigFromFile(tmp.Name())
|
||||
Expect(err).To(BeNil())
|
||||
Expect(config).ToNot(BeNil())
|
||||
Expect(config.Validate()).To(BeFalse())
|
||||
Expect(config.KnownUsecases).ToNot(BeNil())
|
||||
})
|
||||
It("Test Validate", func() {
|
||||
tmp, err := os.CreateTemp("", "config.yaml")
|
||||
@@ -61,4 +66,99 @@ parameters:
|
||||
Expect(config.Validate()).To(BeTrue())
|
||||
})
|
||||
})
|
||||
It("Properly handles backend usecase matching", func() {
|
||||
|
||||
a := BackendConfig{
|
||||
Name: "a",
|
||||
}
|
||||
Expect(a.HasUsecases(FLAG_ANY)).To(BeTrue()) // FLAG_ANY just means the config _exists_ essentially.
|
||||
|
||||
b := BackendConfig{
|
||||
Name: "b",
|
||||
Backend: "stablediffusion",
|
||||
}
|
||||
Expect(b.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(b.HasUsecases(FLAG_IMAGE)).To(BeTrue())
|
||||
Expect(b.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
|
||||
c := BackendConfig{
|
||||
Name: "c",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "chat",
|
||||
},
|
||||
}
|
||||
Expect(c.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(c.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(c.HasUsecases(FLAG_COMPLETION)).To(BeFalse())
|
||||
Expect(c.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
d := BackendConfig{
|
||||
Name: "d",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Chat: "chat",
|
||||
Completion: "completion",
|
||||
},
|
||||
}
|
||||
Expect(d.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(d.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(d.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(d.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
trueValue := true
|
||||
e := BackendConfig{
|
||||
Name: "e",
|
||||
Backend: "llama-cpp",
|
||||
TemplateConfig: TemplateConfig{
|
||||
Completion: "completion",
|
||||
},
|
||||
Embeddings: &trueValue,
|
||||
}
|
||||
|
||||
Expect(e.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(e.HasUsecases(FLAG_IMAGE)).To(BeFalse())
|
||||
Expect(e.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(e.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
Expect(e.HasUsecases(FLAG_EMBEDDINGS)).To(BeTrue())
|
||||
|
||||
f := BackendConfig{
|
||||
Name: "f",
|
||||
Backend: "piper",
|
||||
}
|
||||
Expect(f.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(f.HasUsecases(FLAG_TTS)).To(BeTrue())
|
||||
Expect(f.HasUsecases(FLAG_CHAT)).To(BeFalse())
|
||||
|
||||
g := BackendConfig{
|
||||
Name: "g",
|
||||
Backend: "whisper",
|
||||
}
|
||||
Expect(g.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(g.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
|
||||
Expect(g.HasUsecases(FLAG_TTS)).To(BeFalse())
|
||||
|
||||
h := BackendConfig{
|
||||
Name: "h",
|
||||
Backend: "transformers-musicgen",
|
||||
}
|
||||
Expect(h.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(h.HasUsecases(FLAG_TRANSCRIPT)).To(BeFalse())
|
||||
Expect(h.HasUsecases(FLAG_TTS)).To(BeTrue())
|
||||
Expect(h.HasUsecases(FLAG_SOUND_GENERATION)).To(BeTrue())
|
||||
|
||||
knownUsecases := FLAG_CHAT | FLAG_COMPLETION
|
||||
i := BackendConfig{
|
||||
Name: "i",
|
||||
Backend: "whisper",
|
||||
// Earlier test checks parsing, this just needs to set final values
|
||||
KnownUsecases: &knownUsecases,
|
||||
}
|
||||
Expect(i.HasUsecases(FLAG_ANY)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_TRANSCRIPT)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_TTS)).To(BeFalse())
|
||||
Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
|
||||
Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
|
||||
|
||||
})
|
||||
})
|
||||
|
||||
@@ -12,6 +12,7 @@ import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
. "github.com/mudler/LocalAI/core/http"
|
||||
@@ -950,7 +951,7 @@ var _ = Describe("API test", func() {
|
||||
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
Expect(len(resp.Choices) > 0).To(BeTrue())
|
||||
Expect(resp.Choices[0].Message.Content).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
|
||||
|
||||
stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
|
||||
Expect(err).ToNot(HaveOccurred())
|
||||
@@ -969,7 +970,7 @@ var _ = Describe("API test", func() {
|
||||
tokens++
|
||||
}
|
||||
Expect(text).ToNot(BeEmpty())
|
||||
Expect(text).To(Or(ContainSubstring("Sure"), ContainSubstring("five")))
|
||||
Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
|
||||
|
||||
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
|
||||
})
|
||||
|
||||
@@ -19,14 +19,16 @@ func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *mo
|
||||
if ctx.Params("model") != "" {
|
||||
modelInput = ctx.Params("model")
|
||||
}
|
||||
|
||||
if ctx.Query("model") != "" {
|
||||
modelInput = ctx.Query("model")
|
||||
}
|
||||
// Set model from bearer token, if available
|
||||
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bearer ")
|
||||
bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // Reduced duplicate characters of Bearer
|
||||
bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
|
||||
|
||||
// If no model was specified, take the first available
|
||||
if modelInput == "" && !bearerExists && firstModel {
|
||||
models, _ := services.ListModels(cl, loader, "", true)
|
||||
models, _ := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
if len(models) > 0 {
|
||||
modelInput = models[0]
|
||||
log.Debug().Msgf("No model specified, using: %s", modelInput)
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
|
||||
"github.com/chasefleming/elem-go"
|
||||
"github.com/chasefleming/elem-go/attrs"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/p2p"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
@@ -41,7 +42,7 @@ func DoneProgress(galleryID, text string, showDelete bool) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
),
|
||||
elem.If(showDelete, deleteButton(galleryID, modelName), reInstallButton(galleryID)),
|
||||
).Render()
|
||||
@@ -57,7 +58,7 @@ func ErrorProgress(err, galleryName string) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text("Error "+err),
|
||||
elem.Text("Error "+bluemonday.StrictPolicy().Sanitize(err)),
|
||||
),
|
||||
installButton(galleryName),
|
||||
).Render()
|
||||
@@ -170,7 +171,7 @@ func P2PNodeBoxes(nodes []p2p.NodeData) string {
|
||||
attrs.Props{
|
||||
"class": "text-gray-200 font-semibold ml-2 mr-1",
|
||||
},
|
||||
elem.Text(n.ID),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(n.ID)),
|
||||
),
|
||||
elem.Text("Status: "),
|
||||
elem.If(
|
||||
@@ -227,7 +228,7 @@ func StartProgressBar(uid, progress, text string) string {
|
||||
"tabindex": "-1",
|
||||
"autofocus": "",
|
||||
},
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)), //Perhaps overly defensive
|
||||
elem.Div(attrs.Props{
|
||||
"hx-get": "/browse/job/progress/" + uid,
|
||||
"hx-trigger": "every 600ms",
|
||||
@@ -249,9 +250,7 @@ func cardSpan(text, icon string) elem.Node {
|
||||
"class": icon + " pr-2",
|
||||
}),
|
||||
|
||||
elem.Text(text),
|
||||
|
||||
//elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -285,11 +284,9 @@ func searchableElement(text, icon string) elem.Node {
|
||||
elem.I(attrs.Props{
|
||||
"class": icon + " pr-2",
|
||||
}),
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
),
|
||||
),
|
||||
|
||||
//elem.Text(text),
|
||||
)
|
||||
}
|
||||
|
||||
@@ -303,7 +300,7 @@ func link(text, url string) elem.Node {
|
||||
elem.I(attrs.Props{
|
||||
"class": "fas fa-link pr-2",
|
||||
}),
|
||||
elem.Text(text),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(text)),
|
||||
)
|
||||
}
|
||||
func installButton(galleryName string) elem.Node {
|
||||
@@ -387,13 +384,13 @@ func ListModels(models []*gallery.GalleryModel, processTracker ProcessTracker, g
|
||||
attrs.Props{
|
||||
"class": "mb-2 text-xl font-bold leading-tight",
|
||||
},
|
||||
elem.Text(m.Name),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Name)),
|
||||
),
|
||||
elem.P(
|
||||
attrs.Props{
|
||||
"class": "mb-4 text-sm [&:not(:hover)]:truncate text-base",
|
||||
},
|
||||
elem.Text(m.Description),
|
||||
elem.Text(bluemonday.StrictPolicy().Sanitize(m.Description)),
|
||||
),
|
||||
)
|
||||
}
|
||||
|
||||
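The helpers above now run every user- or gallery-supplied string through bluemonday's StrictPolicy before it is rendered into HTML. A quick sketch of what that policy does to an illustrative input:

package main

import (
	"fmt"

	"github.com/microcosm-cc/bluemonday"
)

func main() {
	p := bluemonday.StrictPolicy() // strips all HTML elements and attributes
	in := `Nice model <script>alert("xss")</script><b>bold</b>`
	// Prints roughly: Nice model bold (tags removed, script bodies skipped)
	fmt.Println(p.Sanitize(in))
}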
@@ -55,7 +55,7 @@ func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
}
|
||||
|
||||
// TODO: Support uploading files?
|
||||
filePath, _, err := backend.SoundGeneration(cfg.Backend, modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
|
||||
filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@@ -45,13 +45,13 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
if input.Backend != "" {
|
||||
@@ -64,7 +64,7 @@ func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
Documents: req.Documents,
|
||||
}
|
||||
|
||||
results, err := backend.Rerank(cfg.Backend, modelFile, request, ml, appConfig, *cfg)
|
||||
results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
core/http/endpoints/localai/get_token_metrics.go (new file)
@@ -0,0 +1,60 @@
package localai
|
||||
|
||||
import (
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/rs/zerolog/log"
|
||||
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
// TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
|
||||
//
|
||||
// @Summary Get TokenMetrics for Active Slot.
|
||||
// @Accept json
|
||||
// @Produce audio/x-wav
|
||||
// @Success 200 {string} binary "generated audio/wav file"
|
||||
// @Router /v1/tokenMetrics [get]
|
||||
// @Router /tokenMetrics [get]
|
||||
func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
input := new(schema.TokenMetricsRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Err(err)
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
log.Debug().Msgf("Token Metrics for model: %s", modelFile)
|
||||
|
||||
response, err := backend.TokenMetrics(modelFile, ml, appConfig, *cfg)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return c.JSON(response)
|
||||
}
|
||||
}
|
||||
@@ -17,12 +17,14 @@ func SystemInformations(ml *model.ModelLoader, appConfig *config.ApplicationConf
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
loadedModels := ml.ListModels()
|
||||
for b := range appConfig.ExternalGRPCBackends {
|
||||
availableBackends = append(availableBackends, b)
|
||||
}
|
||||
return c.JSON(
|
||||
schema.SystemInformationResponse{
|
||||
Backends: availableBackends,
|
||||
Models: loadedModels,
|
||||
},
|
||||
)
|
||||
}
|
||||
|
||||
core/http/endpoints/localai/tokenize.go (new file)
@@ -0,0 +1,58 @@
package localai
|
||||
|
||||
import (
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/mudler/LocalAI/core/backend"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
// TokenizeEndpoint exposes a REST API to tokenize the content
|
||||
// @Summary Tokenize the input.
|
||||
// @Success 200 {object} schema.TokenizeResponse "Response"
|
||||
// @Router /v1/tokenize [post]
|
||||
func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
|
||||
input := new(schema.TokenizeRequest)
|
||||
|
||||
// Get input data from the request body
|
||||
if err := c.BodyParser(input); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
|
||||
if err != nil {
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
}
|
||||
|
||||
cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
|
||||
config.LoadOptionDebug(appConfig.Debug),
|
||||
config.LoadOptionThreads(appConfig.Threads),
|
||||
config.LoadOptionContextSize(appConfig.ContextSize),
|
||||
config.LoadOptionF16(appConfig.F16),
|
||||
)
|
||||
|
||||
if err != nil {
|
||||
log.Err(err)
|
||||
modelFile = input.Model
|
||||
log.Warn().Msgf("Model not found in context: %s", input.Model)
|
||||
} else {
|
||||
modelFile = cfg.Model
|
||||
}
|
||||
log.Debug().Msgf("Request for model: %s", modelFile)
|
||||
|
||||
tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
c.JSON(tokenResponse)
|
||||
return nil
|
||||
|
||||
}
|
||||
}
|
||||
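The tokenize endpoint above accepts the TokenizeRequest schema introduced later in this diff and returns the backend's token IDs. A minimal client sketch, assuming a LocalAI instance on localhost:8080 (the usual default port) and a hypothetical model name:

package main

import (
	"bytes"
	"encoding/json"
	"fmt"
	"net/http"
)

func main() {
	body, _ := json.Marshal(map[string]string{
		"model":   "llama-3.2-1b-instruct:q4_k_m", // hypothetical: any configured model
		"content": "Hello, LocalAI!",
	})

	resp, err := http.Post("http://localhost:8080/v1/tokenize", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()

	var out struct {
		Tokens []int32 `json:"tokens"`
	}
	if err := json.NewDecoder(resp.Body).Decode(&out); err != nil {
		panic(err)
	}
	fmt.Println("token count:", len(out.Tokens))
}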
@@ -13,15 +13,10 @@ import (
|
||||
func WelcomeEndpoint(appConfig *config.ApplicationConfig,
|
||||
cl *config.BackendConfigLoader, ml *model.ModelLoader, modelStatus func() (map[string]string, map[string]string)) func(*fiber.Ctx) error {
|
||||
return func(c *fiber.Ctx) error {
|
||||
models, _ := services.ListModels(cl, ml, "", true)
|
||||
backendConfigs := cl.GetAllBackendConfigs()
|
||||
|
||||
galleryConfigs := map[string]*gallery.Config{}
|
||||
modelsWithBackendConfig := map[string]interface{}{}
|
||||
|
||||
for _, m := range backendConfigs {
|
||||
modelsWithBackendConfig[m.Name] = nil
|
||||
|
||||
cfg, err := gallery.GetLocalModelConfiguration(ml.ModelPath, m.Name)
|
||||
if err != nil {
|
||||
continue
|
||||
@@ -29,17 +24,11 @@ func WelcomeEndpoint(appConfig *config.ApplicationConfig,
|
||||
galleryConfigs[m.Name] = cfg
|
||||
}
|
||||
|
||||
modelsWithoutConfig, _ := services.ListModels(cl, ml, config.NoFilterFn, services.LOOSE_ONLY)
|
||||
|
||||
// Get model statuses to display in the UI the operation in progress
|
||||
processingModels, taskTypes := modelStatus()
|
||||
|
||||
modelsWithoutConfig := []string{}
|
||||
|
||||
for _, m := range models {
|
||||
if _, ok := modelsWithBackendConfig[m]; !ok {
|
||||
modelsWithoutConfig = append(modelsWithoutConfig, m)
|
||||
}
|
||||
}
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI API - " + internal.PrintableVersion(),
|
||||
"Version": internal.PrintableVersion(),
|
||||
|
||||
@@ -10,6 +10,7 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/core/services"
|
||||
@@ -83,7 +84,7 @@ func CreateAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
|
||||
if !modelExists(cl, ml, request.Model) {
|
||||
log.Warn().Msgf("Model: %s was not found in list of models.", request.Model)
|
||||
return c.Status(fiber.StatusBadRequest).SendString("Model " + request.Model + " not found")
|
||||
return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Model %q not found", request.Model)))
|
||||
}
|
||||
|
||||
if request.Tools == nil {
|
||||
@@ -147,7 +148,7 @@ func ListAssistantsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoade
|
||||
// Convert string limit to integer
|
||||
limit, err := strconv.Atoi(limitQuery)
|
||||
if err != nil {
|
||||
return c.Status(http.StatusBadRequest).SendString(fmt.Sprintf("Invalid limit query value: %s", limitQuery))
|
||||
return c.Status(http.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Invalid limit query value: %s", limitQuery)))
|
||||
}
|
||||
|
||||
// Sort assistants
|
||||
@@ -225,7 +226,7 @@ func filterAssistantsAfterID(assistants []Assistant, id string) []Assistant {
|
||||
|
||||
func modelExists(cl *config.BackendConfigLoader, ml *model.ModelLoader, modelName string) (found bool) {
|
||||
found = false
|
||||
models, err := services.ListModels(cl, ml, "", true)
|
||||
models, err := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -288,7 +289,7 @@ func GetAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader,
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -337,11 +338,11 @@ func CreateAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.Model
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find file_id: %s", request.FileID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find file_id: %s", request.FileID)))
|
||||
}
|
||||
}
|
||||
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find %q", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find %q", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -442,7 +443,7 @@ func ModifyAssistantEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoad
|
||||
return c.Status(fiber.StatusOK).JSON(newAssistant)
|
||||
}
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant with id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant with id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -513,9 +514,9 @@ func GetAssistantFileEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoa
|
||||
if assistantFile.ID == fileId {
|
||||
return c.Status(fiber.StatusOK).JSON(assistantFile)
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with file_id: %s", fileId)))
|
||||
}
|
||||
}
|
||||
return c.Status(fiber.StatusNotFound).SendString(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID))
|
||||
return c.Status(fiber.StatusNotFound).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to find assistant file with assistant_id: %s", assistantID)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -161,6 +161,12 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
||||
textContentToReturn = ""
|
||||
id = uuid.New().String()
|
||||
created = int(time.Now().Unix())
|
||||
// Set CorrelationID
|
||||
correlationID := c.Get("X-Correlation-ID")
|
||||
if len(strings.TrimSpace(correlationID)) == 0 {
|
||||
correlationID = id
|
||||
}
|
||||
c.Set("X-Correlation-ID", correlationID)
|
||||
|
||||
modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
|
||||
if err != nil {
|
||||
@@ -444,6 +450,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
|
||||
c.Set("Cache-Control", "no-cache")
|
||||
c.Set("Connection", "keep-alive")
|
||||
c.Set("Transfer-Encoding", "chunked")
|
||||
c.Set("X-Correlation-ID", id)
|
||||
|
||||
responses := make(chan schema.OpenAIResponse)
|
||||
|
||||
|
||||
@@ -57,6 +57,8 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, a
|
||||
}
|
||||
|
||||
return func(c *fiber.Ctx) error {
|
||||
// Add Correlation
|
||||
c.Set("X-Correlation-ID", id)
|
||||
modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed reading parameters from request:%w", err)
|
||||
|
||||
@@ -8,6 +8,7 @@ import (
|
||||
"sync/atomic"
|
||||
"time"
|
||||
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
|
||||
@@ -49,7 +50,7 @@ func UploadFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
|
||||
|
||||
err = c.SaveFile(file, savePath)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + err.Error())
|
||||
return c.Status(fiber.StatusInternalServerError).SendString("Failed to save file: " + bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
f := schema.File{
|
||||
@@ -121,7 +122,7 @@ func GetFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Applicat
|
||||
return func(c *fiber.Ctx) error {
|
||||
file, err := getFileFromRequest(c)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
return c.JSON(file)
|
||||
@@ -143,14 +144,14 @@ func DeleteFilesEndpoint(cm *config.BackendConfigLoader, appConfig *config.Appli
|
||||
return func(c *fiber.Ctx) error {
|
||||
file, err := getFileFromRequest(c)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
err = os.Remove(filepath.Join(appConfig.UploadDir, file.Filename))
|
||||
if err != nil {
|
||||
// If the file doesn't exist then we should just continue to remove it
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err))
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(fmt.Sprintf("Unable to delete file: %s, %v", file.Filename, err)))
|
||||
}
|
||||
}
|
||||
|
||||
@@ -180,12 +181,12 @@ func GetFilesContentsEndpoint(cm *config.BackendConfigLoader, appConfig *config.
|
||||
return func(c *fiber.Ctx) error {
|
||||
file, err := getFileFromRequest(c)
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
fileContents, err := os.ReadFile(filepath.Join(appConfig.UploadDir, file.Filename))
|
||||
if err != nil {
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(err.Error())
|
||||
return c.Status(fiber.StatusInternalServerError).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
return c.Send(fileContents)
|
||||
|
||||
@@ -18,32 +18,32 @@ func ListModelsEndpoint(bcl *config.BackendConfigLoader, ml *model.ModelLoader)
|
||||
filter := c.Query("filter")
|
||||
|
||||
// By default, exclude any loose files that are already referenced by a configuration file.
|
||||
excludeConfigured := c.QueryBool("excludeConfigured", true)
|
||||
var policy services.LooseFilePolicy
|
||||
if c.QueryBool("excludeConfigured", true) {
|
||||
policy = services.SKIP_IF_CONFIGURED
|
||||
} else {
|
||||
policy = services.ALWAYS_INCLUDE // This replicates current behavior. TODO: give more options to the user?
|
||||
}
|
||||
|
||||
dataModels, err := modelList(bcl, ml, filter, excludeConfigured)
|
||||
filterFn, err := config.BuildNameFilterFn(filter)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
modelNames, err := services.ListModels(bcl, ml, filterFn, policy)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Map from a slice of names to a slice of OpenAIModel response objects
|
||||
dataModels := []schema.OpenAIModel{}
|
||||
for _, m := range modelNames {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
|
||||
return c.JSON(schema.ModelsDataResponse{
|
||||
Object: "list",
|
||||
Data: dataModels,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func modelList(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]schema.OpenAIModel, error) {
|
||||
|
||||
models, err := services.ListModels(bcl, ml, filter, excludeConfigured)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
dataModels := []schema.OpenAIModel{}
|
||||
|
||||
// Then iterate through the loose files:
|
||||
for _, m := range models {
|
||||
dataModels = append(dataModels, schema.OpenAIModel{ID: m, Object: "model"})
|
||||
}
|
||||
|
||||
return dataModels, nil
|
||||
}
|
||||
|
||||
@@ -6,15 +6,22 @@ import (
|
||||
"fmt"
|
||||
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/google/uuid"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
fiberContext "github.com/mudler/LocalAI/core/http/ctx"
|
||||
"github.com/mudler/LocalAI/core/schema"
|
||||
"github.com/mudler/LocalAI/pkg/functions"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
"github.com/mudler/LocalAI/pkg/templates"
|
||||
"github.com/mudler/LocalAI/pkg/utils"
|
||||
"github.com/rs/zerolog/log"
|
||||
)
|
||||
|
||||
type correlationIDKeyType string
|
||||
|
||||
// CorrelationIDKey to track request across process boundary
|
||||
const CorrelationIDKey correlationIDKeyType = "correlationID"
|
||||
|
||||
func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
|
||||
input := new(schema.OpenAIRequest)
|
||||
|
||||
@@ -24,9 +31,14 @@ func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLo
|
||||
}
|
||||
|
||||
received, _ := json.Marshal(input)
|
||||
// Extract or generate the correlation ID
|
||||
correlationID := c.Get("X-Correlation-ID", uuid.New().String())
|
||||
|
||||
ctx, cancel := context.WithCancel(o.Context)
|
||||
input.Context = ctx
|
||||
// Add the correlation ID to the new context
|
||||
ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID)
|
||||
|
||||
input.Context = ctxWithCorrelationID
|
||||
input.Cancel = cancel
|
||||
|
||||
log.Debug().Msgf("Request received: %s", string(received))
|
||||
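readRequest now extracts (or generates) an X-Correlation-ID and threads it through the request context under CorrelationIDKey, so downstream code can attach it to logs and responses. A hedged sketch of a helper that could sit alongside readRequest in this package (the helper itself is hypothetical; context and zerolog's log are already imported by this file):

// logWithCorrelation pulls the correlation ID back out of a request context.
func logWithCorrelation(ctx context.Context, msg string) {
	id, ok := ctx.Value(CorrelationIDKey).(string)
	if !ok {
		id = "unknown" // no correlation ID was attached to this context
	}
	log.Debug().Str("correlation_id", id).Msg(msg)
}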
@@ -157,8 +169,13 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
||||
continue CONTENT
|
||||
}
|
||||
input.Messages[i].StringVideos = append(input.Messages[i].StringVideos, base64) // TODO: make sure that we only return base64 stuff
|
||||
|
||||
t := "[vid-{{.ID}}]{{.Text}}"
|
||||
if config.TemplateConfig.Video != "" {
|
||||
t = config.TemplateConfig.Video
|
||||
}
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[vid-%d]", vidIndex) + input.Messages[i].StringContent
|
||||
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, vidIndex, input.Messages[i].StringContent)
|
||||
vidIndex++
|
||||
case "audio_url", "audio":
|
||||
// Decode content as base64 either if it's an URL or base64 text
|
||||
@@ -169,7 +186,11 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
||||
}
|
||||
input.Messages[i].StringAudios = append(input.Messages[i].StringAudios, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[audio-%d]", audioIndex) + input.Messages[i].StringContent
|
||||
t := "[audio-{{.ID}}]{{.Text}}"
|
||||
if config.TemplateConfig.Audio != "" {
|
||||
t = config.TemplateConfig.Audio
|
||||
}
|
||||
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, audioIndex, input.Messages[i].StringContent)
|
||||
audioIndex++
|
||||
case "image_url", "image":
|
||||
// Decode content as base64 either if it's an URL or base64 text
|
||||
@@ -178,9 +199,14 @@ func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIReque
|
||||
log.Error().Msgf("Failed encoding image: %s", err)
|
||||
continue CONTENT
|
||||
}
|
||||
|
||||
t := "[img-{{.ID}}]{{.Text}}"
|
||||
if config.TemplateConfig.Image != "" {
|
||||
t = config.TemplateConfig.Image
|
||||
}
|
||||
input.Messages[i].StringImages = append(input.Messages[i].StringImages, base64) // TODO: make sure that we only return base64 stuff
|
||||
// set a placeholder for each image
|
||||
input.Messages[i].StringContent = fmt.Sprintf("[img-%d]", imgIndex) + input.Messages[i].StringContent
|
||||
input.Messages[i].StringContent, _ = templates.TemplateMultiModal(t, imgIndex, input.Messages[i].StringContent)
|
||||
imgIndex++
|
||||
}
|
||||
}
|
||||
|
||||
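Each multimodal placeholder is now rendered through a small template receiving {{.ID}} and {{.Text}} instead of a hard-coded fmt.Sprintf, so models can override the [img-N]/[vid-N]/[audio-N] convention via template.image, template.video and template.audio in their config. A standalone approximation of that expansion; this is a sketch, not LocalAI's actual templates.TemplateMultiModal implementation:

package main

import (
	"bytes"
	"fmt"
	"text/template"
)

// renderMultiModalPlaceholder expands a template such as "[img-{{.ID}}]{{.Text}}"
// with the placeholder index and the message text.
func renderMultiModalPlaceholder(tmpl string, id int, text string) (string, error) {
	t, err := template.New("multimodal").Parse(tmpl)
	if err != nil {
		return "", err
	}
	var buf bytes.Buffer
	err = t.Execute(&buf, struct {
		ID   int
		Text string
	}{ID: id, Text: text})
	return buf.String(), err
}

func main() {
	out, _ := renderMultiModalPlaceholder("[img-{{.ID}}]{{.Text}}", 0, "Describe this picture.")
	fmt.Println(out) // [img-0]Describe this picture.
}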
@@ -7,6 +7,7 @@ import (
|
||||
"github.com/dave-gray101/v2keyauth"
|
||||
"github.com/gofiber/fiber/v2"
|
||||
"github.com/gofiber/fiber/v2/middleware/keyauth"
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
)
|
||||
|
||||
@@ -38,7 +39,7 @@ func getApiKeyErrorHandler(applicationConfig *config.ApplicationConfig) fiber.Er
|
||||
if applicationConfig.OpaqueErrors {
|
||||
return ctx.SendStatus(403)
|
||||
}
|
||||
return ctx.Status(403).SendString(err.Error())
|
||||
return ctx.Status(403).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
if applicationConfig.OpaqueErrors {
|
||||
return ctx.SendStatus(500)
|
||||
|
||||
@@ -63,4 +63,7 @@ func RegisterLocalAIRoutes(app *fiber.App,
|
||||
|
||||
app.Get("/system", localai.SystemInformations(ml, appConfig))
|
||||
|
||||
// misc
|
||||
app.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig))
|
||||
|
||||
}
|
||||
|
||||
@@ -6,6 +6,7 @@ import (
|
||||
"sort"
|
||||
"strings"
|
||||
|
||||
"github.com/microcosm-cc/bluemonday"
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/core/gallery"
|
||||
"github.com/mudler/LocalAI/core/http/elements"
|
||||
@@ -171,7 +172,7 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
Search string `form:"search"`
|
||||
}{}
|
||||
if err := c.BodyParser(&form); err != nil {
|
||||
return c.Status(fiber.StatusBadRequest).SendString(err.Error())
|
||||
return c.Status(fiber.StatusBadRequest).SendString(bluemonday.StrictPolicy().Sanitize(err.Error()))
|
||||
}
|
||||
|
||||
models, _ := gallery.AvailableGalleryModels(appConfig.Galleries, appConfig.ModelPath)
|
||||
@@ -303,7 +304,7 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
|
||||
// Show the Chat page
|
||||
app.Get("/chat/:model", func(c *fiber.Ctx) error {
|
||||
backendConfigs, _ := services.ListModels(cl, ml, "", true)
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
summary := fiber.Map{
|
||||
"Title": "LocalAI - Chat with " + c.Params("model"),
|
||||
@@ -318,7 +319,7 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
})
|
||||
|
||||
app.Get("/talk/", func(c *fiber.Ctx) error {
|
||||
backendConfigs, _ := services.ListModels(cl, ml, "", true)
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
if len(backendConfigs) == 0 {
|
||||
// If no model is available redirect to the index which suggests how to install models
|
||||
@@ -339,7 +340,7 @@ func RegisterUIRoutes(app *fiber.App,
|
||||
|
||||
app.Get("/chat/", func(c *fiber.Ctx) error {
|
||||
|
||||
backendConfigs, _ := services.ListModels(cl, ml, "", true)
|
||||
backendConfigs, _ := services.ListModels(cl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
|
||||
|
||||
if len(backendConfigs) == 0 {
|
||||
// If no model is available redirect to the index which suggests how to install models
|
||||
|
||||
@@ -7,6 +7,7 @@ import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"net"
|
||||
|
||||
"github.com/mudler/edgevpn/pkg/node"
|
||||
@@ -41,7 +42,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
|
||||
log.Error().Err(err).Msg("Error listening")
|
||||
return err
|
||||
}
|
||||
// ll.Info("Binding local port on", srcaddr)
|
||||
|
||||
go func() {
|
||||
<-ctx.Done()
|
||||
l.Close()
|
||||
@@ -82,6 +83,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
|
||||
|
||||
if workerID == "" {
|
||||
log.Error().Msg("No available nodes yet")
|
||||
fs.sendHTMLResponse(conn, 503, "Sorry, waiting for nodes to connect")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -89,6 +91,7 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
|
||||
nodeData, exists := GetNode(fs.service, workerID)
|
||||
if !exists {
|
||||
log.Error().Msgf("Node %s not found", workerID)
|
||||
fs.sendHTMLResponse(conn, 404, "Node not found")
|
||||
return
|
||||
}
|
||||
|
||||
@@ -100,3 +103,42 @@ func (fs *FederatedServer) proxy(ctx context.Context, node *node.Node) error {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// sendHTMLResponse sends a basic HTML response with a status code and a message.
|
||||
// This is extracted to make the HTML content maintainable.
|
||||
func (fs *FederatedServer) sendHTMLResponse(conn net.Conn, statusCode int, message string) {
|
||||
defer conn.Close()
|
||||
|
||||
// Define the HTML content separately for easier maintenance.
|
||||
htmlContent := fmt.Sprintf("<html><body><h1>%s</h1></body></html>\r\n", message)
|
||||
|
||||
// Create the HTTP response with dynamic status code and content.
|
||||
response := fmt.Sprintf(
|
||||
"HTTP/1.1 %d %s\r\n"+
|
||||
"Content-Type: text/html\r\n"+
|
||||
"Connection: close\r\n"+
|
||||
"\r\n"+
|
||||
"%s",
|
||||
statusCode, getHTTPStatusText(statusCode), htmlContent,
|
||||
)
|
||||
|
||||
// Write the response to the client connection.
|
||||
_, writeErr := io.WriteString(conn, response)
|
||||
if writeErr != nil {
|
||||
log.Error().Err(writeErr).Msg("Error writing response to client")
|
||||
}
|
||||
}
|
||||
|
||||
// getHTTPStatusText returns a textual representation of HTTP status codes.
|
||||
func getHTTPStatusText(statusCode int) string {
|
||||
switch statusCode {
|
||||
case 503:
|
||||
return "Service Unavailable"
|
||||
case 404:
|
||||
return "Not Found"
|
||||
case 200:
|
||||
return "OK"
|
||||
default:
|
||||
return "Unknown Status"
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2,6 +2,7 @@ package schema
|
||||
|
||||
import (
|
||||
"github.com/mudler/LocalAI/core/p2p"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
gopsutil "github.com/shirou/gopsutil/v3/process"
|
||||
)
|
||||
|
||||
@@ -9,6 +10,10 @@ type BackendMonitorRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
}
|
||||
|
||||
type TokenMetricsRequest struct {
|
||||
Model string `json:"model" yaml:"model"`
|
||||
}
|
||||
|
||||
type BackendMonitorResponse struct {
|
||||
MemoryInfo *gopsutil.MemoryInfoStat
|
||||
MemoryPercent float32
|
||||
@@ -72,5 +77,6 @@ type P2PNodesResponse struct {
|
||||
}
|
||||
|
||||
type SystemInformationResponse struct {
|
||||
Backends []string `json:"backends"`
|
||||
Backends []string `json:"backends"`
|
||||
Models []model.Model `json:"loaded_models"`
|
||||
}
|
||||
|
||||
core/schema/tokenize.go (new file)
@@ -0,0 +1,10 @@
package schema
|
||||
|
||||
type TokenizeRequest struct {
|
||||
Content string `json:"content"`
|
||||
Model string `json:"model"`
|
||||
}
|
||||
|
||||
type TokenizeResponse struct {
|
||||
Tokens []int32 `json:"tokens"`
|
||||
}
|
||||
@@ -1,55 +1,49 @@
|
||||
package services
|
||||
|
||||
import (
|
||||
"regexp"
|
||||
|
||||
"github.com/mudler/LocalAI/core/config"
|
||||
"github.com/mudler/LocalAI/pkg/model"
|
||||
)
|
||||
|
||||
func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter string, excludeConfigured bool) ([]string, error) {
|
||||
type LooseFilePolicy int
|
||||
|
||||
models, err := ml.ListFilesInModelPath()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
const (
|
||||
LOOSE_ONLY LooseFilePolicy = iota
|
||||
SKIP_IF_CONFIGURED
|
||||
SKIP_ALWAYS
|
||||
ALWAYS_INCLUDE
|
||||
)
|
||||
|
||||
var mm map[string]interface{} = map[string]interface{}{}
|
||||
func ListModels(bcl *config.BackendConfigLoader, ml *model.ModelLoader, filter config.BackendConfigFilterFn, looseFilePolicy LooseFilePolicy) ([]string, error) {
|
||||
|
||||
var skipMap map[string]interface{} = map[string]interface{}{}
|
||||
|
||||
dataModels := []string{}
|
||||
|
||||
var filterFn func(name string) bool
|
||||
// Start with known configurations
|
||||
|
||||
// If filter is not specified, do not filter the list by model name
|
||||
if filter == "" {
|
||||
filterFn = func(_ string) bool { return true }
|
||||
} else {
|
||||
// If filter _IS_ specified, we compile it to a regex which is used to create the filterFn
|
||||
rxp, err := regexp.Compile(filter)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
for _, c := range bcl.GetBackendConfigsByFilter(filter) {
|
||||
// Is this better than looseFilePolicy <= SKIP_IF_CONFIGURED ? less performant but more readable?
|
||||
if (looseFilePolicy == SKIP_IF_CONFIGURED) || (looseFilePolicy == LOOSE_ONLY) {
|
||||
skipMap[c.Model] = nil
|
||||
}
|
||||
filterFn = func(name string) bool {
|
||||
return rxp.MatchString(name)
|
||||
}
|
||||
}
|
||||
|
||||
// Start with the known configurations
|
||||
for _, c := range bcl.GetAllBackendConfigs() {
|
||||
if excludeConfigured {
|
||||
mm[c.Model] = nil
|
||||
}
|
||||
|
||||
if filterFn(c.Name) {
|
||||
if looseFilePolicy != LOOSE_ONLY {
|
||||
dataModels = append(dataModels, c.Name)
|
||||
}
|
||||
}
|
||||
|
||||
// Then iterate through the loose files:
|
||||
for _, m := range models {
|
||||
// And only adds them if they shouldn't be skipped.
|
||||
if _, exists := mm[m]; !exists && filterFn(m) {
|
||||
dataModels = append(dataModels, m)
|
||||
// Then iterate through the loose files if requested.
|
||||
if looseFilePolicy != SKIP_ALWAYS {
|
||||
|
||||
models, err := ml.ListFilesInModelPath()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
for _, m := range models {
|
||||
// And only adds them if they shouldn't be skipped.
|
||||
if _, exists := skipMap[m]; !exists && filter(m, nil) {
|
||||
dataModels = append(dataModels, m)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
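After this rewrite, every caller of ListModels picks both a config filter and a policy for loose model files found on disk. A sketch of the three call patterns used elsewhere in this diff; bcl and ml stand for the application's existing loaders, and the regex is hypothetical:

package examples

import (
	"fmt"

	"github.com/mudler/LocalAI/core/config"
	"github.com/mudler/LocalAI/core/services"
	"github.com/mudler/LocalAI/pkg/model"
)

func listExamples(bcl *config.BackendConfigLoader, ml *model.ModelLoader) error {
	// 1. Configured models plus loose files not already covered by a config,
	//    as used by the chat/talk UI pages.
	chatModels, err := services.ListModels(bcl, ml, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
	if err != nil {
		return err
	}

	// 2. Only loose files that have no configuration, as used by the welcome page.
	looseOnly, err := services.ListModels(bcl, ml, config.NoFilterFn, services.LOOSE_ONLY)
	if err != nil {
		return err
	}

	// 3. Configured models whose name matches a regex, ignoring loose files.
	filter, err := config.BuildNameFilterFn("^llama-")
	if err != nil {
		return err
	}
	llamas, err := services.ListModels(bcl, ml, filter, services.SKIP_ALWAYS)
	if err != nil {
		return err
	}

	fmt.Println(len(chatModels), len(looseOnly), len(llamas))
	return nil
}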
@@ -160,13 +160,7 @@ func Startup(opts ...config.AppOption) (*config.BackendConfigLoader, *model.Mode
|
||||
|
||||
log.Debug().Msgf("Auto loading model %s into memory from file: %s", m, cfg.Model)
|
||||
|
||||
grpcOpts := backend.GRPCModelOpts(*cfg)
|
||||
o := []model.Option{
|
||||
model.WithModel(cfg.Model),
|
||||
model.WithAssetDir(options.AssetsDestination),
|
||||
model.WithThreads(uint32(options.Threads)),
|
||||
model.WithLoadGRPCLoadModelOpts(grpcOpts),
|
||||
}
|
||||
o := backend.ModelOptions(*cfg, options, []model.Option{})
|
||||
|
||||
var backendErr error
|
||||
if cfg.Backend != "" {
|
||||
|
||||
@@ -1,3 +1,3 @@
|
||||
{
|
||||
"version": "v2.21.0"
|
||||
"version": "v2.21.1"
|
||||
}
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
llama_index==0.11.12
|
||||
llama_index==0.11.16
|
||||
requests==2.32.3
|
||||
weaviate_client==4.8.1
|
||||
transformers
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
langchain==0.3.0
|
||||
openai==1.47.1
|
||||
langchain==0.3.2
|
||||
openai==1.51.1
|
||||
|
||||
@@ -5,7 +5,7 @@ metadata:
|
||||
spec:
|
||||
containers:
|
||||
- name: broken-pod
|
||||
image: nginx:1.27.0
|
||||
image: nginx:1.27.2
|
||||
livenessProbe:
|
||||
httpGet:
|
||||
path: /
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
langchain==0.3.0
|
||||
openai==1.47.1
|
||||
chromadb==0.5.7
|
||||
llama-index==0.11.12
|
||||
langchain==0.3.1
|
||||
openai==1.51.1
|
||||
chromadb==0.5.11
|
||||
llama-index==0.11.16
|
||||
@@ -1,4 +1,4 @@
|
||||
aiohttp==3.10.3
|
||||
aiohttp==3.10.9
|
||||
aiosignal==1.3.1
|
||||
async-timeout==4.0.3
|
||||
attrs==24.2.0
|
||||
@@ -6,19 +6,19 @@ certifi==2024.8.30
|
||||
charset-normalizer==3.3.2
|
||||
colorama==0.4.6
|
||||
dataclasses-json==0.6.7
|
||||
debugpy==1.8.2
|
||||
debugpy==1.8.6
|
||||
frozenlist==1.4.1
|
||||
greenlet==3.1.0
|
||||
greenlet==3.1.1
|
||||
idna==3.10
|
||||
langchain==0.3.0
|
||||
langchain-community==0.2.16
|
||||
langchain==0.3.2
|
||||
langchain-community==0.3.1
|
||||
marshmallow==3.22.0
|
||||
marshmallow-enum==1.5.1
|
||||
multidict==6.0.5
|
||||
multidict==6.1.0
|
||||
mypy-extensions==1.0.0
|
||||
numexpr==2.10.1
|
||||
numpy==2.1.1
|
||||
openai==1.45.1
|
||||
openai==1.51.1
|
||||
openapi-schema-pydantic==1.2.4
|
||||
packaging>=23.2
|
||||
pydantic==2.9.2
|
||||
@@ -30,4 +30,4 @@ tqdm==4.66.5
|
||||
typing-inspect==0.9.0
|
||||
typing_extensions==4.12.2
|
||||
urllib3==2.2.3
|
||||
yarl==1.11.1
|
||||
yarl==1.13.1
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
streamlit==1.38.0
|
||||
streamlit==1.39.0
|
||||
requests
|
||||
gallery/arch-function.yaml (new file)
@@ -0,0 +1,66 @@
---
|
||||
name: "chatml"
|
||||
|
||||
config_file: |
|
||||
mmap: true
|
||||
function:
|
||||
disable_no_action: true
|
||||
grammar:
|
||||
mixed_mode: false
|
||||
disable: true
|
||||
parallel_calls: true
|
||||
expect_strings_after_json: true
|
||||
json_regex_match:
|
||||
- "(?s)<tool_call>(.*?)</tool_call>"
|
||||
- "(?s)<tool_call>(.*)"
|
||||
capture_llm_results:
|
||||
- (?s)<scratchpad>(.*?)</scratchpad>
|
||||
replace_llm_results:
|
||||
- key: (?s)<scratchpad>(.*?)</scratchpad>
|
||||
value: ""
|
||||
template:
|
||||
chat_message: |
|
||||
<|im_start|>{{ .RoleName }}
|
||||
{{ if .FunctionCall -}}
|
||||
Function call:
|
||||
{{ else if eq .RoleName "tool" -}}
|
||||
Function response:
|
||||
{{ end -}}
|
||||
{{ if .Content -}}
|
||||
{{.Content }}
|
||||
{{ end -}}
|
||||
{{ if .FunctionCall -}}
|
||||
{{toJson .FunctionCall}}
|
||||
{{ end -}}<|im_end|>
|
||||
function: |
|
||||
<|im_start|>system
|
||||
# Tools
|
||||
|
||||
You may call one or more functions to assist with the user query.
|
||||
|
||||
You are provided with function signatures within <tools></tools> XML tags:
|
||||
<tools>
|
||||
{{range .Functions}}
|
||||
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
|
||||
{{end}}
|
||||
</tools>
|
||||
For each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:
|
||||
<tool_call>
|
||||
{"name": <function-name>, "arguments": <args-json-object>}
|
||||
</tool_call>
|
||||
<|im_end|>
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
chat: |
|
||||
{{.Input -}}
|
||||
<|im_start|>assistant
|
||||
completion: |
|
||||
{{.Input}}
|
||||
context_size: 4096
|
||||
f16: true
|
||||
stopwords:
|
||||
- '<|im_end|>'
|
||||
- '<dummy32000>'
|
||||
- '</s>'
|
||||
- "<|eot_id|>"
|
||||
- "<|end_of_text|>"
|
||||
@@ -1,6 +1,189 @@
|
||||
---
|
||||
## Qwen2.5
|
||||
- name: "moe-girl-1ba-7bt-i1"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/kTXXSSSqpb21rfyOX7FUa.jpeg
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/allura-org/MoE-Girl-1BA-7BT
|
||||
- https://huggingface.co/mradermacher/MoE-Girl-1BA-7BT-i1-GGUF
|
||||
description: |
|
||||
A finetune of OLMoE by AllenAI designed for roleplaying (and maybe general usecases if you try hard enough).
|
||||
PLEASE do not expect godliness out of this, it's a model with 1 billion active parameters. Expect something more akin to Gemma 2 2B, not Llama 3 8B.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
sha256: e6ef9c311c73573b243de6ff7538b386f430af30b2be0a96a5745c17137ad432
|
||||
uri: huggingface://mradermacher/MoE-Girl-1BA-7BT-i1-GGUF/MoE-Girl-1BA-7BT.i1-Q4_K_M.gguf
|
||||
- name: "salamandra-7b-instruct"
|
||||
icon: https://huggingface.co/BSC-LT/salamandra-7b-instruct/resolve/main/images/salamandra_header.png
|
||||
# Uses chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
license: apache-2.0
|
||||
urls:
|
||||
- https://huggingface.co/BSC-LT/salamandra-7b-instruct
|
||||
- https://huggingface.co/cstr/salamandra-7b-instruct-GGUF
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- cpu
|
||||
- salamandra
|
||||
description: |
|
||||
Transformer-based decoder-only language model that has been pre-trained on 7.8 trillion tokens of highly curated data. The pre-training corpus contains text in 35 European languages and code.
|
||||
Salamandra comes in three different sizes — 2B, 7B and 40B parameters — with their respective base and instruction-tuned variants. This model card corresponds to the 7B instructed version.
|
||||
overrides:
|
||||
parameters:
|
||||
model: salamandra-7b-instruct.Q4_K_M-f32.gguf
|
||||
files:
|
||||
- filename: salamandra-7b-instruct.Q4_K_M-f32.gguf
|
||||
sha256: bac8e8c1d1d9d53cbdb148b8ff9ad378ddb392429207099e85b5aae3a43bff3d
|
||||
uri: huggingface://cstr/salamandra-7b-instruct-GGUF/salamandra-7b-instruct.Q4_K_M-f32.gguf
|
||||
## llama3.2
|
||||
- &llama32
|
||||
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
|
||||
license: llama3.2
|
||||
description: |
|
||||
The Meta Llama 3.2 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction-tuned generative models in 1B and 3B sizes (text in/text out). The Llama 3.2 instruction-tuned text only models are optimized for multilingual dialogue use cases, including agentic retrieval and summarization tasks. They outperform many of the available open source and closed chat models on common industry benchmarks.
|
||||
|
||||
Model Developer: Meta
|
||||
|
||||
Model Architecture: Llama 3.2 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety.
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- cpu
|
||||
- llama3.2
|
||||
name: "llama-3.2-1b-instruct:q4_k_m"
|
||||
urls:
|
||||
- https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-3.2-1b-instruct-q4_k_m.gguf
|
||||
files:
|
||||
- filename: llama-3.2-1b-instruct-q4_k_m.gguf
|
||||
sha256: 1d0e9419ec4e12aef73ccf4ffd122703e94c48344a96bc7c5f0f2772c2152ce3
|
||||
uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-instruct:q4_k_m"
|
||||
urls:
|
||||
- https://huggingface.co/hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-3.2-3b-instruct-q4_k_m.gguf
|
||||
files:
|
||||
- filename: llama-3.2-3b-instruct-q4_k_m.gguf
|
||||
sha256: c55a83bfb6396799337853ca69918a0b9bbb2917621078c34570bc17d20fd7a1
|
||||
uri: huggingface://hugging-quants/Llama-3.2-3B-Instruct-Q4_K_M-GGUF/llama-3.2-3b-instruct-q4_k_m.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-instruct:q8_0"
|
||||
urls:
|
||||
- https://huggingface.co/hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-3.2-3b-instruct-q8_0.gguf
|
||||
files:
|
||||
- filename: llama-3.2-3b-instruct-q8_0.gguf
|
||||
sha256: 51725f77f997a5080c3d8dd66e073da22ddf48ab5264f21f05ded9b202c3680e
|
||||
uri: huggingface://hugging-quants/Llama-3.2-3B-Instruct-Q8_0-GGUF/llama-3.2-3b-instruct-q8_0.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-1b-instruct:q8_0"
|
||||
urls:
|
||||
- https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: llama-3.2-1b-instruct-q8_0.gguf
|
||||
files:
|
||||
- filename: llama-3.2-1b-instruct-q8_0.gguf
|
||||
sha256: ba345c83bf5cc679c653b853c46517eea5a34f03ed2205449db77184d9ae62a9
|
||||
uri: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf
|
||||
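The uri fields above use a huggingface://<owner>/<repo>/<file> shorthand rather than a full URL. As an assumption for illustration only (the project's actual resolver may behave differently, for instance with respect to branches), such a shorthand can be expanded into a direct download link against the repository's default branch:

package main

import (
	"fmt"
	"strings"
)

// resolveHuggingFaceURI expands a huggingface://owner/repo/path shorthand
// into a plain HTTPS download URL, assuming the file lives on the default
// "main" branch. This is an illustrative helper, not LocalAI's resolver.
func resolveHuggingFaceURI(uri string) (string, error) {
	rest, ok := strings.CutPrefix(uri, "huggingface://")
	if !ok {
		return "", fmt.Errorf("not a huggingface:// uri: %s", uri)
	}
	parts := strings.SplitN(rest, "/", 3)
	if len(parts) != 3 {
		return "", fmt.Errorf("expected owner/repo/file, got: %s", rest)
	}
	owner, repo, file := parts[0], parts[1], parts[2]
	return fmt.Sprintf("https://huggingface.co/%s/%s/resolve/main/%s", owner, repo, file), nil
}

func main() {
	url, err := resolveHuggingFaceURI("huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q8_0-GGUF/llama-3.2-1b-instruct-q8_0.gguf")
	if err != nil {
		fmt.Println("error:", err)
		return
	}
	fmt.Println(url)
}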
## Uncensored
|
||||
- !!merge <<: *llama32
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66c9d7a26f2335ba288810a4/4YDg-rcEXCK0fdTS1fBzE.webp
|
||||
name: "versatillama-llama-3.2-3b-instruct-abliterated"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF
|
||||
description: |
|
||||
Small but smart, fine-tuned on a vast dataset of conversations. Able to generate human-like text with high performance within its size. It is very versatile for its size and parameter count, and offers capability almost as good as Llama 3.1 8B Instruct.
|
||||
overrides:
|
||||
parameters:
|
||||
model: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
sha256: 15b9e4a987f50d7594d030815c7166a996e20db46fe1e20da03e96955020312c
|
||||
uri: huggingface://QuantFactory/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated-GGUF/VersatiLlama-Llama-3.2-3B-Instruct-Abliterated.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-enigma"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/it7MY5MyLCLpFQev5dUis.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Enigma-GGUF
|
||||
description: |
|
||||
Enigma is a code-instruct model built on Llama 3.2 3b. It is a high quality code instruct model with the Llama 3.2 Instruct chat format. The model is finetuned on synthetic code-instruct data generated with Llama 3.1 405b and supplemented with generalist synthetic data. It uses the Llama 3.2 Instruct prompt format.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
sha256: 4304e6ee1e348b228470700ec1e9423f5972333d376295195ce6cd5c70cae5e4
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Enigma-GGUF/Llama3.2-3B-Enigma.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama3.2-3b-esper2"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64f267a8a4f79a118e0fcc89/4I6oK8DG0so4VD8GroFsd.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama3.2-3B-Esper2-GGUF
|
||||
description: |
|
||||
Esper 2 is a DevOps and cloud architecture code specialist built on Llama 3.2 3b. It is an AI assistant focused on AWS, Azure, GCP, Terraform, Dockerfiles, pipelines, shell scripts and more, with real world problem solving and high quality code instruct performance within the Llama 3.2 Instruct chat format. Finetuned on synthetic DevOps-instruct and code-instruct data generated with Llama 3.1 405b and supplemented with generalist chat data.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
sha256: 11d2bd674aa22a71a59ec49ad29b695000d14bc275b0195b8d7089bfc7582fc7
|
||||
uri: huggingface://QuantFactory/Llama3.2-3B-Esper2-GGUF/Llama3.2-3B-Esper2.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-GGUF
|
||||
description: |
|
||||
The model is a quantized version of EpistemeAI/Llama-3.2-3B-Agent007, developed by EpistemeAI and fine-tuned from unsloth/llama-3.2-3b-instruct-bnb-4bit. It was trained 2x faster with Unsloth and Huggingface's TRL library. Fine tuned with Agent datasets.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
sha256: 7a2543a69b116f2a059e2e445e5d362bb7df4a51b97e83d8785c1803dc9d687f
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-GGUF/Llama-3.2-3B-Agent007.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "llama-3.2-3b-agent007-coder"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF
|
||||
description: |
|
||||
The Llama-3.2-3B-Agent007-Coder-GGUF is a quantized version of the EpistemeAI/Llama-3.2-3B-Agent007-Coder model, which is a fine-tuned version of the unsloth/llama-3.2-3b-instruct-bnb-4bit model. It is created using llama.cpp and trained with additional datasets such as the Agent dataset, Code Alpaca 20K, and magpie ultra 0.1. This model is optimized for multilingual dialogue use cases and agentic retrieval and summarization tasks. The model is available for commercial and research use in multiple languages and is best used with the transformers library.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
sha256: 49a4861c094d94ef5faa33f69b02cd132bb0167f1c3ca59059404f85f61e1d12
|
||||
uri: huggingface://QuantFactory/Llama-3.2-3B-Agent007-Coder-GGUF/Llama-3.2-3B-Agent007-Coder.Q4_K_M.gguf
|
||||
- !!merge <<: *llama32
|
||||
name: "fireball-meta-llama-3.2-8b-instruct-agent-003-128k-code-dpo"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF
|
||||
description: |
|
||||
The LLM model is a quantized version of EpistemeAI/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO, which is an experimental and revolutionary fine-tune with DPO dataset to allow LLama 3.1 8B to be an agentic coder. It has some built-in agent features such as search, calculator, and ReAct. Other noticeable features include self-learning using unsloth, RAG applications, and memory. The context window of the model is 128K. It can be integrated into projects using popular libraries like Transformers and vLLM. The model is suitable for use with Langchain or LLamaIndex. The model is developed by EpistemeAI and licensed under the Apache 2.0 license.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
sha256: 7f45fa79bc6c9847ef9fbad08c3bb5a0f2dbb56d2e2200a5d37b260a57274e55
|
||||
uri: huggingface://QuantFactory/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO-GGUF/Fireball-Meta-Llama-3.2-8B-Instruct-agent-003-128k-code-DPO.Q4_K_M.gguf
|
||||
- &qwen25
|
||||
## Qwen2.5
|
||||
name: "qwen2.5-14b-instruct"
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
license: apache-2.0
|
||||
@@ -29,11 +212,11 @@
|
||||
- https://huggingface.co/bartowski/Qwen2.5-Math-7B-Instruct-GGUF
|
||||
- https://huggingface.co/Qwen/Qwen2.5-Math-7B-Instruct
|
||||
description: |
|
||||
In August 2024, we released the first series of mathematical LLMs - Qwen2-Math - of our Qwen family. A month later, we have upgraded it and open-sourced Qwen2.5-Math series, including base models Qwen2.5-Math-1.5B/7B/72B, instruction-tuned models Qwen2.5-Math-1.5B/7B/72B-Instruct, and mathematical reward model Qwen2.5-Math-RM-72B.
|
||||
In August 2024, we released the first series of mathematical LLMs - Qwen2-Math - of our Qwen family. A month later, we have upgraded it and open-sourced Qwen2.5-Math series, including base models Qwen2.5-Math-1.5B/7B/72B, instruction-tuned models Qwen2.5-Math-1.5B/7B/72B-Instruct, and mathematical reward model Qwen2.5-Math-RM-72B.
|
||||
|
||||
Unlike the Qwen2-Math series, which only supports using Chain-of-Thought (CoT) to solve English math problems, the Qwen2.5-Math series is expanded to support using both CoT and Tool-integrated Reasoning (TIR) to solve math problems in both Chinese and English. The Qwen2.5-Math series models have achieved significant performance improvements compared to the Qwen2-Math series models on the Chinese and English mathematics benchmarks with CoT.
|
||||
Unlike the Qwen2-Math series, which only supports using Chain-of-Thought (CoT) to solve English math problems, the Qwen2.5-Math series is expanded to support using both CoT and Tool-integrated Reasoning (TIR) to solve math problems in both Chinese and English. The Qwen2.5-Math series models have achieved significant performance improvements compared to the Qwen2-Math series models on the Chinese and English mathematics benchmarks with CoT.
|
||||
|
||||
The base models of Qwen2-Math are initialized with Qwen2-1.5B/7B/72B, and then pretrained on a meticulously designed Mathematics-specific Corpus. This corpus contains large-scale high-quality mathematical web texts, books, codes, exam questions, and mathematical pre-training data synthesized by Qwen2.
|
||||
The base models of Qwen2-Math are initialized with Qwen2-1.5B/7B/72B, and then pretrained on a meticulously designed Mathematics-specific Corpus. This corpus contains large-scale high-quality mathematical web texts, books, codes, exam questions, and mathematical pre-training data synthesized by Qwen2.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Qwen2.5-Math-7B-Instruct-Q4_K_M.gguf
|
||||
@@ -135,8 +318,8 @@
|
||||
model: Qwen2.5-32B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Qwen2.5-32B.Q4_K_M.gguf
|
||||
sha256: 02703e27c8b964db445444581a6937ad7538f0c32a100b26b49fa0e8ff527155
|
||||
uri: huggingface://mradermacher/Qwen2.5-32B-GGUF/Qwen2.5-32B.Q4_K_M.gguf
|
||||
sha256: fa42a4067e3630929202b6bb1ef5cebc43c1898494aedfd567b7d53c7a9d84a6
|
||||
- !!merge <<: *qwen25
|
||||
name: "qwen2.5-32b-instruct"
|
||||
urls:
|
||||
@@ -161,8 +344,189 @@
|
||||
- filename: Qwen2.5-72B-Instruct-Q4_K_M.gguf
|
||||
sha256: e4c8fad16946be8cf0bbf67eb8f4e18fc7415a5a6d2854b4cda453edb4082545
|
||||
uri: huggingface://bartowski/Qwen2.5-72B-Instruct-GGUF/Qwen2.5-72B-Instruct-Q4_K_M.gguf
|
||||
## SmolLM
|
||||
- !!merge <<: *qwen25
|
||||
name: "bigqwen2.5-52b-instruct"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/61b8e2ba285851687028d395/98GiKtmH1AtHHbIbOUH4Y.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/mlabonne/BigQwen2.5-52B-Instruct
|
||||
- https://huggingface.co/bartowski/BigQwen2.5-52B-Instruct-GGUF
|
||||
description: |
|
||||
BigQwen2.5-52B-Instruct is a Qwen/Qwen2-32B-Instruct self-merge made with MergeKit.
|
||||
It applies the mlabonne/Meta-Llama-3-120B-Instruct recipe.
|
||||
overrides:
|
||||
parameters:
|
||||
model: BigQwen2.5-52B-Instruct-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: BigQwen2.5-52B-Instruct-Q4_K_M.gguf
|
||||
sha256: 9c939f08e366b51b07096eb2ecb5cc2a82894ac7baf639e446237ad39889c896
|
||||
uri: huggingface://bartowski/BigQwen2.5-52B-Instruct-GGUF/BigQwen2.5-52B-Instruct-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "replete-llm-v2.5-qwen-14b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/ihnWXDEgV-ZKN_B036U1J.png
|
||||
urls:
|
||||
- https://huggingface.co/Replete-AI/Replete-LLM-V2.5-Qwen-14b
|
||||
- https://huggingface.co/bartowski/Replete-LLM-V2.5-Qwen-14b-GGUF
|
||||
description: |
|
||||
Replete-LLM-V2.5-Qwen-14b is a continuously finetuned version of Qwen2.5-14B. I noticed recently that the Qwen team did not learn from my methods of continuous finetuning, the great benefits, and no downsides of it. So I took it upon myself to merge the instruct model with the base model myself using the Ties merge method.
|
||||
|
||||
This version of the model shows higher performance than the original instruct and base models.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
|
||||
sha256: 17d0792ff5e3062aecb965629f66e679ceb407e4542e8045993dcfe9e7e14d9d
|
||||
uri: huggingface://bartowski/Replete-LLM-V2.5-Qwen-14b-GGUF/Replete-LLM-V2.5-Qwen-14b-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "replete-llm-v2.5-qwen-7b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/ihnWXDEgV-ZKN_B036U1J.png
|
||||
urls:
|
||||
- https://huggingface.co/Replete-AI/Replete-LLM-V2.5-Qwen-7b
|
||||
- https://huggingface.co/bartowski/Replete-LLM-V2.5-Qwen-7b-GGUF
|
||||
description: |
|
||||
Replete-LLM-V2.5-Qwen-7b is a continuously finetuned version of Qwen2.5-7B. I noticed recently that the Qwen team did not learn from my methods of continuous finetuning, the great benefits, and no downsides of it. So I took it upon myself to merge the instruct model with the base model myself using the Ties merge method.
|
||||
|
||||
This version of the model shows higher performance than the original instruct and base models.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
|
||||
sha256: 054d54972259c0398b4e0af3f408f608e1166837b1d7535d08fc440d1daf8639
|
||||
uri: huggingface://bartowski/Replete-LLM-V2.5-Qwen-7b-GGUF/Replete-LLM-V2.5-Qwen-7b-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "calme-2.2-qwen2.5-72b-i1"
|
||||
icon: https://huggingface.co/MaziyarPanahi/calme-2.2-qwen2.5-72b/resolve/main/calme-2.webp
|
||||
urls:
|
||||
- https://huggingface.co/MaziyarPanahi/calme-2.2-qwen2.5-72b
|
||||
- https://huggingface.co/mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF
|
||||
description: |
|
||||
This model is a fine-tuned version of the powerful Qwen/Qwen2.5-72B-Instruct, pushing the boundaries of natural language understanding and generation even further. My goal was to create a versatile and robust model that excels across a wide range of benchmarks and real-world applications.
|
||||
Use Cases
|
||||
|
||||
This model is suitable for a wide range of applications, including but not limited to:
|
||||
|
||||
Advanced question-answering systems
|
||||
Intelligent chatbots and virtual assistants
|
||||
Content generation and summarization
|
||||
Code generation and analysis
|
||||
Complex problem-solving and decision support
|
||||
overrides:
|
||||
parameters:
|
||||
model: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
sha256: 5fdfa599724d7c78502c477ced1d294e92781b91d3265bd0748fbf15a6fefde6
|
||||
uri: huggingface://mradermacher/calme-2.2-qwen2.5-72b-i1-GGUF/calme-2.2-qwen2.5-72b.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "t.e-8.1-iq-imatrix-request"
|
||||
# chatml
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/65d4cf2693a0a3744a27536c/K1aNPf32z-6tYZdcSQBzF.png
|
||||
urls:
|
||||
- https://huggingface.co/Cran-May/T.E-8.1
|
||||
- https://huggingface.co/Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request
|
||||
description: |
|
||||
Trained for roleplay uses.
|
||||
overrides:
|
||||
parameters:
|
||||
model: T.E-8.1-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: T.E-8.1-Q4_K_M-imat.gguf
|
||||
sha256: 1b7892b82c01ea4cbebe34cd00f9836cbbc369fc3247c1f44a92842201e7ec0b
|
||||
uri: huggingface://Lewdiculous/T.E-8.1-GGUF-IQ-Imatrix-Request/T.E-8.1-Q4_K_M-imat.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "rombos-llm-v2.5.1-qwen-3b"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/pNDtgE5FDkxxvbG4qiZ1A.jpeg
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF
|
||||
description: |
|
||||
Rombos-LLM-V2.5.1-Qwen-3b is a little experiment that merges a high-quality LLM, arcee-ai/raspberry-3B, using the last step of the Continuous Finetuning method outlined in a Google document. The merge is done using mergekit with the following parameters:
|
||||
|
||||
- Models: Qwen2.5-3B-Instruct, raspberry-3B
|
||||
- Merge method: ties
|
||||
- Base model: Qwen2.5-3B
|
||||
- Parameters: weight=1, density=1, normalize=true, int8_mask=true
|
||||
- Dtype: bfloat16
|
||||
|
||||
The model has been evaluated on various tasks and datasets, and the results are available on the Open LLM Leaderboard. The model has shown promising performance across different benchmarks.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
sha256: 656c342a2921cac8912e0123fc295c3bb3d631a85c671c12a3843a957e46d30d
|
||||
uri: huggingface://QuantFactory/Rombos-LLM-V2.5.1-Qwen-3b-GGUF/Rombos-LLM-V2.5.1-Qwen-3b.Q4_K_M.gguf
|
||||
- !!merge <<: *qwen25
|
||||
name: "qwen2.5-7b-ins-v3"
|
||||
urls:
|
||||
- https://huggingface.co/happzy2633/qwen2.5-7b-ins-v3
|
||||
- https://huggingface.co/bartowski/qwen2.5-7b-ins-v3-GGUF
|
||||
description: |
|
||||
Qwen 2.5 fine-tuned on CoT to match o1 performance. An attempt to build an Open o1 matching the OpenAI o1 model.
|
||||
Demo: https://huggingface.co/spaces/happzy2633/open-o1
|
||||
overrides:
|
||||
parameters:
|
||||
model: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
sha256: 9c23734072714a4886c0386ae0ff07a5e940d67ad52278e2ed689fec44e1e0c8
|
||||
uri: huggingface://bartowski/qwen2.5-7b-ins-v3-GGUF/qwen2.5-7b-ins-v3-Q4_K_M.gguf
|
||||
- &archfunct
|
||||
license: apache-2.0
|
||||
tags:
|
||||
- llm
|
||||
- gguf
|
||||
- gpu
|
||||
- qwen
|
||||
- qwen2.5
|
||||
- cpu
|
||||
- function-calling
|
||||
name: "arch-function-1.5b"
|
||||
uri: "github:mudler/LocalAI/gallery/arch-function.yaml@master"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-1.5B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-1.5B-GGUF
|
||||
description: |
|
||||
The Katanemo Arch-Function collection of large language models (LLMs) is a collection of state-of-the-art (SOTA) LLMs specifically designed for function-calling tasks. The models are designed to understand complex function signatures, identify required parameters, and produce accurate function call outputs based on natural language prompts. Achieving performance on par with GPT-4, these models set a new benchmark in the domain of function-oriented tasks, making them suitable for scenarios where automated API interaction and function execution are crucial.
|
||||
In summary, the Katanemo Arch-Function collection demonstrates:
|
||||
State-of-the-art performance in function calling
|
||||
Accurate parameter identification and suggestion, even in ambiguous or incomplete inputs
|
||||
High generalization across multiple function calling use cases, from API interactions to automated backend tasks.
|
||||
Optimized low-latency, high-throughput performance, making it suitable for real-time, production environments.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-1.5B.Q4_K_M.gguf
|
||||
sha256: 5ac54d2d50cca0ee0335ca2c9b688204c0829cd3a73de3ee3fda108281ad9691
|
||||
uri: huggingface://mradermacher/Arch-Function-1.5B-GGUF/Arch-Function-1.5B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-7b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-7B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-7B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-7B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-7B.Q4_K_M.gguf
|
||||
sha256: 6e38661321d79d02b8cf57c79d97c6c0e19adb9ffa66083cc440c24e257234b6
|
||||
uri: huggingface://mradermacher/Arch-Function-7B-GGUF/Arch-Function-7B.Q4_K_M.gguf
|
||||
- !!merge <<: *archfunct
|
||||
name: "arch-function-3b"
|
||||
urls:
|
||||
- https://huggingface.co/katanemolabs/Arch-Function-3B
|
||||
- https://huggingface.co/mradermacher/Arch-Function-3B-GGUF
|
||||
overrides:
|
||||
parameters:
|
||||
model: Arch-Function-3B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Arch-Function-3B.Q4_K_M.gguf
|
||||
sha256: 9945cb8d070498d163e5df90c1987f591d35e4fd2222a6c51bcfff848c4b573b
|
||||
uri: huggingface://mradermacher/Arch-Function-3B-GGUF/Arch-Function-3B.Q4_K_M.gguf
|
||||
- &smollm
|
||||
## SmolLM
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
name: "smollm-1.7b-instruct"
|
||||
icon: https://huggingface.co/datasets/HuggingFaceTB/images/resolve/main/banner_smol.png
|
||||
@@ -591,9 +955,9 @@
|
||||
- https://huggingface.co/leafspark/Reflection-Llama-3.1-70B-bf16
|
||||
- https://huggingface.co/senseable/Reflection-Llama-3.1-70B-gguf
|
||||
description: |
|
||||
Reflection Llama-3.1 70B is (currently) the world's top open-source LLM, trained with a new technique called Reflection-Tuning that teaches a LLM to detect mistakes in its reasoning and correct course.
|
||||
Reflection Llama-3.1 70B is (currently) the world's top open-source LLM, trained with a new technique called Reflection-Tuning that teaches a LLM to detect mistakes in its reasoning and correct course.
|
||||
|
||||
The model was trained on synthetic data generated by Glaive. If you're training a model, Glaive is incredible — use them.
|
||||
The model was trained on synthetic data generated by Glaive. If you're training a model, Glaive is incredible — use them.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Reflection-Llama-3.1-70B-q4_k_m.gguf
|
||||
@@ -670,6 +1034,21 @@
|
||||
- filename: NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
sha256: d09d53259ad2c0298150fa8c2db98fe42f11731af89fdc80ad0e255a19adc4b0
|
||||
uri: huggingface://bartowski/NightyGurps-14b-v1.1-GGUF/NightyGurps-14b-v1.1-Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "llama-3.1-swallow-70b-v0.1-i1"
|
||||
icon: https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1/resolve/main/logo.png
|
||||
urls:
|
||||
- https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-70B-v0.1
|
||||
- https://huggingface.co/mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF
|
||||
description: |
|
||||
Llama 3.1 Swallow is a series of large language models (8B, 70B) that were built by continual pre-training on the Meta Llama 3.1 models. Llama 3.1 Swallow enhanced the Japanese language capabilities of the original Llama 3.1 while retaining the English language capabilities. We use approximately 200 billion tokens that were sampled from a large Japanese web corpus (Swallow Corpus Version 2), Japanese and English Wikipedia articles, and mathematical and coding contents, etc (see the Training Datasets section) for continual pre-training. The instruction-tuned models (Instruct) were built by supervised fine-tuning (SFT) on the synthetic data specially built for Japanese. See the Swallow Model Index section to find other model variants.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
sha256: 9eaa08a4872a26f56fe34b27a99f7bd0d22ee2b2d1c84cfcde2091b5f61af5fa
|
||||
uri: huggingface://mradermacher/Llama-3.1-Swallow-70B-v0.1-i1-GGUF/Llama-3.1-Swallow-70B-v0.1.i1-Q4_K_M.gguf
|
||||
## Uncensored models
|
||||
- !!merge <<: *llama31
|
||||
name: "humanish-roleplay-llama-3.1-8b-i1"
|
||||
@@ -913,15 +1292,15 @@
|
||||
- https://huggingface.co/Sao10K/L3.1-8B-Niitama-v1.1
|
||||
- https://huggingface.co/Lewdiculous/L3.1-8B-Niitama-v1.1-GGUF-IQ-Imatrix
|
||||
description: |
|
||||
GGUF-IQ-Imatrix quants for Sao10K/L3.1-8B-Niitama-v1.1
|
||||
Here's the subjectively superior L3 version: L3-8B-Niitama-v1
|
||||
An experimental model using experimental methods.
|
||||
GGUF-IQ-Imatrix quants for Sao10K/L3.1-8B-Niitama-v1.1
|
||||
Here's the subjectively superior L3 version: L3-8B-Niitama-v1
|
||||
An experimental model using experimental methods.
|
||||
|
||||
More detail on it:
|
||||
More detail on it:
|
||||
|
||||
Tamamo and Niitama are made from the same data. Literally. The only thing that's changed is how they're shuffled and formatted. Yet, I get wildly different results.
|
||||
Tamamo and Niitama are made from the same data. Literally. The only thing that's changed is how they're shuffled and formatted. Yet, I get wildly different results.
|
||||
|
||||
Interesting, eh? Feels kinda not as good compared to the l3 version, but it's aight.
|
||||
Interesting, eh? Feels kinda not as good compared to the l3 version, but it's aight.
|
||||
overrides:
|
||||
parameters:
|
||||
model: L3.1-8B-Niitama-v1.1-Q4_K_M-imat.gguf
|
||||
@@ -986,6 +1365,53 @@
|
||||
- filename: Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
|
||||
sha256: 0a601c7341228d9160332965298d799369a1dc2b7080771fb8051bdeb556b30c
|
||||
uri: huggingface://bartowski/Llama-3.1-8B-ArliAI-RPMax-v1.1-GGUF/Llama-3.1-8B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "violet_twilight-v0.2-iq-imatrix"
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/64adfd277b5ff762771e4571/P962FQhRG4I8nbU_DJolY.png
|
||||
urls:
|
||||
- https://huggingface.co/Epiculous/Violet_Twilight-v0.2
|
||||
- https://huggingface.co/Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix
|
||||
description: |
|
||||
Now for something a bit different, Violet_Twilight-v0.2! This model is a SLERP merge of Azure_Dusk-v0.2 and Crimson_Dawn-v0.2!
|
||||
overrides:
|
||||
parameters:
|
||||
model: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: Violet_Twilight-v0.2-Q4_K_M-imat.gguf
|
||||
sha256: 0793d196a00cd6fd4e67b8c585b27a94d397e33d427e4ad4aa9a16b7abc339cd
|
||||
uri: huggingface://Lewdiculous/Violet_Twilight-v0.2-GGUF-IQ-Imatrix/Violet_Twilight-v0.2-Q4_K_M-imat.gguf
|
||||
- !!merge <<: *llama31
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
name: "dans-personalityengine-v1.0.0-8b"
|
||||
urls:
|
||||
- https://huggingface.co/PocketDoc/Dans-PersonalityEngine-v1.0.0-8b
|
||||
- https://huggingface.co/bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF
|
||||
description: |
|
||||
This model is intended to be multifarious in its capabilities and should be quite capable at both co-writing and roleplay as well as find itself quite at home performing sentiment analysis or summarization as part of a pipeline. It has been trained on a wide array of one shot instructions, multi turn instructions, role playing scenarios, text adventure games, co-writing, and much more. The full dataset is publicly available and can be found in the datasets section of the model page.
|
||||
|
||||
There has not been any form of harmfulness alignment done on this model, please take the appropriate precautions when using it in a production environment.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
|
||||
sha256: 193b66434c9962e278bb171a21e652f0d3f299f04e86c95f9f75ec5aa8ff006e
|
||||
uri: huggingface://bartowski/Dans-PersonalityEngine-v1.0.0-8b-GGUF/Dans-PersonalityEngine-v1.0.0-8b-Q4_K_M.gguf
|
||||
- !!merge <<: *llama31
|
||||
name: "nihappy-l3.1-8b-v0.09"
|
||||
urls:
|
||||
- https://huggingface.co/Arkana08/NIHAPPY-L3.1-8B-v0.09
|
||||
- https://huggingface.co/QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF
|
||||
description: |
|
||||
The model is a quantized version of Arkana08/NIHAPPY-L3.1-8B-v0.09 created using llama.cpp. It is a role-playing model that integrates the finest qualities of various pre-trained language models, focusing on dynamic storytelling.
|
||||
overrides:
|
||||
parameters:
|
||||
model: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
|
||||
sha256: 9bd46a06093448b143bd2775f0fb1b1b172c851fafdce31289e13b7dfc23a0d7
|
||||
uri: huggingface://QuantFactory/NIHAPPY-L3.1-8B-v0.09-GGUF/NIHAPPY-L3.1-8B-v0.09.Q4_K_M.gguf
|
||||
- &deepseek
|
||||
## Deepseek
|
||||
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
|
||||
@@ -1546,8 +1972,8 @@
|
||||
urls:
|
||||
- https://huggingface.co/Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix
|
||||
description: |
|
||||
A finetune of Mistral Nemo by Sao10K.
|
||||
Uses the ChatML prompt format.
|
||||
A finetune of Mistral Nemo by Sao10K.
|
||||
Uses the ChatML prompt format.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
@@ -1555,6 +1981,47 @@
|
||||
- filename: MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
sha256: 1989123481ca1936c8a2cbe278ff5d1d2b0ae63dbdc838bb36a6d7547b8087b3
|
||||
uri: huggingface://Lewdiculous/MN-12B-Lyra-v4-GGUF-IQ-Imatrix/MN-12B-Lyra-v4-Q4_K_M-imat.gguf
|
||||
- !!merge <<: *mistral03
|
||||
name: "magnusintellectus-12b-v1-i1"
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/66b564058d9afb7a9d5607d5/hUVJI1Qa4tCMrZWMgYkoD.png
|
||||
urls:
|
||||
- https://huggingface.co/GalrionSoftworks/MagnusIntellectus-12B-v1
|
||||
- https://huggingface.co/mradermacher/MagnusIntellectus-12B-v1-i1-GGUF
|
||||
description: |
|
||||
How pleasant, the rocks appear to have made a decent conglomerate. A-.
|
||||
|
||||
MagnusIntellectus is a merge of the following models using LazyMergekit:
|
||||
|
||||
UsernameJustAnother/Nemo-12B-Marlin-v5
|
||||
anthracite-org/magnum-12b-v2
|
||||
overrides:
|
||||
parameters:
|
||||
model: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
|
||||
sha256: c97107983b4edc5b6f2a592d227ca2dd4196e2af3d3bc0fe6b7a8954a1fb5870
|
||||
uri: huggingface://mradermacher/MagnusIntellectus-12B-v1-i1-GGUF/MagnusIntellectus-12B-v1.i1-Q4_K_M.gguf
|
||||
- !!merge <<: *mistral03
|
||||
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
|
||||
name: "mn-backyardai-party-12b-v1-iq-arm-imatrix"
|
||||
icon: https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1/resolve/main/party1.png
|
||||
urls:
|
||||
- https://huggingface.co/Sao10K/MN-BackyardAI-Party-12B-v1
|
||||
- https://huggingface.co/Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix
|
||||
description: |
|
||||
This is a group-chat based roleplaying model, based off of 12B-Lyra-v4a2, a variant of Lyra-v4 that is currently private.
|
||||
|
||||
It is trained on an entirely human-based dataset, based on forum / internet group roleplaying styles. The only augmentation done with LLMs is to the character sheets, to fit to the system prompt, to fit various character sheets within context.
|
||||
|
||||
This model is still capable of 1 on 1 roleplay, though I recommend using ChatML when doing that instead.
|
||||
overrides:
|
||||
parameters:
|
||||
model: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
|
||||
files:
|
||||
- filename: MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
|
||||
sha256: cea68768dff58b553974b755bb40ef790ab8b86866d9b5c46bc2e6c3311b876a
|
||||
uri: huggingface://Lewdiculous/MN-BackyardAI-Party-12B-v1-GGUF-IQ-ARM-Imatrix/MN-BackyardAI-Party-12B-v1-Q4_K_M-imat.gguf
|
||||
- &mudler
|
||||
### START mudler's LocalAI specific-models
|
||||
url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
|
||||
@@ -2074,7 +2541,7 @@
|
||||
- https://huggingface.co/EpistemeAI/Athena-codegemma-2-2b-it
|
||||
- https://huggingface.co/mradermacher/Athena-codegemma-2-2b-it-GGUF
|
||||
description: |
|
||||
Supervised fine tuned (sft unsloth) for coding with EpistemeAI coding dataset.
|
||||
Supervised fine tuned (sft unsloth) for coding with EpistemeAI coding dataset.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Athena-codegemma-2-2b-it.Q4_K_M.gguf
|
||||
@@ -2151,6 +2618,37 @@
|
||||
- filename: Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
|
||||
sha256: 89fe35345754d7e9de8d0c0d5bf35b2be9b12a09811b365b712b8b27112f7712
|
||||
uri: huggingface://bartowski/Gemma-2-2B-ArliAI-RPMax-v1.1-GGUF/Gemma-2-2B-ArliAI-RPMax-v1.1-Q4_K_M.gguf
|
||||
- !!merge <<: *gemma
|
||||
name: "gemma-2-9b-it-abliterated"
|
||||
urls:
|
||||
- https://huggingface.co/IlyaGusev/gemma-2-9b-it-abliterated
|
||||
- https://huggingface.co/bartowski/gemma-2-9b-it-abliterated-GGUF
|
||||
description: |
|
||||
Abliterated version of google/gemma-2-9b-it.
|
||||
|
||||
The abliteration script (link) is based on code from the blog post and heavily uses TransformerLens. The only major difference from the code used for Llama is scaling the embedding layer back.
|
||||
|
||||
Orthogonalization did not produce the same results as regular interventions since there are RMSNorm layers before merging activations into the residual stream. However, the final model still seems to be uncensored.
|
||||
overrides:
|
||||
parameters:
|
||||
model: gemma-2-9b-it-abliterated-Q4_K_M.gguf
|
||||
files:
|
||||
- filename: gemma-2-9b-it-abliterated-Q4_K_M.gguf
|
||||
sha256: 88d84ac9796732c10f6c58e0feb4db8e04c05d74bdb7047a5e37906a589896e1
|
||||
uri: huggingface://bartowski/gemma-2-9b-it-abliterated-GGUF/gemma-2-9b-it-abliterated-Q4_K_M.gguf
|
||||
- !!merge <<: *gemma
|
||||
name: "gemma-2-ataraxy-v3i-9b"
|
||||
urls:
|
||||
- https://huggingface.co/QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF
|
||||
description: |
|
||||
Gemma-2-Ataraxy-v3i-9B is an experimental model that replaces the simpo model in the original recipe with a different simpo model and a writing model trained on Gutenberg, using a higher density. It is a merge of pre-trained language models created using mergekit, with della merge method using unsloth/gemma-2-9b-it as the base. The models included in the merge are nbeerbower/Gemma2-Gutenberg-Doppel-9B, ifable/gemma-2-Ifable-9B, and wzhouad/gemma-2-9b-it-WPO-HB. It has been quantized using llama.cpp.
|
||||
overrides:
|
||||
parameters:
|
||||
model: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
|
||||
files:
|
||||
- filename: Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
|
||||
sha256: f14c5b9373d4058f0f812c6c34184addeb4aeeecb02a7bbcf9844d9afc8d0066
|
||||
uri: huggingface://QuantFactory/Gemma-2-Ataraxy-v3i-9B-GGUF/Gemma-2-Ataraxy-v3i-9B.Q4_K_M.gguf
|
||||
- &llama3
|
||||
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
|
||||
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
|
||||
|
||||
@@ -3,7 +3,6 @@ name: "moondream2"
|
||||
|
||||
|
||||
config_file: |
|
||||
backend: llama-cpp
|
||||
context_size: 2046
|
||||
roles:
|
||||
user: "\nQuestion: "
|
||||
|
||||
@@ -51,4 +51,6 @@ type Backend interface {
|
||||
StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts ...grpc.CallOption) (*pb.StoresFindResult, error)
|
||||
|
||||
Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.CallOption) (*pb.RerankResult, error)
|
||||
|
||||
GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error)
|
||||
}
|
||||
|
||||
@@ -41,6 +41,7 @@ func (llm *Base) Predict(opts *pb.PredictOptions) (string, error) {
|
||||
}
|
||||
|
||||
func (llm *Base) PredictStream(opts *pb.PredictOptions, results chan string) error {
|
||||
close(results)
|
||||
return fmt.Errorf("unimplemented")
|
||||
}
|
||||
|
||||
|
||||
@@ -374,3 +374,21 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.
|
||||
client := pb.NewBackendClient(conn)
|
||||
return client.Rerank(ctx, in, opts...)
|
||||
}
|
||||
|
||||
func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
|
||||
if !c.parallel {
|
||||
c.opMutex.Lock()
|
||||
defer c.opMutex.Unlock()
|
||||
}
|
||||
c.setBusy(true)
|
||||
defer c.setBusy(false)
|
||||
c.wdMark()
|
||||
defer c.wdUnMark()
|
||||
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer conn.Close()
|
||||
client := pb.NewBackendClient(conn)
|
||||
return client.GetMetrics(ctx, in, opts...)
|
||||
}
|
||||
|
||||
@@ -87,6 +87,10 @@ func (e *embedBackend) Rerank(ctx context.Context, in *pb.RerankRequest, opts ..
|
||||
return e.s.Rerank(ctx, in)
|
||||
}
|
||||
|
||||
func (e *embedBackend) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opts ...grpc.CallOption) (*pb.MetricsResponse, error) {
|
||||
return e.s.GetMetrics(ctx, in)
|
||||
}
|
||||
|
||||
type embedBackendServerStream struct {
|
||||
ctx context.Context
|
||||
fn func(s []byte)
|
||||
|
||||
@@ -28,7 +28,7 @@ var Aliases map[string]string = map[string]string{
|
||||
"langchain-huggingface": LCHuggingFaceBackend,
|
||||
}
|
||||
|
||||
var autoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
|
||||
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
|
||||
|
||||
const (
|
||||
LlamaGGML = "llama-ggml"
|
||||
@@ -62,7 +62,7 @@ func backendPath(assetDir, backend string) string {
|
||||
|
||||
// backendsInAssetDir returns the list of backends in the asset directory
|
||||
// that should be loaded
|
||||
func backendsInAssetDir(assetDir string) ([]string, error) {
|
||||
func backendsInAssetDir(assetDir string) (map[string][]string, error) {
|
||||
// Exclude backends from automatic loading
|
||||
excludeBackends := []string{LocalStoreBackend}
|
||||
entry, err := os.ReadDir(backendPath(assetDir, ""))
|
||||
@@ -86,7 +86,7 @@ ENTRY:
|
||||
|
||||
// Skip the llama.cpp variants if we are autoDetecting
|
||||
// But we always load the fallback variant if it exists
|
||||
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && autoDetect {
|
||||
if strings.Contains(e.Name(), LLamaCPP) && !strings.Contains(e.Name(), LLamaCPPFallback) && AutoDetect {
|
||||
continue
|
||||
}
|
||||
|
||||
@@ -94,7 +94,7 @@ ENTRY:
|
||||
}
|
||||
|
||||
// if we are autoDetecting, we want to show the llama.cpp variants as a single backend
|
||||
if autoDetect {
|
||||
if AutoDetect {
|
||||
// if we find the llama.cpp variants, show them of as a single backend (llama-cpp) as later we are going to pick that up
|
||||
// when starting the service
|
||||
foundLCPPAVX, foundLCPPAVX2, foundLCPPFallback, foundLCPPGRPC, foundLCPPCuda, foundLCPPHipblas, foundSycl16, foundSycl32 := false, false, false, false, false, false, false, false
|
||||
@@ -136,6 +136,10 @@ ENTRY:
|
||||
}
|
||||
}
|
||||
|
||||
return backends, nil
|
||||
}
|
||||
|
||||
func orderBackends(backends map[string][]string) ([]string, error) {
|
||||
// order backends from the asset directory.
|
||||
// as we scan for backends, we want to keep some order which backends are tried of.
|
||||
// for example, llama.cpp should be tried first, and we want to keep the huggingface backend at the last.
|
||||
@@ -181,8 +185,9 @@ ENTRY:
|
||||
return orderedBackends.Keys(), nil
|
||||
}
|
||||
|
||||
// selectGRPCProcess selects the GRPC process to start based on system capabilities
|
||||
func selectGRPCProcess(backend, assetDir string, f16 bool) string {
|
||||
// selectGRPCProcessByHostCapabilities selects the GRPC process to start based on system capabilities
|
||||
// Note: this is now relevant only for llama.cpp
|
||||
func selectGRPCProcessByHostCapabilities(backend, assetDir string, f16 bool) string {
|
||||
foundCUDA := false
|
||||
foundAMDGPU := false
|
||||
foundIntelGPU := false
|
||||
@@ -199,6 +204,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
|
||||
return backendPath(assetDir, LLamaCPPGRPC)
|
||||
}
|
||||
|
||||
// Check for GPU-binaries that are shipped with single binary releases
|
||||
gpus, err := xsysinfo.GPUs()
|
||||
if err == nil {
|
||||
for _, gpu := range gpus {
|
||||
@@ -243,35 +249,40 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
|
||||
return grpcProcess
|
||||
}
|
||||
|
||||
// No GPU found or no specific binaries found, try to load the CPU variant(s)
|
||||
|
||||
// Select the Fallback by default
|
||||
selectedProcess := backendPath(assetDir, LLamaCPPFallback)
|
||||
|
||||
// IF we find any optimized binary, we use that
|
||||
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
|
||||
p := backendPath(assetDir, LLamaCPPAVX2)
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
log.Info().Msgf("[%s] attempting to load with AVX2 variant", backend)
|
||||
grpcProcess = p
|
||||
selectedProcess = p
|
||||
}
|
||||
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
|
||||
p := backendPath(assetDir, LLamaCPPAVX)
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
log.Info().Msgf("[%s] attempting to load with AVX variant", backend)
|
||||
grpcProcess = p
|
||||
}
|
||||
} else {
|
||||
p := backendPath(assetDir, LLamaCPPFallback)
|
||||
if _, err := os.Stat(p); err == nil {
|
||||
log.Info().Msgf("[%s] attempting to load with fallback variant", backend)
|
||||
grpcProcess = p
|
||||
selectedProcess = p
|
||||
}
|
||||
}
|
||||
|
||||
return grpcProcess
|
||||
// Check if the binary exists!
|
||||
if _, err := os.Stat(selectedProcess); err == nil {
|
||||
return selectedProcess
|
||||
}
|
||||
|
||||
return ""
|
||||
}
|
||||
|
||||
// starts the grpcModelProcess for the backend, and returns a grpc client
|
||||
// It also loads the model
|
||||
func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string) (*Model, error) {
|
||||
return func(modelName, modelFile string) (*Model, error) {
|
||||
func (ml *ModelLoader) grpcModel(backend string, autodetect bool, o *Options) func(string, string, string) (*Model, error) {
|
||||
return func(modelID, modelName, modelFile string) (*Model, error) {
|
||||
|
||||
log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelName, modelFile, backend, *o)
|
||||
log.Debug().Msgf("Loading Model %s with gRPC (file: %s) (backend: %s): %+v", modelID, modelFile, backend, *o)
|
||||
|
||||
var client *Model
|
||||
|
||||
@@ -304,18 +315,19 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
return nil, fmt.Errorf("failed allocating free ports: %s", err.Error())
|
||||
}
|
||||
// Make sure the process is executable
|
||||
if err := ml.startProcess(uri, o.model, serverAddress); err != nil {
|
||||
process, err := ml.startProcess(uri, modelID, serverAddress)
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("path", uri).Msg("failed to launch ")
|
||||
return nil, err
|
||||
}
|
||||
|
||||
log.Debug().Msgf("GRPC Service Started")
|
||||
|
||||
client = NewModel(serverAddress)
|
||||
client = NewModel(modelID, serverAddress, process)
|
||||
} else {
|
||||
log.Debug().Msg("external backend is uri")
|
||||
log.Debug().Msg("external backend is a uri")
|
||||
// address
|
||||
client = NewModel(uri)
|
||||
client = NewModel(modelID, uri, nil)
|
||||
}
|
||||
} else {
|
||||
grpcProcess := backendPath(o.assetDir, backend)
|
||||
@@ -323,9 +335,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
return nil, fmt.Errorf("refering to a backend not in asset dir: %s", err.Error())
|
||||
}
|
||||
|
||||
if autoDetect {
|
||||
if autodetect {
|
||||
// autoDetect GRPC process to start based on system capabilities
|
||||
if selectedProcess := selectGRPCProcess(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
|
||||
if selectedProcess := selectGRPCProcessByHostCapabilities(backend, o.assetDir, o.gRPCOptions.F16Memory); selectedProcess != "" {
|
||||
grpcProcess = selectedProcess
|
||||
}
|
||||
}
|
||||
@@ -346,13 +358,14 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
args, grpcProcess = library.LoadLDSO(o.assetDir, args, grpcProcess)
|
||||
|
||||
// Make sure the process is executable in any circumstance
|
||||
if err := ml.startProcess(grpcProcess, o.model, serverAddress, args...); err != nil {
|
||||
process, err := ml.startProcess(grpcProcess, modelID, serverAddress, args...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
log.Debug().Msgf("GRPC Service Started")
|
||||
|
||||
client = NewModel(serverAddress)
|
||||
client = NewModel(modelID, serverAddress, process)
|
||||
}
|
||||
|
||||
log.Debug().Msgf("Wait for the service to start up")
|
||||
@@ -374,6 +387,9 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
|
||||
if !ready {
|
||||
log.Debug().Msgf("GRPC Service NOT ready")
|
||||
if process := client.Process(); process != nil {
|
||||
process.Stop()
|
||||
}
|
||||
return nil, fmt.Errorf("grpc service not ready")
|
||||
}
|
||||
|
||||
@@ -385,9 +401,15 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
|
||||
res, err := client.GRPC(o.parallelRequests, ml.wd).LoadModel(o.context, &options)
|
||||
if err != nil {
|
||||
if process := client.Process(); process != nil {
|
||||
process.Stop()
|
||||
}
|
||||
return nil, fmt.Errorf("could not load model: %w", err)
|
||||
}
|
||||
if !res.Success {
|
||||
if process := client.Process(); process != nil {
|
||||
process.Stop()
|
||||
}
|
||||
return nil, fmt.Errorf("could not load model (no success): %s", res.Message)
|
||||
}
|
||||
|
||||
@@ -396,17 +418,17 @@ func (ml *ModelLoader) grpcModel(backend string, o *Options) func(string, string
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) ListAvailableBackends(assetdir string) ([]string, error) {
|
||||
return backendsInAssetDir(assetdir)
|
||||
backends, err := backendsInAssetDir(assetdir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return orderBackends(backends)
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err error) {
|
||||
o := NewOptions(opts...)
|
||||
|
||||
if o.model != "" {
|
||||
log.Info().Msgf("Loading model '%s' with backend %s", o.model, o.backendString)
|
||||
} else {
|
||||
log.Info().Msgf("Loading model with backend %s", o.backendString)
|
||||
}
|
||||
log.Info().Msgf("Loading model '%s' with backend %s", o.modelID, o.backendString)
|
||||
|
||||
backend := strings.ToLower(o.backendString)
|
||||
if realBackend, exists := Aliases[backend]; exists {
|
||||
@@ -414,14 +436,7 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
|
||||
log.Debug().Msgf("%s is an alias of %s", backend, realBackend)
|
||||
}
|
||||
|
||||
if o.singleActiveBackend {
|
||||
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
|
||||
err := ml.StopGRPC(allExcept(o.model))
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel")
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
|
||||
|
||||
var backendToConsume string
|
||||
|
||||
@@ -433,38 +448,57 @@ func (ml *ModelLoader) BackendLoader(opts ...Option) (client grpc.Backend, err e
|
||||
backendToConsume = backend
|
||||
}
|
||||
|
||||
model, err := ml.LoadModel(o.model, ml.grpcModel(backendToConsume, o))
|
||||
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backendToConsume, AutoDetect, o))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
// XXX: This is too backend specific(llama-cpp), remove this bit or generalize further
|
||||
// We failed somehow starting the binary. For instance, could be that we are missing
|
||||
// some libraries if running in binary-only mode.
|
||||
// In this case, we attempt to load the model with the fallback variant.
|
||||
|
||||
// If not llama-cpp backend, return error immediately
|
||||
if backend != LLamaCPP {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
// Otherwise attempt with fallback
|
||||
log.Error().Msgf("[%s] Failed loading model, trying with fallback '%s'", backend, LLamaCPPFallback)
|
||||
model, err = ml.LoadModel(o.modelID, o.model, ml.grpcModel(LLamaCPPFallback, false, o))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
|
||||
return model.GRPC(o.parallelRequests, ml.wd), nil
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) stopActiveBackends(modelID string, singleActiveBackend bool) {
|
||||
// If we can have only one backend active, kill all the others (except external backends)
|
||||
if singleActiveBackend {
|
||||
log.Debug().Msgf("Stopping all backends except '%s'", modelID)
|
||||
err := ml.StopGRPC(allExcept(modelID))
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("keptModel", modelID).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
|
||||
o := NewOptions(opts...)
|
||||
|
||||
// Return earlier if we have a model already loaded
|
||||
// (avoid looping through all the backends)
|
||||
if m := ml.CheckIsLoaded(o.model); m != nil {
|
||||
log.Debug().Msgf("Model '%s' already loaded", o.model)
|
||||
if m := ml.CheckIsLoaded(o.modelID); m != nil {
|
||||
log.Debug().Msgf("Model '%s' already loaded", o.modelID)
|
||||
|
||||
return m.GRPC(o.parallelRequests, ml.wd), nil
|
||||
}
|
||||
|
||||
// If we can have only one backend active, kill all the others (except external backends)
|
||||
if o.singleActiveBackend {
|
||||
log.Debug().Msgf("Stopping all backends except '%s'", o.model)
|
||||
err := ml.StopGRPC(allExcept(o.model))
|
||||
if err != nil {
|
||||
log.Error().Err(err).Str("keptModel", o.model).Msg("error while shutting down all backends except for the keptModel - greedyloader continuing")
|
||||
}
|
||||
}
|
||||
ml.stopActiveBackends(o.modelID, o.singleActiveBackend)
|
||||
|
||||
var err error
|
||||
|
||||
// get backends embedded in the binary
|
||||
autoLoadBackends, err := backendsInAssetDir(o.assetDir)
|
||||
autoLoadBackends, err := ml.ListAvailableBackends(o.assetDir)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -476,23 +510,13 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
|
||||
|
||||
log.Debug().Msgf("Loading from the following backends (in order): %+v", autoLoadBackends)
|
||||
|
||||
if o.model != "" {
|
||||
log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.model, autoLoadBackends)
|
||||
}
|
||||
log.Info().Msgf("Trying to load the model '%s' with the backend '%s'", o.modelID, autoLoadBackends)
|
||||
|
||||
for _, key := range autoLoadBackends {
|
||||
log.Info().Msgf("[%s] Attempting to load", key)
|
||||
options := []Option{
|
||||
options := append(opts, []Option{
|
||||
WithBackendString(key),
|
||||
WithModel(o.model),
|
||||
WithLoadGRPCLoadModelOpts(o.gRPCOptions),
|
||||
WithThreads(o.threads),
|
||||
WithAssetDir(o.assetDir),
|
||||
}
|
||||
|
||||
for k, v := range o.externalBackends {
|
||||
options = append(options, WithExternalBackend(k, v))
|
||||
}
|
||||
}...)
|
||||
|
||||
model, modelerr := ml.BackendLoader(options...)
|
||||
if modelerr == nil && model != nil {
|
||||
@@ -505,39 +529,6 @@ func (ml *ModelLoader) GreedyLoader(opts ...Option) (grpc.Backend, error) {
|
||||
err = errors.Join(err, fmt.Errorf("backend %s returned no usable model", key))
|
||||
log.Info().Msgf("[%s] Fails: %s", key, "backend returned no usable model")
|
||||
}
|
||||
|
||||
if autoDetect && key == LLamaCPP && err != nil {
|
||||
// try as hard as possible to run the llama.cpp variants
|
||||
backendToUse := ""
|
||||
if xsysinfo.HasCPUCaps(cpuid.AVX2) {
|
||||
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
|
||||
backendToUse = LLamaCPPAVX2
|
||||
}
|
||||
} else if xsysinfo.HasCPUCaps(cpuid.AVX) {
|
||||
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPAVX2)); err == nil {
|
||||
backendToUse = LLamaCPPAVX
|
||||
}
|
||||
} else {
|
||||
if _, err := os.Stat(backendPath(o.assetDir, LLamaCPPFallback)); err == nil {
|
||||
backendToUse = LLamaCPPFallback
|
||||
} else {
|
||||
// If we don't have a fallback, just skip fallback
|
||||
continue
|
||||
}
|
||||
}
|
||||
|
||||
// Autodetection failed, try the fallback
|
||||
log.Info().Msgf("[%s] Autodetection failed, trying the fallback", key)
|
||||
options = append(options, WithBackendString(backendToUse))
|
||||
model, modelerr = ml.BackendLoader(options...)
|
||||
if modelerr == nil && model != nil {
|
||||
log.Info().Msgf("[%s] Loads OK", key)
|
||||
return model, nil
|
||||
} else {
|
||||
err = errors.Join(err, fmt.Errorf("[%s]: %w", key, modelerr))
|
||||
log.Info().Msgf("[%s] Fails: %s", key, modelerr.Error())
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return nil, fmt.Errorf("could not load model - all backends returned error: %s", err.Error())
|
||||
|
||||
@@ -13,7 +13,6 @@ import (

	"github.com/mudler/LocalAI/pkg/utils"

	process "github.com/mudler/go-processmanager"
	"github.com/rs/zerolog/log"
)

@@ -21,20 +20,18 @@ import (

// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
type ModelLoader struct {
	ModelPath     string
	mu            sync.Mutex
	models        map[string]*Model
	grpcProcesses map[string]*process.Process
	templates     *templates.TemplateCache
	wd            *WatchDog
	ModelPath string
	mu        sync.Mutex
	models    map[string]*Model
	templates *templates.TemplateCache
	wd        *WatchDog
}

func NewModelLoader(modelPath string) *ModelLoader {
	nml := &ModelLoader{
		ModelPath: modelPath,
		models:    make(map[string]*Model),
		templates: templates.NewTemplateCache(modelPath),
		grpcProcesses: make(map[string]*process.Process),
		ModelPath: modelPath,
		models:    make(map[string]*Model),
		templates: templates.NewTemplateCache(modelPath),
	}

	return nml
@@ -105,21 +102,21 @@ FILE:
	return models, nil
}

func (ml *ModelLoader) ListModels() []*Model {
func (ml *ModelLoader) ListModels() []Model {
	ml.mu.Lock()
	defer ml.mu.Unlock()

	models := []*Model{}
	models := []Model{}
	for _, model := range ml.models {
		models = append(models, model)
		models = append(models, *model)
	}

	return models
}

func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (*Model, error)) (*Model, error) {
func (ml *ModelLoader) LoadModel(modelID, modelName string, loader func(string, string, string) (*Model, error)) (*Model, error) {
	// Check if we already have a loaded model
	if model := ml.CheckIsLoaded(modelName); model != nil {
	if model := ml.CheckIsLoaded(modelID); model != nil {
		return model, nil
	}

@@ -127,18 +124,18 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
	modelFile := filepath.Join(ml.ModelPath, modelName)
	log.Debug().Msgf("Loading model in memory from file: %s", modelFile)

	model, err := loader(modelName, modelFile)
	ml.mu.Lock()
	defer ml.mu.Unlock()
	model, err := loader(modelID, modelName, modelFile)
	if err != nil {
		return nil, err
		return nil, fmt.Errorf("failed to load model with internal loader: %s", err)
	}

	if model == nil {
		return nil, fmt.Errorf("loader didn't return a model")
	}

	ml.mu.Lock()
	defer ml.mu.Unlock()
	ml.models[modelName] = model
	ml.models[modelID] = model

	return model, nil
}
@@ -146,14 +143,13 @@ func (ml *ModelLoader) LoadModel(modelName string, loader func(string, string) (
func (ml *ModelLoader) ShutdownModel(modelName string) error {
	ml.mu.Lock()
	defer ml.mu.Unlock()

	_, ok := ml.models[modelName]
	model, ok := ml.models[modelName]
	if !ok {
		return fmt.Errorf("model %s not found", modelName)
	}

	retries := 1
	for ml.models[modelName].GRPC(false, ml.wd).IsBusy() {
	for model.GRPC(false, ml.wd).IsBusy() {
		log.Debug().Msgf("%s busy. Waiting.", modelName)
		dur := time.Duration(retries*2) * time.Second
		if dur > retryTimeout {
@@ -161,6 +157,11 @@ func (ml *ModelLoader) ShutdownModel(modelName string) error {
		}
		time.Sleep(dur)
		retries++

		if retries > 10 && os.Getenv("LOCALAI_FORCE_BACKEND_SHUTDOWN") == "true" {
			log.Warn().Msgf("Model %s is still busy after %d retries. Forcing shutdown.", modelName, retries)
			break
		}
	}

	return ml.deleteProcess(modelName)
@@ -185,8 +186,8 @@ func (ml *ModelLoader) CheckIsLoaded(s string) *Model {
		if !alive {
			log.Warn().Msgf("GRPC Model not responding: %s", err.Error())
			log.Warn().Msgf("Deleting the process in order to recreate it")
			process, exists := ml.grpcProcesses[s]
			if !exists {
			process := m.Process()
			if process == nil {
				log.Error().Msgf("Process not found for '%s' and the model is not responding anymore !", s)
				return m
			}
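A minimal sketch of driving the updated LoadModel API, which now keys the in-memory cache by a caller-supplied model ID rather than the file name. The import path and signatures mirror the hunks above; the address, model names, and dummy loader are illustrative, not part of this changeset.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model"
)

func main() {
	ml := model.NewModelLoader("/models")

	// The loader callback now receives (modelID, modelName, modelFile).
	dummy := func(modelID, modelName, modelFile string) (*model.Model, error) {
		// A real loader would start a gRPC backend and return its handle;
		// here we just wrap an address with no managed process attached.
		return model.NewModel(modelID, "127.0.0.1:50051", nil), nil
	}

	m, err := ml.LoadModel("my-model-id", "ggml-model.bin", dummy)
	if err != nil {
		panic(err)
	}
	fmt.Println("loaded:", m.ID) // cached under "my-model-id", not the file name
}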
@@ -9,7 +9,7 @@ import (
type Options struct {
	backendString string
	model         string
	threads       uint32
	modelID       string
	assetDir      string
	context       context.Context

@@ -68,12 +68,6 @@ func WithLoadGRPCLoadModelOpts(opts *pb.ModelOptions) Option {
	}
}

func WithThreads(threads uint32) Option {
	return func(o *Options) {
		o.threads = threads
	}
}

func WithAssetDir(assetDir string) Option {
	return func(o *Options) {
		o.assetDir = assetDir
@@ -92,6 +86,12 @@ func WithSingleActiveBackend() Option {
	}
}

func WithModelID(id string) Option {
	return func(o *Options) {
		o.modelID = id
	}
}

func NewOptions(opts ...Option) *Options {
	o := &Options{
		gRPCOptions: &pb.ModelOptions{},
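A sketch of the functional-options set after this change: WithThreads is gone (the thread count presumably travels inside the gRPC ModelOptions passed via WithLoadGRPCLoadModelOpts) and WithModelID names the loaded instance. The backend string, paths, and values below are illustrative assumptions.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model"
)

func main() {
	ml := model.NewModelLoader("/models")

	// WithModelID identifies the instance independently of the model file;
	// whether the load succeeds depends on local backend assets.
	backend, err := ml.BackendLoader(
		model.WithBackendString("llama-cpp"),
		model.WithModel("ggml-model.bin"),
		model.WithModelID("my-model-id"),
		model.WithAssetDir("/tmp/localai/backend_data"),
	)
	if err != nil {
		fmt.Println("load failed:", err)
		return
	}
	fmt.Printf("backend ready: %T\n", backend)
}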
@@ -63,24 +63,24 @@ var _ = Describe("ModelLoader", func() {

	Context("LoadModel", func() {
		It("should load a model and keep it in memory", func() {
			mockModel = model.NewModel("test.model")
			mockModel = model.NewModel("foo", "test.model", nil)

			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
				return mockModel, nil
			}

			model, err := modelLoader.LoadModel("test.model", mockLoader)
			model, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
			Expect(err).To(BeNil())
			Expect(model).To(Equal(mockModel))
			Expect(modelLoader.CheckIsLoaded("test.model")).To(Equal(mockModel))
			Expect(modelLoader.CheckIsLoaded("foo")).To(Equal(mockModel))
		})

		It("should return an error if loading the model fails", func() {
			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
				return nil, errors.New("failed to load model")
			}

			model, err := modelLoader.LoadModel("test.model", mockLoader)
			model, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
			Expect(err).To(HaveOccurred())
			Expect(model).To(BeNil())
		})
@@ -88,18 +88,16 @@ var _ = Describe("ModelLoader", func() {

	Context("ShutdownModel", func() {
		It("should shutdown a loaded model", func() {
			mockModel = model.NewModel("test.model")

			mockLoader := func(modelName, modelFile string) (*model.Model, error) {
				return mockModel, nil
			mockLoader := func(modelID, modelName, modelFile string) (*model.Model, error) {
				return model.NewModel("foo", "test.model", nil), nil
			}

			_, err := modelLoader.LoadModel("test.model", mockLoader)
			_, err := modelLoader.LoadModel("foo", "test.model", mockLoader)
			Expect(err).To(BeNil())

			err = modelLoader.ShutdownModel("test.model")
			err = modelLoader.ShutdownModel("foo")
			Expect(err).To(BeNil())
			Expect(modelLoader.CheckIsLoaded("test.model")).To(BeNil())
			Expect(modelLoader.CheckIsLoaded("foo")).To(BeNil())
		})
	})
})
@@ -1,18 +1,32 @@
package model

import grpc "github.com/mudler/LocalAI/pkg/grpc"
import (
	"sync"

	grpc "github.com/mudler/LocalAI/pkg/grpc"
	process "github.com/mudler/go-processmanager"
)

type Model struct {
	ID      string `json:"id"`
	address string
	client  grpc.Backend
	process *process.Process
	sync.Mutex
}

func NewModel(address string) *Model {
func NewModel(ID, address string, process *process.Process) *Model {
	return &Model{
		ID:      ID,
		address: address,
		process: process,
	}
}

func (m *Model) Process() *process.Process {
	return m.process
}

func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
	if m.client != nil {
		return m.client
@@ -23,6 +37,8 @@ func (m *Model) GRPC(parallel bool, wd *WatchDog) grpc.Backend {
		enableWD = true
	}

	m.Lock()
	defer m.Unlock()
	m.client = grpc.NewClient(m.address, parallel, wd, enableWD)
	return m.client
}
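A small sketch of the reshaped Model handle: it now carries its ID and owns its process reference, so callers recover the process via Process() instead of a separate grpcProcesses map. The ID and address values are illustrative.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model"
)

func main() {
	// No process attached (nil), e.g. for an externally managed backend.
	m := model.NewModel("my-model-id", "127.0.0.1:50051", nil)

	fmt.Println(m.ID)               // "my-model-id"
	fmt.Println(m.Process() == nil) // true: nothing to stop on shutdown
}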
@@ -16,20 +16,36 @@ import (
)

func (ml *ModelLoader) deleteProcess(s string) error {
	if _, exists := ml.grpcProcesses[s]; exists {
		if err := ml.grpcProcesses[s].Stop(); err != nil {
			log.Error().Err(err).Msgf("(deleteProcess) error while deleting grpc process %s", s)
		}
	defer delete(ml.models, s)

	log.Debug().Msgf("Deleting process %s", s)

	m, exists := ml.models[s]
	if !exists {
		log.Error().Msgf("Model does not exist %s", s)
		// Nothing to do
		return nil
	}
	delete(ml.grpcProcesses, s)
	delete(ml.models, s)
	return nil

	process := m.Process()
	if process == nil {
		log.Error().Msgf("No process for %s", s)
		// Nothing to do as there is no process
		return nil
	}

	err := process.Stop()
	if err != nil {
		log.Error().Err(err).Msgf("(deleteProcess) error while deleting process %s", s)
	}

	return err
}

func (ml *ModelLoader) StopGRPC(filter GRPCProcessFilter) error {
	var err error = nil
	for k, p := range ml.grpcProcesses {
		if filter(k, p) {
	for k, m := range ml.models {
		if filter(k, m.Process()) {
			e := ml.ShutdownModel(k)
			err = errors.Join(err, e)
		}
@@ -44,17 +60,20 @@ func (ml *ModelLoader) StopAllGRPC() error {
func (ml *ModelLoader) GetGRPCPID(id string) (int, error) {
	ml.mu.Lock()
	defer ml.mu.Unlock()
	p, exists := ml.grpcProcesses[id]
	p, exists := ml.models[id]
	if !exists {
		return -1, fmt.Errorf("no grpc backend found for %s", id)
	}
	return strconv.Atoi(p.PID)
	if p.Process() == nil {
		return -1, fmt.Errorf("no grpc backend found for %s", id)
	}
	return strconv.Atoi(p.Process().PID)
}

func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string, args ...string) error {
func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string, args ...string) (*process.Process, error) {
	// Make sure the process is executable
	if err := os.Chmod(grpcProcess, 0700); err != nil {
		return err
		return nil, err
	}

	log.Debug().Msgf("Loading GRPC Process: %s", grpcProcess)
@@ -63,7 +82,7 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string

	workDir, err := filepath.Abs(filepath.Dir(grpcProcess))
	if err != nil {
		return err
		return nil, err
	}

	grpcControlProcess := process.New(
@@ -79,10 +98,8 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
		ml.wd.AddAddressModelMap(serverAddress, id)
	}

	ml.grpcProcesses[id] = grpcControlProcess

	if err := grpcControlProcess.Run(); err != nil {
		return err
		return grpcControlProcess, err
	}

	log.Debug().Msgf("GRPC Service state dir: %s", grpcControlProcess.StateDir())
@@ -116,5 +133,5 @@ func (ml *ModelLoader) startProcess(grpcProcess, id string, serverAddress string
	}
	}()

	return nil
	return grpcControlProcess, nil
}
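A sketch of the caller-facing effect: StopGRPC still takes a filter, but the process it receives now comes from each Model via Process() and may be nil. This assumes GRPCProcessFilter is a function over an ID and its *process.Process, as the hunk above suggests; the pinned ID is illustrative.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/model"
	process "github.com/mudler/go-processmanager"
)

func main() {
	ml := model.NewModelLoader("/models")

	keep := "my-model-id"
	err := ml.StopGRPC(func(id string, p *process.Process) bool {
		// p can be nil when a model has no managed process attached.
		return id != keep // stop everything except the pinned model
	})
	fmt.Println("stop result:", err)
}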
pkg/templates/multimodal.go (new file, 24 lines)
@@ -0,0 +1,24 @@
package templates

import (
	"bytes"
	"text/template"
)

func TemplateMultiModal(templateString string, templateID int, text string) (string, error) {
	// compile the template
	tmpl, err := template.New("template").Parse(templateString)
	if err != nil {
		return "", err
	}
	result := bytes.NewBuffer(nil)
	// execute the template
	err = tmpl.Execute(result, struct {
		ID   int
		Text string
	}{
		ID:   templateID,
		Text: text,
	})
	return result.String(), err
}
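A usage sketch for the new helper, with the signature taken from the file above. The template string follows the [img-N] placeholder convention used for multimodal prompts; the concrete values are illustrative.

package main

import (
	"fmt"

	"github.com/mudler/LocalAI/pkg/templates"
)

func main() {
	out, err := templates.TemplateMultiModal("[img-{{.ID}}]{{.Text}}", 2, "describe the image")
	if err != nil {
		panic(err)
	}
	fmt.Println(out) // [img-2]describe the image
}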
pkg/templates/multimodal_test.go (new file, 19 lines)
@@ -0,0 +1,19 @@
package templates_test

import (
	. "github.com/mudler/LocalAI/pkg/templates" // Update with your module path

	// Update with your module path
	. "github.com/onsi/ginkgo/v2"
	. "github.com/onsi/gomega"
)

var _ = Describe("EvaluateTemplate", func() {
	Context("templating simple strings for multimodal chat", func() {
		It("should template messages correctly", func() {
			result, err := TemplateMultiModal("[img-{{.ID}}]{{.Text}}", 1, "bar")
			Expect(err).NotTo(HaveOccurred())
			Expect(result).To(Equal("[img-1]bar"))
		})
	})
})
@@ -972,6 +972,14 @@ const docTemplate = `{
                }
            }
        },
        "model.Model": {
            "type": "object",
            "properties": {
                "id": {
                    "type": "string"
                }
            }
        },
        "openai.Assistant": {
            "type": "object",
            "properties": {
@@ -1682,6 +1690,12 @@ const docTemplate = `{
                    "items": {
                        "type": "string"
                    }
                },
                "loaded_models": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/model.Model"
                    }
                }
            }
        },
@@ -965,6 +965,14 @@
                }
            }
        },
        "model.Model": {
            "type": "object",
            "properties": {
                "id": {
                    "type": "string"
                }
            }
        },
        "openai.Assistant": {
            "type": "object",
            "properties": {
@@ -1675,6 +1683,12 @@
                    "items": {
                        "type": "string"
                    }
                },
                "loaded_models": {
                    "type": "array",
                    "items": {
                        "$ref": "#/definitions/model.Model"
                    }
                }
            }
        },
@@ -168,6 +168,11 @@ definitions:
        type: string
      type: array
    type: object
  model.Model:
    properties:
      id:
        type: string
    type: object
  openai.Assistant:
    properties:
      created:
@@ -652,6 +657,10 @@ definitions:
        items:
          type: string
        type: array
      loaded_models:
        items:
          $ref: '#/definitions/model.Model'
        type: array
    type: object
  schema.TTSRequest:
    description: TTS request body
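The swagger additions above document a loaded_models array of model.Model objects carrying an id. A small decoding sketch of just that fragment; the enclosing endpoint and response type are not shown in this diff, so the Go type names here are illustrative.

package main

import (
	"encoding/json"
	"fmt"
)

type loadedModel struct {
	ID string `json:"id"`
}

type systemResponse struct {
	LoadedModels []loadedModel `json:"loaded_models"`
}

func main() {
	raw := `{"loaded_models":[{"id":"my-model-id"}]}`
	var r systemResponse
	if err := json.Unmarshal([]byte(raw), &r); err != nil {
		panic(err)
	}
	fmt.Println(r.LoadedModels[0].ID) // my-model-id
}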
@@ -260,11 +260,9 @@ var _ = Describe("E2E test", func() {
			resp, err := http.Post(rerankerEndpoint, "application/json", bytes.NewReader(serialized))
			Expect(err).To(BeNil())
			Expect(resp).ToNot(BeNil())
			Expect(resp.StatusCode).To(Equal(200))

			body, err := io.ReadAll(resp.Body)
			Expect(err).To(BeNil())
			Expect(body).ToNot(BeNil())
			Expect(err).ToNot(HaveOccurred())
			Expect(resp.StatusCode).To(Equal(200), fmt.Sprintf("body: %s, response: %+v", body, resp))

			deserializedResponse := schema.JINARerankResponse{}
			err = json.Unmarshal(body, &deserializedResponse)