Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
a00bbfe3eb chore(model): add silero-vad model config
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2024-11-26 14:28:41 +01:00
55 changed files with 294 additions and 1159 deletions

5
.github/labeler.yml vendored
View File

@@ -1,11 +1,6 @@
enhancements:
- head-branch: ['^feature', 'feature']
dependencies:
- any:
- changed-files:
- any-glob-to-any-file: 'Makefile'
kind/documentation:
- any:
- changed-files:

View File

@@ -12,14 +12,23 @@ jobs:
- repository: "ggerganov/llama.cpp"
variable: "CPPLLAMA_VERSION"
branch: "master"
- repository: "go-skynet/go-ggml-transformers.cpp"
variable: "GOGGMLTRANSFORMERS_VERSION"
branch: "master"
- repository: "donomii/go-rwkv.cpp"
variable: "RWKV_VERSION"
branch: "main"
- repository: "ggerganov/whisper.cpp"
variable: "WHISPER_CPP_VERSION"
branch: "master"
- repository: "PABannier/bark.cpp"
variable: "BARKCPP_VERSION"
- repository: "go-skynet/go-bert.cpp"
variable: "BERT_VERSION"
branch: "master"
- repository: "go-skynet/bloomz.cpp"
variable: "BLOOMZ_VERSION"
branch: "main"
- repository: "leejet/stable-diffusion.cpp"
variable: "STABLEDIFFUSION_GGML_VERSION"
- repository: "mudler/go-ggllm.cpp"
variable: "GOGGLLM_VERSION"
branch: "master"
- repository: "mudler/go-stable-diffusion"
variable: "STABLEDIFFUSION_VERSION"

1
.gitignore vendored
View File

@@ -2,7 +2,6 @@
/sources/
__pycache__/
*.a
*.o
get-sources
prepare-sources
/backend/cpp/llama/grpc-server

104
Makefile
View File

@@ -8,12 +8,16 @@ DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=cc98896db858df7aa40d0e16a505883ef196a482
CPPLLAMA_VERSION?=47f931c8f9a26c072d71224bc8013cc66ea9e445
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
WHISPER_CPP_VERSION?=6266a9f9e56a5b925e9892acf650f3eb1245814d
# bert.cpp version
BERT_REPO?=https://github.com/go-skynet/go-bert.cpp
BERT_VERSION?=710044b124545415f555e4260d16b146c725a6e4
# go-piper version
PIPER_REPO?=https://github.com/mudler/go-piper
PIPER_VERSION?=e10ca041a885d4a8f3871d52924b47792d5e5aa0
@@ -26,14 +30,6 @@ STABLEDIFFUSION_VERSION?=4a3cd6aeae6f66ee57eae9a0075f8c58c3a6a38f
TINYDREAM_REPO?=https://github.com/M0Rf30/go-tiny-dream
TINYDREAM_VERSION?=c04fa463ace9d9a6464313aa5f9cd0f953b6c057
# bark.cpp
BARKCPP_REPO?=https://github.com/PABannier/bark.cpp.git
BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=4570715727f35e5a07a76796d823824c8f42206c
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
ONNX_OS?=linux
@@ -202,6 +198,7 @@ ifeq ($(findstring tts,$(GO_TAGS)),tts)
endif
ALL_GRPC_BACKENDS=backend-assets/grpc/huggingface
ALL_GRPC_BACKENDS+=backend-assets/grpc/bert-embeddings
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
@@ -209,14 +206,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
ifeq ($(ONNX_OS),linux)
ifeq ($(ONNX_ARCH),x64)
ALL_GRPC_BACKENDS+=backend-assets/grpc/bark-cpp
ALL_GRPC_BACKENDS+=backend-assets/grpc/stablediffusion-ggml
endif
endif
ALL_GRPC_BACKENDS+=backend-assets/grpc/local-store
ALL_GRPC_BACKENDS+=backend-assets/grpc/silero-vad
ALL_GRPC_BACKENDS+=$(OPTIONAL_GRPC)
@@ -239,6 +228,19 @@ endif
all: help
## BERT embeddings
sources/go-bert.cpp:
mkdir -p sources/go-bert.cpp
cd sources/go-bert.cpp && \
git init && \
git remote add origin $(BERT_REPO) && \
git fetch origin && \
git checkout $(BERT_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-bert.cpp/libgobert.a: sources/go-bert.cpp
$(MAKE) -C sources/go-bert.cpp libgobert.a
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
@@ -252,23 +254,6 @@ sources/go-llama.cpp:
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
cd sources/bark.cpp && \
git checkout $(BARKCPP_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/bark.cpp/build/libbark.a: sources/bark.cpp
cd sources/bark.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/bark/libbark.a: sources/bark.cpp/build/libbark.a
$(MAKE) -C backend/go/bark libbark.a
## go-piper
sources/go-piper:
mkdir -p sources/go-piper
@@ -282,7 +267,7 @@ sources/go-piper:
sources/go-piper/libpiper_binding.a: sources/go-piper
$(MAKE) -C sources/go-piper libpiper_binding.a example/main piper.o
## stable diffusion (onnx)
## stable diffusion
sources/go-stable-diffusion:
mkdir -p sources/go-stable-diffusion
cd sources/go-stable-diffusion && \
@@ -295,30 +280,6 @@ sources/go-stable-diffusion:
sources/go-stable-diffusion/libstablediffusion.a: sources/go-stable-diffusion
CPATH="$(CPATH):/usr/include/opencv4" $(MAKE) -C sources/go-stable-diffusion libstablediffusion.a
## stablediffusion (ggml)
sources/stablediffusion-ggml.cpp:
git clone --recursive $(STABLEDIFFUSION_GGML_REPO) sources/stablediffusion-ggml.cpp && \
cd sources/stablediffusion-ggml.cpp && \
git checkout $(STABLEDIFFUSION_GGML_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a: sources/stablediffusion-ggml.cpp
cd sources/stablediffusion-ggml.cpp && \
mkdir -p build && \
cd build && \
cmake $(CMAKE_ARGS) .. && \
cmake --build . --config Release
backend/go/image/stablediffusion-ggml/libsd.a: sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a
$(MAKE) -C backend/go/image/stablediffusion-ggml libsd.a
backend-assets/grpc/stablediffusion-ggml: backend/go/image/stablediffusion-ggml/libsd.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ LIBRARY_PATH=$(CURDIR)/backend/go/image/stablediffusion-ggml/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion-ggml ./backend/go/image/stablediffusion-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/stablediffusion-ggml
endif
sources/onnxruntime:
mkdir -p sources/onnxruntime
curl -L https://github.com/microsoft/onnxruntime/releases/download/v$(ONNX_VERSION)/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz -o sources/onnxruntime/onnxruntime-$(ONNX_OS)-$(ONNX_ARCH)-$(ONNX_VERSION).tgz
@@ -359,11 +320,12 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
get-sources: sources/go-llama.cpp sources/go-piper sources/whisper.cpp sources/go-bert.cpp sources/go-stable-diffusion sources/go-tiny-dream backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/go-skynet/go-bert.cpp=$(CURDIR)/sources/go-bert.cpp
$(GOCMD) mod edit -replace github.com/M0Rf30/go-tiny-dream=$(CURDIR)/sources/go-tiny-dream
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/mudler/go-stable-diffusion=$(CURDIR)/sources/go-stable-diffusion
@@ -372,6 +334,7 @@ replace:
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-bert.cpp
$(GOCMD) mod edit -dropreplace github.com/M0Rf30/go-tiny-dream
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/mudler/go-stable-diffusion
@@ -386,6 +349,7 @@ rebuild: ## Rebuilds the project
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-stable-diffusion clean
$(MAKE) -C sources/go-bert.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) -C sources/go-tiny-dream clean
$(MAKE) build
@@ -400,9 +364,7 @@ clean: ## Remove build related file
rm -rf release/
rm -rf backend-assets/*
$(MAKE) -C backend/cpp/grpc clean
$(MAKE) -C backend/go/bark clean
$(MAKE) -C backend/cpp/llama clean
$(MAKE) -C backend/go/image/stablediffusion-ggml clean
rm -rf backend/cpp/llama-* || true
$(MAKE) dropreplace
$(MAKE) protogen-clean
@@ -745,6 +707,13 @@ backend-assets/espeak-ng-data: sources/go-piper sources/go-piper/libpiper_bindin
backend-assets/grpc: protogen-go replace
mkdir -p backend-assets/grpc
backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/bert-embeddings
endif
backend-assets/grpc/huggingface: backend-assets/grpc
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
ifneq ($(UPX),)
@@ -804,6 +773,10 @@ backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc backend/cpp/llama/ll
$(info ${GREEN}I llama-cpp build info:fallback${RESET})
CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
# TODO: every binary should have its own folder instead, so can have different metal implementations
ifeq ($(BUILD_TYPE),metal)
cp backend/cpp/llama-fallback/llama.cpp/build/bin/ggml-metal.metal backend-assets/grpc/
endif
backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc backend/cpp/llama/llama.cpp
cp -rf backend/cpp/llama backend/cpp/llama-cuda
@@ -851,13 +824,6 @@ ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/bark-cpp
endif
backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/

View File

@@ -92,8 +92,6 @@ local-ai run oci://localai/phi-2:latest
## 📰 Latest project news
- Dec 2024: stablediffusion.cpp backend (ggml) added ( https://github.com/mudler/LocalAI/pull/4289 )
- Nov 2024: Bark.cpp backend added ( https://github.com/mudler/LocalAI/pull/4287 )
- Nov 2024: Voice activity detection models (**VAD**) added to the API: https://github.com/mudler/LocalAI/pull/4204
- Oct 2024: examples moved to [LocalAI-examples](https://github.com/mudler/LocalAI-examples)
- Aug 2024: 🆕 FLUX-1, [P2P Explorer](https://explorer.localai.io)

View File

@@ -1,7 +1,7 @@
name: text-embedding-ada-002
embeddings: true
backend: bert-embeddings
parameters:
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
model: huggingface://mudler/all-MiniLM-L6-v2/ggml-model-q4_0.bin
usage: |
You can test this model with curl like this:

View File

@@ -240,8 +240,6 @@ message ModelOptions {
repeated string LoraAdapters = 60;
repeated float LoraScales = 61;
repeated string Options = 62;
}
message Result {

View File

@@ -30,7 +30,9 @@ else ifeq ($(OS),Darwin)
CMAKE_ARGS+=-DGGML_METAL=OFF
else
CMAKE_ARGS+=-DGGML_METAL=ON
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
# Until this is tested properly, we disable embedded metal file
# as we already embed it as part of the LocalAI assets
CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
TARGET+=--target ggml-metal
endif
endif

View File

@@ -1,25 +0,0 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
BUILD_TYPE?=
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../sources/bark.cpp/examples -I$(INCLUDE_PATH)/../../../sources/bark.cpp/spm-headers -I$(INCLUDE_PATH)/../../../sources/bark.cpp -O3 -DNDEBUG -std=c++17 -fPIC
LDFLAGS = -L$(LIBRARY_PATH) -L$(LIBRARY_PATH)/../../../sources/bark.cpp/build/examples -lbark -lstdc++ -lm
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
gobark.o:
$(CXX) $(CXXFLAGS) gobark.cpp -o gobark.o -c $(LDFLAGS)
libbark.a: gobark.o
cp $(INCLUDE_PATH)/../../../sources/bark.cpp/build/libbark.a ./
$(AR) rcs libbark.a gobark.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-alloc.c.o
$(AR) rcs libbark.a $(LIBRARY_PATH)/../../../sources/bark.cpp/build/encodec.cpp/ggml/src/CMakeFiles/ggml.dir/ggml-backend.c.o
clean:
rm -f gobark.o libbark.a

View File

@@ -1,85 +0,0 @@
#include <iostream>
#include <tuple>
#include "bark.h"
#include "gobark.h"
#include "common.h"
#include "ggml.h"
struct bark_context *c;
void bark_print_progress_callback(struct bark_context *bctx, enum bark_encoding_step step, int progress, void *user_data) {
if (step == bark_encoding_step::SEMANTIC) {
printf("\rGenerating semantic tokens... %d%%", progress);
} else if (step == bark_encoding_step::COARSE) {
printf("\rGenerating coarse tokens... %d%%", progress);
} else if (step == bark_encoding_step::FINE) {
printf("\rGenerating fine tokens... %d%%", progress);
}
fflush(stdout);
}
int load_model(char *model) {
// initialize bark context
struct bark_context_params ctx_params = bark_context_default_params();
bark_params params;
params.model_path = model;
// ctx_params.verbosity = verbosity;
ctx_params.progress_callback = bark_print_progress_callback;
ctx_params.progress_callback_user_data = nullptr;
struct bark_context *bctx = bark_load_model(params.model_path.c_str(), ctx_params, params.seed);
if (!bctx) {
fprintf(stderr, "%s: Could not load model\n", __func__);
return 1;
}
c = bctx;
return 0;
}
int tts(char *text,int threads, char *dst ) {
ggml_time_init();
const int64_t t_main_start_us = ggml_time_us();
// generate audio
if (!bark_generate_audio(c, text, threads)) {
fprintf(stderr, "%s: An error occured. If the problem persists, feel free to open an issue to report it.\n", __func__);
return 1;
}
const float *audio_data = bark_get_audio_data(c);
if (audio_data == NULL) {
fprintf(stderr, "%s: Could not get audio data\n", __func__);
return 1;
}
const int audio_arr_size = bark_get_audio_data_size(c);
std::vector<float> audio_arr(audio_data, audio_data + audio_arr_size);
write_wav_on_disk(audio_arr, dst);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
const int64_t t_load_us = bark_get_load_time(c);
const int64_t t_eval_us = bark_get_eval_time(c);
printf("\n\n");
printf("%s: load time = %8.2f ms\n", __func__, t_load_us / 1000.0f);
printf("%s: eval time = %8.2f ms\n", __func__, t_eval_us / 1000.0f);
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us) / 1000.0f);
}
return 0;
}
int unload() {
bark_free(c);
}

View File

@@ -1,52 +0,0 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../sources/bark.cpp/ -I${SRCDIR}/../../../sources/bark.cpp/encodec.cpp -I${SRCDIR}/../../../sources/bark.cpp/examples -I${SRCDIR}/../../../sources/bark.cpp/spm-headers
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../sources/bark.cpp/build/examples -L${SRCDIR}/../../../sources/bark.cpp/build/encodec.cpp/ -lbark -lencodec -lcommon
// #include <gobark.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Bark struct {
base.SingleThread
threads int
}
func (sd *Bark) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
ret := C.load_model(modelFile)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}
func (sd *Bark) TTS(opts *pb.TTSRequest) error {
t := C.CString(opts.Text)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
threads := C.int(sd.threads)
ret := C.tts(t, threads, dst)
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}

View File

@@ -1,8 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model);
int tts(char *text,int threads, char *dst );
#ifdef __cplusplus
}
#endif

View File

@@ -1,21 +0,0 @@
INCLUDE_PATH := $(abspath ./)
LIBRARY_PATH := $(abspath ./)
AR?=ar
BUILD_TYPE?=
# keep standard at C11 and C++11
CXXFLAGS = -I. -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/ggml/include -I$(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp -O3 -DNDEBUG -std=c++17 -fPIC
# warnings
CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function
gosd.o:
$(CXX) $(CXXFLAGS) gosd.cpp -o gosd.o -c
libsd.a: gosd.o
cp $(INCLUDE_PATH)/../../../../sources/stablediffusion-ggml.cpp/build/libstable-diffusion.a ./libsd.a
$(AR) rcs libsd.a gosd.o
clean:
rm -f gosd.o libsd.a

View File

@@ -1,228 +0,0 @@
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <iostream>
#include <random>
#include <string>
#include <vector>
#include "gosd.h"
// #include "preprocessing.hpp"
#include "flux.hpp"
#include "stable-diffusion.h"
#define STB_IMAGE_IMPLEMENTATION
#define STB_IMAGE_STATIC
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
#define STB_IMAGE_WRITE_STATIC
#include "stb_image_write.h"
#define STB_IMAGE_RESIZE_IMPLEMENTATION
#define STB_IMAGE_RESIZE_STATIC
#include "stb_image_resize.h"
// Names of the sampler method, same order as enum sample_method in stable-diffusion.h
const char* sample_method_str[] = {
"euler_a",
"euler",
"heun",
"dpm2",
"dpm++2s_a",
"dpm++2m",
"dpm++2mv2",
"ipndm",
"ipndm_v",
"lcm",
};
// Names of the sigma schedule overrides, same order as sample_schedule in stable-diffusion.h
const char* schedule_str[] = {
"default",
"discrete",
"karras",
"exponential",
"ays",
"gits",
};
sd_ctx_t* sd_c;
sample_method_t sample_method;
int load_model(char *model, char* options[], int threads, int diff) {
fprintf (stderr, "Loading model!\n");
char *stableDiffusionModel = "";
if (diff == 1 ) {
stableDiffusionModel = model;
model = "";
}
// decode options. Options are in form optname:optvale, or if booleans only optname.
char *clip_l_path = "";
char *clip_g_path = "";
char *t5xxl_path = "";
char *vae_path = "";
char *scheduler = "";
char *sampler = "";
// If options is not NULL, parse options
for (int i = 0; options[i] != NULL; i++) {
char *optname = strtok(options[i], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "clip_l_path")) {
clip_l_path = optval;
}
if (!strcmp(optname, "clip_g_path")) {
clip_g_path = optval;
}
if (!strcmp(optname, "t5xxl_path")) {
t5xxl_path = optval;
}
if (!strcmp(optname, "vae_path")) {
vae_path = optval;
}
if (!strcmp(optname, "scheduler")) {
scheduler = optval;
}
if (!strcmp(optname, "sampler")) {
sampler = optval;
}
}
int sample_method_found = -1;
for (int m = 0; m < N_SAMPLE_METHODS; m++) {
if (!strcmp(sampler, sample_method_str[m])) {
sample_method_found = m;
}
}
if (sample_method_found == -1) {
fprintf(stderr, "Invalid sample method, default to EULER_A!\n");
sample_method_found = EULER_A;
}
sample_method = (sample_method_t)sample_method_found;
int schedule_found = -1;
for (int d = 0; d < N_SCHEDULES; d++) {
if (!strcmp(scheduler, schedule_str[d])) {
schedule_found = d;
fprintf (stderr, "Found scheduler: %s\n", scheduler);
}
}
if (schedule_found == -1) {
fprintf (stderr, "Invalid scheduler! using DEFAULT\n");
schedule_found = DEFAULT;
}
schedule_t schedule = (schedule_t)schedule_found;
fprintf (stderr, "Creating context\n");
sd_ctx_t* sd_ctx = new_sd_ctx(model,
clip_l_path,
clip_g_path,
t5xxl_path,
stableDiffusionModel,
vae_path,
"",
"",
"",
"",
"",
false,
false,
false,
threads,
SD_TYPE_COUNT,
STD_DEFAULT_RNG,
schedule,
false,
false,
false,
false);
if (sd_ctx == NULL) {
fprintf (stderr, "failed loading model (generic error)\n");
return 1;
}
fprintf (stderr, "Created context: OK\n");
sd_c = sd_ctx;
return 0;
}
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed , char *dst, float cfg_scale) {
sd_image_t* results;
std::vector<int> skip_layers = {7, 8, 9};
fprintf (stderr, "Generating image\n");
results = txt2img(sd_c,
text,
negativeText,
-1, //clip_skip
cfg_scale, // sfg_scale
3.5f,
width,
height,
sample_method,
steps,
seed,
1,
NULL,
0.9f,
20.f,
false,
"",
skip_layers.data(),
skip_layers.size(),
0,
0.01,
0.2);
if (results == NULL) {
fprintf (stderr, "NO results\n");
return 1;
}
if (results[0].data == NULL) {
fprintf (stderr, "Results with no data\n");
return 1;
}
fprintf (stderr, "Writing PNG\n");
fprintf (stderr, "DST: %s\n", dst);
fprintf (stderr, "Width: %d\n", results[0].width);
fprintf (stderr, "Height: %d\n", results[0].height);
fprintf (stderr, "Channel: %d\n", results[0].channel);
fprintf (stderr, "Data: %p\n", results[0].data);
stbi_write_png(dst, results[0].width, results[0].height, results[0].channel,
results[0].data, 0, NULL);
fprintf (stderr, "Saved resulting image to '%s'\n", dst);
// TODO: free results. Why does it crash?
free(results[0].data);
results[0].data = NULL;
free(results);
fprintf (stderr, "gen_image is done", dst);
return 0;
}
int unload() {
free_sd_ctx(sd_c);
}

View File

@@ -1,96 +0,0 @@
package main
// #cgo CXXFLAGS: -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/thirdparty -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp -I${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/ggml/include
// #cgo LDFLAGS: -L${SRCDIR}/ -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src/ggml-cpu -L${SRCDIR}/../../../../sources/stablediffusion-ggml.cpp/build/ggml/src -lsd -lstdc++ -lm -lggml -lggml-base -lggml-cpu -lgomp
// #include <gosd.h>
// #include <stdlib.h>
import "C"
import (
"fmt"
"os"
"path/filepath"
"strings"
"unsafe"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
"github.com/mudler/LocalAI/pkg/utils"
)
type SDGGML struct {
base.SingleThread
threads int
sampleMethod string
cfgScale float32
}
func (sd *SDGGML) Load(opts *pb.ModelOptions) error {
sd.threads = int(opts.Threads)
modelFile := C.CString(opts.ModelFile)
defer C.free(unsafe.Pointer(modelFile))
var options **C.char
// prepare the options array to pass to C
size := C.size_t(unsafe.Sizeof((*C.char)(nil)))
length := C.size_t(len(opts.Options))
options = (**C.char)(C.malloc(length * size))
view := (*[1 << 30]*C.char)(unsafe.Pointer(options))[0:len(opts.Options):len(opts.Options)]
var diffusionModel int
var oo []string
for _, op := range opts.Options {
if op == "diffusion_model" {
diffusionModel = 1
continue
}
// If it's an option path, we resolve absolute path from the model path
if strings.Contains(op, ":") && strings.Contains(op, "path") {
data := strings.Split(op, ":")
data[1] = filepath.Join(opts.ModelPath, data[1])
if err := utils.VerifyPath(data[1], opts.ModelPath); err == nil {
oo = append(oo, strings.Join(data, ":"))
}
} else {
oo = append(oo, op)
}
}
fmt.Fprintf(os.Stderr, "Options: %+v\n", oo)
for i, x := range oo {
view[i] = C.CString(x)
}
sd.cfgScale = opts.CFGScale
ret := C.load_model(modelFile, options, C.int(opts.Threads), C.int(diffusionModel))
if ret != 0 {
return fmt.Errorf("could not load model")
}
return nil
}
func (sd *SDGGML) GenerateImage(opts *pb.GenerateImageRequest) error {
t := C.CString(opts.PositivePrompt)
defer C.free(unsafe.Pointer(t))
dst := C.CString(opts.Dst)
defer C.free(unsafe.Pointer(dst))
negative := C.CString(opts.NegativePrompt)
defer C.free(unsafe.Pointer(negative))
ret := C.gen_image(t, negative, C.int(opts.Width), C.int(opts.Height), C.int(opts.Step), C.int(opts.Seed), dst, C.float(sd.cfgScale))
if ret != 0 {
return fmt.Errorf("inference failed")
}
return nil
}

View File

@@ -1,8 +0,0 @@
#ifdef __cplusplus
extern "C" {
#endif
int load_model(char *model, char* options[], int threads, int diffusionModel);
int gen_image(char *text, char *negativeText, int width, int height, int steps, int seed, char *dst, float cfg_scale);
#ifdef __cplusplus
}
#endif

View File

@@ -1,20 +0,0 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &SDGGML{}); err != nil {
panic(err)
}
}

View File

@@ -0,0 +1,34 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
bert "github.com/go-skynet/go-bert.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type Embeddings struct {
base.SingleThread
bert *bert.Bert
}
func (llm *Embeddings) Load(opts *pb.ModelOptions) error {
model, err := bert.New(opts.ModelFile)
llm.bert = model
return err
}
func (llm *Embeddings) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.bert.TokenEmbeddings(tokens, bert.SetThreads(int(opts.Threads)))
}
return llm.bert.Embeddings(opts.Embeddings, bert.SetThreads(int(opts.Threads)))
}

View File

@@ -1,6 +1,7 @@
package main
// Note: this is started internally by LocalAI and a server is allocated for each model
import (
"flag"
@@ -14,7 +15,7 @@ var (
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &Bark{}); err != nil {
if err := grpc.StartServer(*addr, &Embeddings{}); err != nil {
panic(err)
}
}

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
transformers

View File

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
grpcio-tools

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
packaging==24.1

View File

@@ -1,5 +1,5 @@
setuptools
grpcio==1.68.1
grpcio==1.68.0
pillow
protobuf
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
wheel

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi

View File

@@ -2,7 +2,7 @@
intel-extension-for-pytorch
torch
optimum[openvino]
grpcio==1.68.1
grpcio==1.68.0
protobuf
librosa==0.9.1
faster-whisper==0.9.0

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
librosa
faster-whisper

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.68.0
certifi
llvmlite==0.43.0

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
datasets

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
scipy==1.14.0
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406

View File

@@ -1,3 +1,3 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi

View File

@@ -22,7 +22,7 @@ if [ "x${BUILD_TYPE}" == "x" ] && [ "x${FROM_SOURCE}" == "xtrue" ]; then
git clone https://github.com/vllm-project/vllm
fi
pushd vllm
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.1 protobuf bitsandbytes
uv pip install wheel packaging ninja "setuptools>=49.4.0" numpy typing-extensions pillow setuptools-scm grpcio==1.68.0 protobuf bitsandbytes
uv pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
VLLM_TARGET_DEVICE=cpu python setup.py install
popd

View File

@@ -1,4 +1,4 @@
grpcio==1.68.1
grpcio==1.68.0
protobuf
certifi
setuptools

View File

@@ -122,7 +122,7 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.CFGScale,
CFGScale: c.Diffusers.CFGScale,
LoraAdapter: c.LoraAdapter,
LoraScale: c.LoraScale,
LoraAdapters: c.LoraAdapters,
@@ -132,7 +132,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
IMG2IMG: c.Diffusers.IMG2IMG,
CLIPModel: c.Diffusers.ClipModel,
CLIPSubfolder: c.Diffusers.ClipSubFolder,
Options: c.Options,
CLIPSkip: int32(c.Diffusers.ClipSkip),
ControlNet: c.Diffusers.ControlNet,
ContextSize: int32(ctxSize),

View File

@@ -72,8 +72,6 @@ type BackendConfig struct {
Description string `yaml:"description"`
Usage string `yaml:"usage"`
Options []string `yaml:"options"`
}
type File struct {
@@ -99,15 +97,16 @@ type GRPC struct {
}
type Diffusers struct {
CUDA bool `yaml:"cuda"`
PipelineType string `yaml:"pipeline_type"`
SchedulerType string `yaml:"scheduler_type"`
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
ClipModel string `yaml:"clip_model"` // Clip model to use
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
ControlNet string `yaml:"control_net"`
CUDA bool `yaml:"cuda"`
PipelineType string `yaml:"pipeline_type"`
SchedulerType string `yaml:"scheduler_type"`
EnableParameters string `yaml:"enable_parameters"` // A list of comma separated parameters to specify
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
IMG2IMG bool `yaml:"img2img"` // Image to Image Diffuser
ClipSkip int `yaml:"clip_skip"` // Skip every N frames
ClipModel string `yaml:"clip_model"` // Clip model to use
ClipSubFolder string `yaml:"clip_subfolder"` // Subfolder to use for clip model
ControlNet string `yaml:"control_net"`
}
// LLMConfig is a struct that holds the configuration that are
@@ -165,8 +164,6 @@ type LLMConfig struct {
YarnAttnFactor float32 `yaml:"yarn_attn_factor"`
YarnBetaFast float32 `yaml:"yarn_beta_fast"`
YarnBetaSlow float32 `yaml:"yarn_beta_slow"`
CFGScale float32 `yaml:"cfg_scale"` // Classifier-Free Guidance Scale
}
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend

View File

@@ -12,8 +12,6 @@ import (
"gopkg.in/yaml.v3"
)
const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
var _ = Describe("Model test", func() {
Context("Downloading", func() {
@@ -49,7 +47,7 @@ var _ = Describe("Model test", func() {
gallery := []GalleryModel{{
Name: "bert",
URL: bertEmbeddingsURL,
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
}}
out, err := yaml.Marshal(gallery)
Expect(err).ToNot(HaveOccurred())
@@ -68,7 +66,7 @@ var _ = Describe("Model test", func() {
Expect(err).ToNot(HaveOccurred())
Expect(len(models)).To(Equal(1))
Expect(models[0].Name).To(Equal("bert"))
Expect(models[0].URL).To(Equal(bertEmbeddingsURL))
Expect(models[0].URL).To(Equal("https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml"))
Expect(models[0].Installed).To(BeFalse())
err = InstallModelFromGallery(galleries, "test@bert", tempdir, GalleryModel{}, func(s1, s2, s3 string, f float64) {}, true)
@@ -80,7 +78,7 @@ var _ = Describe("Model test", func() {
content := map[string]interface{}{}
err = yaml.Unmarshal(dat, &content)
Expect(err).ToNot(HaveOccurred())
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
Expect(content["backend"]).To(Equal("bert-embeddings"))
models, err = AvailableGalleryModels(galleries, tempdir)
Expect(err).ToNot(HaveOccurred())

View File

@@ -5,12 +5,14 @@ import (
"context"
"embed"
"encoding/json"
"errors"
"fmt"
"io"
"net/http"
"os"
"path/filepath"
"runtime"
"strings"
"github.com/mudler/LocalAI/core/config"
. "github.com/mudler/LocalAI/core/http"
@@ -238,8 +240,6 @@ func postInvalidRequest(url string) (error, int) {
return nil, resp.StatusCode
}
const bertEmbeddingsURL = `https://gist.githubusercontent.com/mudler/0a080b166b87640e8644b09c2aee6e3b/raw/f0e8c26bb72edc16d9fbafbfd6638072126ff225/bert-embeddings-gallery.yaml`
//go:embed backend-assets/*
var backendAssets embed.FS
@@ -279,13 +279,13 @@ var _ = Describe("API test", func() {
g := []gallery.GalleryModel{
{
Name: "bert",
URL: bertEmbeddingsURL,
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
},
{
Name: "bert2",
URL: bertEmbeddingsURL,
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
Overrides: map[string]interface{}{"foo": "bar"},
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml"}},
},
}
out, err := yaml.Marshal(g)
@@ -383,7 +383,7 @@ var _ = Describe("API test", func() {
content := map[string]interface{}{}
err = yaml.Unmarshal(dat, &content)
Expect(err).ToNot(HaveOccurred())
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
Expect(content["backend"]).To(Equal("bert-embeddings"))
Expect(content["foo"]).To(Equal("bar"))
models, err = getModels("http://127.0.0.1:9090/models/available")
@@ -402,7 +402,7 @@ var _ = Describe("API test", func() {
It("overrides models", func() {
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: bertEmbeddingsURL,
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
Name: "bert",
Overrides: map[string]interface{}{
"backend": "llama",
@@ -451,7 +451,7 @@ var _ = Describe("API test", func() {
})
It("apply models without overrides", func() {
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: bertEmbeddingsURL,
URL: "https://raw.githubusercontent.com/go-skynet/model-gallery/main/bert-embeddings.yaml",
Name: "bert",
Overrides: map[string]interface{}{},
})
@@ -471,7 +471,7 @@ var _ = Describe("API test", func() {
content := map[string]interface{}{}
err = yaml.Unmarshal(dat, &content)
Expect(err).ToNot(HaveOccurred())
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
Expect(content["backend"]).To(Equal("bert-embeddings"))
})
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
@@ -806,7 +806,7 @@ var _ = Describe("API test", func() {
It("returns the models list", func() {
models, err := client.ListModels(context.TODO())
Expect(err).ToNot(HaveOccurred())
Expect(len(models.Models)).To(Equal(7)) // If "config.yaml" should be included, this should be 8?
Expect(len(models.Models)).To(Equal(6)) // If "config.yaml" should be included, this should be 8?
})
It("can generate completions via ggml", func() {
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "testmodel.ggml", Prompt: testPrompt})
@@ -866,8 +866,8 @@ var _ = Describe("API test", func() {
},
)
Expect(err).ToNot(HaveOccurred(), err)
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 2048))
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 2048))
Expect(len(resp.Data[0].Embedding)).To(BeNumerically("==", 384))
Expect(len(resp.Data[1].Embedding)).To(BeNumerically("==", 384))
sunEmbedding := resp.Data[0].Embedding
resp2, err := client.CreateEmbeddings(
@@ -911,6 +911,71 @@ var _ = Describe("API test", func() {
})
})
Context("backends", func() {
It("runs rwkv completion", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,"})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices) > 0).To(BeTrue())
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
stream, err := client.CreateCompletionStream(context.TODO(), openai.CompletionRequest{
Model: "rwkv_test", Prompt: "Count up to five: one, two, three, four,", Stream: true,
})
Expect(err).ToNot(HaveOccurred())
defer stream.Close()
tokens := 0
text := ""
for {
response, err := stream.Recv()
if errors.Is(err, io.EOF) {
break
}
Expect(err).ToNot(HaveOccurred())
text += response.Choices[0].Text
tokens++
}
Expect(text).ToNot(BeEmpty())
Expect(text).To(ContainSubstring("five"))
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
It("runs rwkv chat completion", func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
resp, err := client.CreateChatCompletion(context.TODO(),
openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices) > 0).To(BeTrue())
Expect(strings.ToLower(resp.Choices[0].Message.Content)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
stream, err := client.CreateChatCompletionStream(context.TODO(), openai.ChatCompletionRequest{Model: "rwkv_test", Messages: []openai.ChatCompletionMessage{{Content: "Can you count up to five?", Role: "user"}}})
Expect(err).ToNot(HaveOccurred())
defer stream.Close()
tokens := 0
text := ""
for {
response, err := stream.Recv()
if errors.Is(err, io.EOF) {
break
}
Expect(err).ToNot(HaveOccurred())
text += response.Choices[0].Delta.Content
tokens++
}
Expect(text).ToNot(BeEmpty())
Expect(strings.ToLower(text)).To(Or(ContainSubstring("sure"), ContainSubstring("five")))
Expect(tokens).ToNot(Or(Equal(1), Equal(0)))
})
})
// See tests/integration/stores_test
Context("Stores", Label("stores"), func() {

View File

@@ -27,6 +27,39 @@ embeddings: true
# .. other parameters
```
## Bert embeddings
To use `bert.cpp` models you can use the `bert` embedding backend.
An example model config file:
```yaml
name: text-embedding-ada-002
parameters:
model: bert
backend: bert-embeddings
embeddings: true
# .. other parameters
```
The `bert` backend uses [bert.cpp](https://github.com/skeskinen/bert.cpp) and uses `ggml` models.
For instance you can download the `ggml` quantized version of `all-MiniLM-L6-v2` from https://huggingface.co/skeskinen/ggml:
```bash
wget https://huggingface.co/skeskinen/ggml/resolve/main/all-MiniLM-L6-v2/ggml-model-q4_0.bin -O models/bert
```
To test locally (LocalAI server running on `localhost`),
you can use `curl` (and `jq` at the end to prettify):
```bash
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "text-embedding-ada-002"
}' | jq "."
```
## Huggingface embeddings
To use `sentence-transformers` and models in `huggingface` you can use the `sentencetransformers` embedding backend.
@@ -54,26 +87,17 @@ The `sentencetransformers` backend uses Python [sentence-transformers](https://g
## Llama.cpp embeddings
Embeddings with `llama.cpp` are supported with the `llama-cpp` backend, it needs to be enabled with `embeddings` set to `true`.
Embeddings with `llama.cpp` are supported with the `llama` backend.
```yaml
name: my-awesome-model
backend: llama-cpp
backend: llama
embeddings: true
parameters:
model: ggml-file.bin
# ...
```
Then you can use the API to generate embeddings:
```bash
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "My text",
"model": "my-awesome-model"
}' | jq "."
```
## 💡 Examples
- Example that uses LLamaIndex and LocalAI as embedding: [here](https://github.com/go-skynet/LocalAI/tree/master/examples/query_data/).

View File

@@ -194,9 +194,8 @@ diffusers:
pipeline_type: StableDiffusionPipeline
enable_parameters: "negative_prompt,num_inference_steps,clip_skip"
scheduler_type: "k_dpmpp_sde"
cfg_scale: 8
clip_skip: 11
cfg_scale: 8
```
#### Configuration parameters
@@ -303,8 +302,7 @@ cuda: true
diffusers:
pipeline_type: StableDiffusionDepth2ImgPipeline
enable_parameters: "negative_prompt,num_inference_steps,image"
cfg_scale: 6
cfg_scale: 6
```
```bash

View File

@@ -300,7 +300,7 @@ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
```bash
curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
"id": "bert-embeddings",
"url": "github:mudler/LocalAI/gallery/bert-embeddings.yaml",
"name": "text-embedding-ada-002"
}'
```

View File

@@ -0,0 +1,23 @@
backend: bert-embeddings
embeddings: true
f16: true
gpu_layers: 90
mmap: true
name: bert-cpp-minilm-v6
parameters:
model: bert-MiniLM-L6-v2q4_0.bin
download_files:
- filename: "bert-MiniLM-L6-v2q4_0.bin"
sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad"
uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin"
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "bert-cpp-minilm-v6"
}'

View File

@@ -0,0 +1,12 @@
---
name: "bert-embeddings"
config_file: |
parameters:
model: bert-MiniLM-L6-v2q4_0.bin
backend: bert-embeddings
embeddings: true
files:
- filename: "bert-MiniLM-L6-v2q4_0.bin"
sha256: "a5a174d8772c8a569faf9f3136c441f2c3855b5bf35ed32274294219533feaad"
uri: "https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin"

View File

@@ -1,12 +0,0 @@
---
name: "flux-ggml"
config_file: |
backend: stablediffusion-ggml
step: 25
options:
- "diffusion_model"
- "clip_l_path:clip_l.safetensors"
- "t5xxl_path:t5xxl_fp16.safetensors"
- "vae_path:ae.safetensors"
- "sampler:euler"

View File

@@ -11,5 +11,4 @@ config_file: |
cuda: true
enable_parameters: num_inference_steps
pipeline_type: FluxPipeline
cfg_scale: 0
cfg_scale: 0

View File

@@ -1,27 +1,4 @@
---
- &rwkv
url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
name: "rwkv-6-world-7b"
license: apache-2.0
urls:
- https://huggingface.co/RWKV/rwkv-6-world-7b
- https://huggingface.co/bartowski/rwkv-6-world-7b-GGUF
tags:
- llm
- rwkv
- cpu
- gpu
- rnn
description: |
RWKV (pronounced RwaKuv) is an RNN with GPT-level LLM performance, and can also be directly trained like a GPT transformer (parallelizable). We are at RWKV-7.
So it's combining the best of RNN and transformer - great performance, fast inference, fast training, saves VRAM, "infinite" ctxlen, and free text embedding. Moreover it's 100% attention-free, and a Linux Foundation AI project.
overrides:
parameters:
model: rwkv-6-world-7b-Q4_K_M.gguf
files:
- filename: rwkv-6-world-7b-Q4_K_M.gguf
sha256: f74574186fa4584f405e92198605680db6ad00fd77974ffa14bf02073bb90273
uri: huggingface://bartowski/rwkv-6-world-7b-GGUF/rwkv-6-world-7b-Q4_K_M.gguf
- &qwen25coder
name: "qwen2.5-coder-14b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
@@ -380,7 +357,6 @@
urls:
- https://huggingface.co/hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF
overrides:
embeddings: true
parameters:
model: llama-3.2-1b-instruct-q4_k_m.gguf
files:
@@ -704,31 +680,6 @@
- filename: Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf
uri: huggingface://QuantFactory/Llama-Sentient-3.2-3B-Instruct-GGUF/Llama-Sentient-3.2-3B-Instruct.Q4_K_M.gguf
sha256: 3f855ce0522bfdc39fc826162ba6d89f15cc3740c5207da10e70baa3348b7812
- !!merge <<: *llama32
name: "llama-smoltalk-3.2-1b-instruct"
urls:
- https://huggingface.co/prithivMLmods/Llama-SmolTalk-3.2-1B-Instruct
- https://huggingface.co/mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF
description: |
The Llama-SmolTalk-3.2-1B-Instruct model is a lightweight, instruction-tuned model designed for efficient text generation and conversational AI tasks. With a 1B parameter architecture, this model strikes a balance between performance and resource efficiency, making it ideal for applications requiring concise, contextually relevant outputs. The model has been fine-tuned to deliver robust instruction-following capabilities, catering to both structured and open-ended queries.
Key Features:
Instruction-Tuned Performance: Optimized to understand and execute user-provided instructions across diverse domains.
Lightweight Architecture: With just 1 billion parameters, the model provides efficient computation and storage without compromising output quality.
Versatile Use Cases: Suitable for tasks like content generation, conversational interfaces, and basic problem-solving.
Intended Applications:
Conversational AI: Engage users with dynamic and contextually aware dialogue.
Content Generation: Produce summaries, explanations, or other creative text outputs efficiently.
Instruction Execution: Follow user commands to generate precise and relevant responses.
overrides:
parameters:
model: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf
files:
- filename: Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf
sha256: 03d8d05e3821f4caa65defa82baaff658484d4405b66546431528153ceef4d9e
uri: huggingface://mradermacher/Llama-SmolTalk-3.2-1B-Instruct-GGUF/Llama-SmolTalk-3.2-1B-Instruct.Q4_K_M.gguf
- &qwen25
## Qwen2.5
name: "qwen2.5-14b-instruct"
@@ -1620,177 +1571,6 @@
- filename: SteyrCannon-0.2-Qwen2.5-72b.Q4_K_M.gguf
sha256: b34c08b77ffd25ccb0ca50b167f2215e784689205c93a0903fa9435b6cc187f0
uri: huggingface://mradermacher/SteyrCannon-0.2-Qwen2.5-72b-GGUF/SteyrCannon-0.2-Qwen2.5-72b.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "q2.5-ms-mistoria-72b-v2"
icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/5LOvUFYiMMw6pcEsOhmo2.webp
urls:
- https://huggingface.co/Steelskull/Q2.5-MS-Mistoria-72b-v2
- https://huggingface.co/bartowski/Q2.5-MS-Mistoria-72b-v2-GGUF
description: |
This model is my second attempt at a 72b model, as usual, my goal is to merge the robust storytelling of mutiple models while attempting to maintain intelligence.
models:
- model: EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2
- model: ZeusLabs/Chronos-Platinum-72B
- model: shuttleai/shuttle-3
overrides:
parameters:
model: Q2.5-MS-Mistoria-72b-v2-Q4_K_M.gguf
files:
- filename: Q2.5-MS-Mistoria-72b-v2-Q4_K_M.gguf
sha256: 33df8aac5a790d1c286fe0fc4f9d340311f282eca19b78db6f7abb845923425c
uri: huggingface://bartowski/Q2.5-MS-Mistoria-72b-v2-GGUF/Q2.5-MS-Mistoria-72b-v2-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "eva-qwen2.5-72b-v0.2"
urls:
- https://huggingface.co/EVA-UNIT-01/EVA-Qwen2.5-72B-v0.2
- https://huggingface.co/bartowski/EVA-Qwen2.5-72B-v0.2-GGUF
description: |
A RP/storywriting specialist model, full-parameter finetune of Qwen2.5-72B on mixture of synthetic and natural data.
It uses Celeste 70B 0.1 data mixture, greatly expanding it to improve versatility, creativity and "flavor" of the resulting model.
Version notes for 0.2: Optimized training hyperparameters and increased sequence length. Better instruction following deeper into context and less repetition.
overrides:
parameters:
model: EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf
files:
- filename: EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf
sha256: 03ea0ecac3ee24a332ca43cf925b669c58714b9754be0f4bc232bd996681ef4b
uri: huggingface://bartowski/EVA-Qwen2.5-72B-v0.2-GGUF/EVA-Qwen2.5-72B-v0.2-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwq-32b-preview"
urls:
- https://huggingface.co/Qwen/QwQ-32B-Preview
- https://huggingface.co/bartowski/QwQ-32B-Preview-GGUF
description: |
QwQ-32B-Preview is an experimental research model developed by the Qwen Team, focused on advancing AI reasoning capabilities. As a preview release, it demonstrates promising analytical abilities while having several important limitations:
Language Mixing and Code-Switching: The model may mix languages or switch between them unexpectedly, affecting response clarity.
Recursive Reasoning Loops: The model may enter circular reasoning patterns, leading to lengthy responses without a conclusive answer.
Safety and Ethical Considerations: The model requires enhanced safety measures to ensure reliable and secure performance, and users should exercise caution when deploying it.
Performance and Benchmark Limitations: The model excels in math and coding but has room for improvement in other areas, such as common sense reasoning and nuanced language understanding.
overrides:
parameters:
model: QwQ-32B-Preview-Q4_K_M.gguf
files:
- filename: QwQ-32B-Preview-Q4_K_M.gguf
sha256: c499801e682e2379528090c50e106837ca1d69dc3bf3ff3a9af830a0eb49cdf6
uri: huggingface://bartowski/QwQ-32B-Preview-GGUF/QwQ-32B-Preview-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "q2.5-32b-slush-i1"
urls:
- https://huggingface.co/crestf411/Q2.5-32B-Slush
- https://huggingface.co/mradermacher/Q2.5-32B-Slush-i1-GGUF
description: |
Slush is a two-stage model trained with high LoRA dropout, where stage 1 is a pretraining continuation on the base model, aimed at boosting the model's creativity and writing capabilities. This is then merged into the instruction tune model, and stage 2 is a fine tuning step on top of this to further enhance its roleplaying capabilities and/or to repair any damage caused in the stage 1 merge.
This is still early stage. As always, feedback is welcome, and begone if you demand perfection.
The second stage, like the Sunfall series, follows the Silly Tavern preset (ChatML), so ymmv in particular if you use some other tool and/or preset.
overrides:
parameters:
model: Q2.5-32B-Slush.i1-Q4_K_M.gguf
files:
- filename: Q2.5-32B-Slush.i1-Q4_K_M.gguf
sha256: 95aecaf43077dabc72d3b556923ede2563325e1c89863800229cfa8b7f1c9659
uri: huggingface://mradermacher/Q2.5-32B-Slush-i1-GGUF/Q2.5-32B-Slush.i1-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwestion-24b"
urls:
- https://huggingface.co/CultriX/Qwestion-14B
- https://huggingface.co/mradermacher/Qwestion-24B-GGUF
description: |
This model was merged using the DARE TIES merge method using Qwen/Qwen2.5-14B as a base.
The following models were included in the merge:
allknowingroger/Qwenslerp2-14B
rombodawg/Rombos-LLM-V2.6-Qwen-14b
VAGOsolutions/SauerkrautLM-v2-14b-DPO
CultriX/Qwen2.5-14B-Wernicke
overrides:
parameters:
model: Qwestion-24B.Q4_K_M.gguf
files:
- filename: Qwestion-24B.Q4_K_M.gguf
sha256: 5d493bd81cfeef66d80101260145ab1d1d0428ef2191edce62b58391bd0fff0e
uri: huggingface://mradermacher/Qwestion-24B-GGUF/Qwestion-24B.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "teleut-7b"
icon: https://cdn-uploads.huggingface.co/production/uploads/634262af8d8089ebaefd410e/UqIi8eztdptvt52Mak_1K.png
urls:
- https://huggingface.co/allura-org/Teleut-7b
- https://huggingface.co/QuantFactory/Teleut-7b-GGUF
description: |
A replication attempt of Tulu 3 on the Qwen 2.5 base models.
overrides:
parameters:
model: Teleut-7b.Q4_K_M.gguf
files:
- filename: Teleut-7b.Q4_K_M.gguf
sha256: 844a633ea01d793c638e99f2e07413606b3812b759e9264fbaf69c8d94eaa093
uri: huggingface://QuantFactory/Teleut-7b-GGUF/Teleut-7b.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-7b-homercreative-mix"
urls:
- https://huggingface.co/ZeroXClem/Qwen2.5-7B-HomerCreative-Mix
- https://huggingface.co/QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF
description: |
ZeroXClem/Qwen2.5-7B-HomerCreative-Mix is an advanced language model meticulously crafted by merging four pre-trained models using the powerful mergekit framework. This fusion leverages the Model Stock merge method to combine the creative prowess of Qandora, the instructive capabilities of Qwen-Instruct-Fusion, the sophisticated blending of HomerSlerp1, and the foundational conversational strengths of Homer-v0.5-Qwen2.5-7B. The resulting model excels in creative text generation, contextual understanding, and dynamic conversational interactions.
🚀 Merged Models
This model merge incorporates the following:
bunnycore/Qandora-2.5-7B-Creative: Specializes in creative text generation, enhancing the model's ability to produce imaginative and diverse content.
bunnycore/Qwen2.5-7B-Instruct-Fusion: Focuses on instruction-following capabilities, improving the model's performance in understanding and executing user commands.
allknowingroger/HomerSlerp1-7B: Utilizes spherical linear interpolation (SLERP) to blend model weights smoothly, ensuring a harmonious integration of different model attributes.
newsbang/Homer-v0.5-Qwen2.5-7B: Acts as the foundational conversational model, providing robust language comprehension and generation capabilities.
overrides:
parameters:
model: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf
files:
- filename: Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf
sha256: fc3fdb41e068646592f89a8ae62d7b330f2bd4e97bf615aef2977930977c8ba5
uri: huggingface://QuantFactory/Qwen2.5-7B-HomerCreative-Mix-GGUF/Qwen2.5-7B-HomerCreative-Mix.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "cybercore-qwen-2.1-7b"
urls:
- https://huggingface.co/bunnycore/CyberCore-Qwen-2.1-7B
- https://huggingface.co/QuantFactory/CyberCore-Qwen-2.1-7B-GGUF
description: |
This model was merged using the TIES merge method using rombodawg/Rombos-LLM-V2.5-Qwen-7b as a base.
Models Merged
fblgit/cybertron-v4-qw7B-UNAMGS + bunnycore/Qwen-2.1-7b-Persona-lora_model
fblgit/cybertron-v4-qw7B-MGS + bunnycore/Qwen-2.1-7b-Persona-lora_model
overrides:
parameters:
model: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf
files:
- filename: CyberCore-Qwen-2.1-7B.Q4_K_M.gguf
sha256: 726042707a4cec29ca0355b4dc7c53a807b307d08aa8a3d4a9e76aefbbbcaadf
uri: huggingface://QuantFactory/CyberCore-Qwen-2.1-7B-GGUF/CyberCore-Qwen-2.1-7B.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "homercreativeanvita-mix-qw7b"
icon: https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B/resolve/main/HomerCreativeAnvita.jpeg
urls:
- https://huggingface.co/suayptalha/HomerCreativeAnvita-Mix-Qw7B
- https://huggingface.co/QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF
description: |
This model is currently ranked #1 on the Open LLM Leaderboard among models up to 13B parameters!
Merge Method
This model was merged using the SLERP merge method.
Models Merged
The following models were included in the merge:
ZeroXClem/Qwen2.5-7B-HomerAnvita-NerdMix
ZeroXClem/Qwen2.5-7B-HomerCreative-Mix
overrides:
parameters:
model: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf
files:
- filename: HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf
sha256: a356f279a104bff0bbc2ef7ec136c1e774153de8893bf988083e96fb7f4bc053
uri: huggingface://QuantFactory/HomerCreativeAnvita-Mix-Qw7B-GGUF/HomerCreativeAnvita-Mix-Qw7B.Q4_K_M.gguf
- &archfunct
license: apache-2.0
tags:
@@ -3353,73 +3133,6 @@
- filename: Tulu-3.1-8B-SuperNova.i1-Q4_K_M.gguf
sha256: c6cc2e1a4c3d2338973ca0050af1cf4462b3f62838f62b4c8a204f2a74eeb01f
uri: huggingface://mradermacher/Tulu-3.1-8B-SuperNova-i1-GGUF/Tulu-3.1-8B-SuperNova.i1-Q4_K_M.gguf
- !!merge <<: *llama31
name: "llama-3.1-tulu-3-70b-dpo"
icon: "https://huggingface.co/datasets/allenai/blog-images/resolve/main/tulu3/Tulu3-logo.png"
urls:
- https://huggingface.co/allenai/Llama-3.1-Tulu-3-70B-DPO
- https://huggingface.co/bartowski/Llama-3.1-Tulu-3-70B-DPO-GGUF
description: |
Tülu3 is a leading instruction following model family, offering fully open-source data, code, and recipes designed to serve as a comprehensive guide for modern post-training techniques. Tülu3 is designed for state-of-the-art performance on a diversity of tasks in addition to chat, such as MATH, GSM8K, and IFEval.
overrides:
parameters:
model: Llama-3.1-Tulu-3-70B-DPO-Q4_K_M.gguf
files:
- filename: Llama-3.1-Tulu-3-70B-DPO-Q4_K_M.gguf
sha256: e2d9c59736274f9dd94f30ef3edcee68fec1d6649eb01d6bad7e3e8a6024f77d
uri: huggingface://bartowski/Llama-3.1-Tulu-3-70B-DPO-GGUF/Llama-3.1-Tulu-3-70B-DPO-Q4_K_M.gguf
- !!merge <<: *llama31
name: "llama-3.1-tulu-3-8b-sft"
icon: "https://huggingface.co/datasets/allenai/blog-images/resolve/main/tulu3/Tulu3-logo.png"
urls:
- https://huggingface.co/allenai/Llama-3.1-Tulu-3-8B-SFT
- https://huggingface.co/bartowski/Llama-3.1-Tulu-3-8B-SFT-GGUF
description: |
Tülu3 is a leading instruction following model family, offering fully open-source data, code, and recipes designed to serve as a comprehensive guide for modern post-training techniques. Tülu3 is designed for state-of-the-art performance on a diversity of tasks in addition to chat, such as MATH, GSM8K, and IFEval.
overrides:
parameters:
model: Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf
files:
- filename: Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf
sha256: 3fad2c96aa9b9de19c2cda0f88a381c47ac768ca03a95059d9f6c439791f8592
uri: huggingface://bartowski/Llama-3.1-Tulu-3-8B-SFT-GGUF/Llama-3.1-Tulu-3-8B-SFT-Q4_K_M.gguf
- !!merge <<: *llama31
icon: https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B/resolve/main/misc/misc_fig.jpg
name: "skywork-o1-open-llama-3.1-8b"
urls:
- https://huggingface.co/Skywork/Skywork-o1-Open-Llama-3.1-8B
- https://huggingface.co/QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF
description: |
We are excited to announce the release of the Skywork o1 Open model series, developed by the Skywork team at Kunlun Inc. This groundbreaking release introduces a series of models that incorporate o1-like slow thinking and reasoning capabilities. The Skywork o1 Open model series includes three advanced models:
Skywork o1 Open-Llama-3.1-8B: A robust chat model trained on Llama-3.1-8B, enhanced significantly with "o1-style" data to improve reasoning skills.
Skywork o1 Open-PRM-Qwen-2.5-1.5B: A specialized model designed to enhance reasoning capability through incremental process rewards, ideal for complex problem solving at a smaller scale.
Skywork o1 Open-PRM-Qwen-2.5-7B: Extends the capabilities of the 1.5B model by scaling up to handle more demanding reasoning tasks, pushing the boundaries of AI reasoning.
Different from mere reproductions of the OpenAI o1 model, the Skywork o1 Open model series not only exhibits innate thinking, planning, and reflecting capabilities in its outputs, but also shows significant improvements in reasoning skills on standard benchmarks. This series represents a strategic advancement in AI capabilities, moving a previously weaker base model towards the state-of-the-art (SOTA) in reasoning tasks.
overrides:
parameters:
model: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf
files:
- filename: Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf
sha256: ef6a203ba585aab14f5d2ec463917a45b3ac571abd89c39e9a96a5e395ea8eea
uri: huggingface://QuantFactory/Skywork-o1-Open-Llama-3.1-8B-GGUF/Skywork-o1-Open-Llama-3.1-8B.Q4_K_M.gguf
- !!merge <<: *llama31
name: "sparse-llama-3.1-8b-2of4"
urls:
- https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF
- https://huggingface.co/QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF
description: |
This is the 2:4 sparse version of Llama-3.1-8B. On the OpenLLM benchmark (version 1), it achieves an average score of 62.16, compared to 63.19 for the dense model—demonstrating a 98.37% accuracy recovery. On the Mosaic Eval Gauntlet benchmark (version v0.3), it achieves an average score of 53.85, versus 55.34 for the dense model—representing a 97.3% accuracy recovery.
overrides:
parameters:
model: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf
files:
- filename: Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf
sha256: c481e7089ffaedd5ae8c74dccc7fb45f6509640b661fa086ae979f6fefc3fdba
uri: huggingface://QuantFactory/Sparse-Llama-3.1-8B-2of4-GGUF/Sparse-Llama-3.1-8B-2of4.Q4_K_M.gguf
- &deepseek
## Deepseek
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
@@ -4213,23 +3926,6 @@
- filename: magnum-12b-v2.5-kto.i1-Q4_K_M.gguf
sha256: 07e91d2c6d4e42312e65a69c54f16be467575f7a596fe052993b388e38b90d76
uri: huggingface://mradermacher/magnum-12b-v2.5-kto-i1-GGUF/magnum-12b-v2.5-kto.i1-Q4_K_M.gguf
- !!merge <<: *mistral03
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "chatty-harry_v3.0"
icon: https://cdn-uploads.huggingface.co/production/uploads/66c1cc08453a7ef6c5fe657a/0KzNTEtn2kJJQsw4lQeY0.png
urls:
- https://huggingface.co/Triangle104/Chatty-Harry_V3.0
- https://huggingface.co/QuantFactory/Chatty-Harry_V3.0-GGUF
description: |
This model was merged using the TIES merge method using Triangle104/ChatWaifu_Magnum_V0.2 as a base.
The following models were included in the merge: elinas/Chronos-Gold-12B-1.0
overrides:
parameters:
model: Chatty-Harry_V3.0.Q4_K_M.gguf
files:
- filename: Chatty-Harry_V3.0.Q4_K_M.gguf
sha256: 54b63bb74498576ca77b801ed096657a93cc2f6b71d707c3605fdb394bd3e622
uri: huggingface://QuantFactory/Chatty-Harry_V3.0-GGUF/Chatty-Harry_V3.0.Q4_K_M.gguf
- &mudler
### START mudler's LocalAI specific-models
url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -4985,39 +4681,6 @@
- filename: G2-9B-Sugarquill-v0.Q4_K_M.gguf
sha256: 790a2f1541011b2773e22aa863ef78c8662baaa7eca5875e9573007985120187
uri: huggingface://QuantFactory/G2-9B-Sugarquill-v0-GGUF/G2-9B-Sugarquill-v0.Q4_K_M.gguf
- !!merge <<: *gemma
name: "volare-i1"
urls:
- https://huggingface.co/MoxoffSpA/Volare
- https://huggingface.co/mradermacher/Volare-i1-GGUF
description: |
Volare is an updated version of Gemma7B, specifically fine-tuned with SFT and LoRA adjustments.
It's trained on publicly available datasets, like SQUAD-it, and datasets we've created in-house.
it's designed to understand and maintain context, making it ideal for Retrieval Augmented Generation (RAG) tasks and applications requiring contextual awareness.
Italian dataset.
overrides:
parameters:
model: Volare.i1-Q4_K_M.gguf
files:
- filename: Volare.i1-Q4_K_M.gguf
sha256: fa8fb9d4cb19fcb44be8d53561c9e2840f45aed738de545983ebb158ebba461b
uri: huggingface://mradermacher/Volare-i1-GGUF/Volare.i1-Q4_K_M.gguf
- !!merge <<: *gemma
name: "bggpt-gemma-2-2.6b-it-v1.0"
icon: https://cdn-uploads.huggingface.co/production/uploads/637e1f8cf7e01589cc17bf7e/p6d0YFHjWCQ3S12jWqO1m.png
urls:
- https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF
- https://huggingface.co/QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF
description: |
INSAIT introduces BgGPT-Gemma-2-2.6B-IT-v1.0, a state-of-the-art Bulgarian language model based on google/gemma-2-2b and google/gemma-2-2b-it. BgGPT-Gemma-2-2.6B-IT-v1.0 is free to use and distributed under the Gemma Terms of Use. This model was created by INSAIT, part of Sofia University St. Kliment Ohridski, in Sofia, Bulgaria.
The model was built on top of Googles Gemma 2 2B open models. It was continuously pre-trained on around 100 billion tokens (85 billion in Bulgarian) using the Branch-and-Merge strategy INSAIT presented at EMNLP24, allowing the model to gain outstanding Bulgarian cultural and linguistic capabilities while retaining its English performance. During the pre-training stage, we use various datasets, including Bulgarian web crawl data, freely available datasets such as Wikipedia, a range of specialized Bulgarian datasets sourced by the INSAIT Institute, and machine translations of popular English datasets. The model was then instruction-fine-tuned on a newly constructed Bulgarian instruction dataset created using real-world conversations. For more information check our blogpost.
overrides:
parameters:
model: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf
files:
- filename: BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf
sha256: 1e92fe80ccad80e97076ee26b002c2280f075dfe2507d534b46a4391a077f319
uri: huggingface://QuantFactory/BgGPT-Gemma-2-2.6B-IT-v1.0-GGUF/BgGPT-Gemma-2-2.6B-IT-v1.0.Q4_K_M.gguf
- &llama3
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -7643,27 +7306,6 @@
- filename: LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf
sha256: 5b88fb4537339996c04e4a1b6ef6a2d555c4103b6378e273ae9c6c5e77af67eb
uri: huggingface://bartowski/LLAMA-3_8B_Unaligned_BETA-GGUF/LLAMA-3_8B_Unaligned_BETA-Q4_K_M.gguf
- !!merge <<: *llama3
name: "freyja-v4.95-maldv-7b-non-fiction-i1"
urls:
- https://huggingface.co/MrRobotoAI/Freyja-v4.95-maldv-7b-NON-FICTION
- https://huggingface.co/mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF
description: |
This model was merged using the Model Stock merge method using aifeifei798/llama3-8B-DarkIdol-2.2-Uncensored-1048K as a base.
The following models were included in the merge:
maldv/llama-3-fantasy-writer-8b
maldv/badger-iota-llama-3-8b
maldv/badger-lambda-llama-3-8b
maldv/badger-mu-llama-3-8b
maldv/badger-kappa-llama-3-8b
maldv/badger-writer-llama-3-8b
overrides:
parameters:
model: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf
files:
- filename: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf
sha256: cdc0f4de6df2ba120835fbd25c2a0ae2af8548f46d2c40c7a018c51c3d19e0c0
uri: huggingface://mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF/Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf
- &chatml
### ChatML
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
@@ -8906,43 +8548,6 @@
overrides:
parameters:
model: black-forest-labs/FLUX.1-schnell
- name: flux.1-dev-ggml
license: flux-1-dev-non-commercial-license
url: "github:mudler/LocalAI/gallery/flux-ggml.yaml@master"
description: |
FLUX.1 [dev] is a 12 billion parameter rectified flow transformer capable of generating images from text descriptions. For more information, please read our blog post.
Key Features
Cutting-edge output quality, second only to our state-of-the-art model FLUX.1 [pro].
Competitive prompt following, matching the performance of closed source alternatives .
Trained using guidance distillation, making FLUX.1 [dev] more efficient.
Open weights to drive new scientific research, and empower artists to develop innovative workflows.
Generated outputs can be used for personal, scientific, and commercial purposes as described in the flux-1-dev-non-commercial-license.
This model is quantized with GGUF
urls:
- https://huggingface.co/black-forest-labs/FLUX.1-dev
- https://huggingface.co/city96/FLUX.1-dev-gguf
tags:
- text-to-image
- flux
- gpu
- cpu
icon: https://huggingface.co/black-forest-labs/FLUX.1-schnell/resolve/main/schnell_grid.jpeg
overrides:
parameters:
model: flux1-dev-Q2_K.gguf
files:
- filename: "flux1-dev-Q2_K.gguf"
sha256: "b8c464bc0f10076ef8f00ba040d220d90c7993f7c4245ae80227d857f65df105"
uri: "huggingface://city96/FLUX.1-dev-gguf/flux1-dev-Q2_K.gguf"
- filename: ae.safetensors
sha256: afc8e28272cd15db3919bacdb6918ce9c1ed22e96cb12c4d5ed0fba823529e38
uri: https://huggingface.co/ChuckMcSneed/FLUX.1-dev/resolve/main/ae.safetensors
- filename: clip_l.safetensors
sha256: 660c6f5b1abae9dc498ac2d21e1347d2abdb0cf6c0c0c8576cd796491d9a6cdd
uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/clip_l.safetensors
- filename: t5xxl_fp16.safetensors
sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635
uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors
- &whisper
## Whisper
url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master"
@@ -9104,13 +8709,16 @@
- filename: "ggml-model-whisper-tiny.en-q8_0.bin"
uri: "https://ggml.ggerganov.com/ggml-model-whisper-tiny.en-q8_0.bin"
sha256: 5bc2b3860aa151a4c6e7bb095e1fcce7cf12c7b020ca08dcec0c6d018bb7dd94
## Bert embeddings (llama3.2 drop-in)
- !!merge <<: *llama32
## Bert embeddings
- url: "github:mudler/LocalAI/gallery/bert-embeddings.yaml@master"
name: "bert-embeddings"
description: |
llama3.2 embeddings model. Using as drop-in replacement for bert-embeddings
license: "Apache 2.0"
urls:
- https://huggingface.co/skeskinen/ggml
tags:
- embeddings
description: |
Bert model that can be used for embeddings
## Stable Diffusion
- url: github:mudler/LocalAI/gallery/stablediffusion.yaml@master
license: "BSD-3"
@@ -9728,22 +9336,3 @@
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
- name: "bark-cpp-small"
url: github:mudler/LocalAI/gallery/virtual.yaml@master
license: mit
urls:
- https://huggingface.co/suno/bark
- https://huggingface.co/Green-Sky/bark-ggml
description: |
Bark is a transformer-based text-to-audio model created by Suno. Bark can generate highly realistic, multilingual speech as well as other audio - including music, background noise and simple sound effects. The model can also produce nonverbal communications like laughing, sighing and crying. To support the research community, we are providing access to pretrained model checkpoints ready for inference.
tags:
- tts
- cpu
overrides:
backend: bark-cpp
parameters:
model: bark-small_weights-f16.bin
files:
- filename: bark-small_weights-f16.bin
uri: https://huggingface.co/Green-Sky/bark-ggml/resolve/main/bark-small_weights-f16.bin
sha256: de1ece17e8319537b3a7909baebbd28affab23c942d5d57e648d622af4e2feaa

View File

@@ -1,24 +0,0 @@
---
name: "rwkv"
config_file: |
parameters:
top_k: 80
temperature: 0.9
max_tokens: 4098
top_p: 0.8
context_size: 4098
roles:
user: "User: "
system: "System: "
assistant: "Assistant: "
stopwords:
- 'Assistant:'
- '<s>'
template:
chat: "{{.Input}}\nAssistant: "
completion: |
{{.Input}}

8
gallery/silero-vad.yaml Normal file
View File

@@ -0,0 +1,8 @@
---
name: "silero-vad"
config_file: |
name: vad
backend: silero-vad
parameters:
model: silero_vad.onnx

View File

@@ -45,6 +45,7 @@ const (
LLamaCPPGRPC = "llama-cpp-grpc"
BertEmbeddingsBackend = "bert-embeddings"
WhisperBackend = "whisper"
StableDiffusionBackend = "stablediffusion"
TinyDreamBackend = "tinydream"
@@ -153,6 +154,8 @@ func orderBackends(backends map[string][]string) ([]string, error) {
toTheEnd := []string{
// last has to be huggingface
LCHuggingFaceBackend,
// then bert embeddings
BertEmbeddingsBackend,
}
// create an ordered map

View File

@@ -103,10 +103,7 @@ if __name__ == "__main__":
if readmeFile:
# If there is a README file, read it
readme = fs.read_text(readmeFile)
try:
summarized_readme = summarize(readme)
except Exception as e:
print(f"Error summarizing the README: {str(e)}", file=sys.stderr)
summarized_readme = summarize(readme)
summarized_readme = format_description(summarized_readme)
print("Model correctly processed")

View File

@@ -1,4 +1,5 @@
name: text-embedding-ada-002
embeddings: true
parameters:
model: huggingface://hugging-quants/Llama-3.2-1B-Instruct-Q4_K_M-GGUF/llama-3.2-1b-instruct-q4_k_m.gguf
model: bert
backend: bert-embeddings
embeddings: true

View File

@@ -14,7 +14,6 @@ roles:
stopwords:
- 'Assistant:'
- '<s>'
template:
chat: |