Compare commits

..

1 Commits

Author SHA1 Message Date
Ettore Di Giacinto
b16a01d0bd WIP speculative 2025-01-24 10:17:54 +01:00
88 changed files with 1697 additions and 1089 deletions

View File

@@ -14,7 +14,7 @@ jobs:
steps:
- name: Dependabot metadata
id: metadata
uses: dependabot/fetch-metadata@v2.3.0
uses: dependabot/fetch-metadata@v2.2.0
with:
github-token: "${{ secrets.GITHUB_TOKEN }}"
skip-commit-verification: true

View File

@@ -53,7 +53,7 @@ jobs:
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'ubuntu-latest'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'hipblas'

View File

@@ -18,7 +18,7 @@ jobs:
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
- uses: GrantBirki/git-diff-action@v2.7.0
id: git-diff-action
with:
json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready"; docker logs --tail 10 local-ai; sleep 2; done
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
- uses: GrantBirki/git-diff-action@v2.7.0
id: git-diff-action
with:
json_diff_file_output: diff.json

View File

@@ -6,7 +6,9 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -22,7 +24,7 @@ BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -149,6 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -185,6 +188,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -218,6 +222,19 @@ endif
all: help
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -293,17 +310,19 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -311,6 +330,7 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) build
@@ -414,7 +434,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -429,7 +449,8 @@ test: prepare test-models/testmodel.ggml grpcs
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
@@ -458,6 +479,10 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -735,6 +760,13 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -829,7 +861,7 @@ swagger:
.PHONY: gen-assets
gen-assets:
$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
## Documentation
docs/layouts/_default:

View File

@@ -163,11 +163,6 @@ message Reply {
double timing_token_generation = 5;
}
message GrammarTrigger {
string word = 1;
bool at_start = 2;
}
message ModelOptions {
string Model = 1;
int32 ContextSize = 2;
@@ -252,8 +247,6 @@ message ModelOptions {
string CacheTypeKey = 63;
string CacheTypeValue = 64;
repeated GrammarTrigger GrammarTriggers = 65;
}
message Result {

View File

@@ -22,6 +22,7 @@
#include "backend.grpc.pb.h"
#include "utils.hpp"
#include "sampling.h"
#include "speculative.h"
// include std::regex
#include <cstddef>
#include <thread>
@@ -185,12 +186,45 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
return out;
}
struct llama_slot_params {
uint32_t seed = -1; // RNG seed
bool stream = true;
bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
bool return_tokens = false;
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
int32_t n_predict = -1; // new tokens to predict
int32_t n_indent = 0; // mininum line indentation for the generated text in number of whitespace characters
int64_t t_max_prompt_ms = -1; // TODO: implement
int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
std::vector<common_adapter_lora_info> lora;
std::vector<std::string> antiprompt;
std::vector<std::string> response_fields;
bool timings_per_token = false;
bool post_sampling_probs = false;
bool ignore_eos = false;
json input_prefix;
json input_suffix;
struct common_params_sampling sampling;
struct common_params_speculative speculative;
};
struct llama_client_slot
{
int id;
int task_id = -1;
struct slot_params params;
struct llama_slot_params params;
common_speculative * spec = nullptr;
llama_batch batch_spec = {};
slot_state state = IDLE;
slot_command command = NONE;
@@ -283,6 +317,7 @@ struct llama_client_slot
images.clear();
}
bool has_budget(common_params &global_params) {
if (params.n_predict == -1 && global_params.n_predict == -1)
{
@@ -454,6 +489,10 @@ struct llama_server_context
{
llama_model *model = nullptr;
llama_context *ctx = nullptr;
common_init_result llama_init_dft;
llama_context * ctx_dft = nullptr;
llama_model * model_dft = nullptr;
llama_context_params cparams_dft;
const llama_vocab * vocab = nullptr;
clip_ctx *clp_ctx = nullptr;
@@ -468,9 +507,6 @@ struct llama_server_context
bool add_bos_token = true;
bool has_eos_token = true;
bool grammar_lazy = false;
std::vector<common_grammar_trigger> grammar_trigger_words;
int32_t n_ctx; // total context for all clients / slots
// system prompt
@@ -505,6 +541,7 @@ struct llama_server_context
}
}
bool load_model(const common_params &params_)
{
params = params_;
@@ -548,6 +585,45 @@ struct llama_server_context
add_bos_token = llama_vocab_get_add_bos(vocab);
has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
if (!params.speculative.model.empty()) {
LOG("loading draft model '%s'\n", params.speculative.model.c_str());
auto params_dft = params;
params_dft.devices = params.speculative.devices;
params_dft.model = params.speculative.model;
params_dft.n_ctx = params.speculative.n_ctx == 0 ? params.n_ctx / params.n_parallel : params.speculative.n_ctx;
params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
params_dft.n_parallel = 1;
llama_init_dft = common_init_from_params(params_dft);
model_dft = llama_init_dft.model.get();
if (model_dft == nullptr) {
LOG("failed to load draft model, '%s'\n", params.speculative.model.c_str());
return false;
}
if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
LOG("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
return false;
}
const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
cparams_dft = common_context_params_to_llama(params_dft);
cparams_dft.n_batch = n_ctx_dft;
// force F16 KV cache for the draft model for extra performance
cparams_dft.type_k = GGML_TYPE_F16;
cparams_dft.type_v = GGML_TYPE_F16;
// the context is not needed - we will create one for each slot
llama_init_dft.context.reset();
}
return true;
}
@@ -576,6 +652,22 @@ struct llama_server_context
slot.n_ctx = n_ctx_slot;
slot.n_predict = params.n_predict;
if (model_dft) {
slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1);
ctx_dft = llama_init_from_model(model_dft, cparams_dft);
if (ctx_dft == nullptr) {
LOG("%s", "failed to create draft context\n");
return;
}
slot.spec = common_speculative_init(ctx_dft);
if (slot.spec == nullptr) {
LOG("%s", "failed to create speculator\n");
return;
}
}
LOG_INFO("new slot", {
{"slot_id", slot.id},
{"n_ctx_slot", slot.n_ctx}
@@ -684,9 +776,11 @@ struct llama_server_context
}
bool launch_slot_with_data(llama_client_slot* &slot, json data) {
slot_params default_params;
llama_slot_params default_params;
common_params_sampling default_sparams;
default_sparams.speculative = params_base.speculative;
slot->params.stream = json_value(data, "stream", false);
slot->params.cache_prompt = json_value(data, "cache_prompt", false);
slot->params.n_predict = json_value(data, "n_predict", default_params.n_predict);
@@ -709,8 +803,15 @@ struct llama_server_context
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
slot->sparams.min_keep = json_value(data, "min_keep", default_sparams.min_keep);
slot->sparams.grammar_trigger_words = grammar_trigger_words;
slot->sparams.grammar_lazy = grammar_lazy;
slot->sparams.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
slot->sparams.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
slot->sparams.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
slot->sparams.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
slot->sparams.speculative.n_min = std::max(params.speculative.n_min, 2);
slot->sparams.speculative.n_max = std::max(params.speculative.n_max, 0);
if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
// Might be better to reject the request with a 400 ?
@@ -2029,6 +2130,97 @@ struct llama_server_context
}
}
// do speculative decoding
for (auto & slot : slots) {
if (!slot.is_processing() || !(ctx_dft && params.speculative.n_max > 0)) {
continue;
}
if (slot.state != PROCESSING) {
continue;
}
// determine the max draft that fits the current slot state
int n_draft_max = slot.params.speculative.n_max;
// note: n_past is not yet increased for the `id` token sampled above
// also, need to leave space for 1 extra token to allow context shifts
n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);
if (slot.n_remaining > 0) {
n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
}
LOG("max possible draft: %d\n", n_draft_max);
if (n_draft_max < slot.params.speculative.n_min) {
LOG("the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);
continue;
}
llama_token id = slot.sampled;
struct common_speculative_params params_spec;
params_spec.n_draft = n_draft_max;
params_spec.n_reuse = llama_n_ctx(ctx_dft) - slot.params.speculative.n_max;
params_spec.p_min = slot.params.speculative.p_min;
llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
// ignore small drafts
if (slot.params.speculative.n_min > (int) draft.size()) {
LOG("ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
continue;
}
// construct the speculation batch
common_batch_clear(slot.batch_spec);
common_batch_add (slot.batch_spec, id, slot.n_past, { slot.id }, true);
for (size_t i = 0; i < draft.size(); ++i) {
common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
}
LOG("decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
llama_decode(ctx, slot.batch_spec);
// the accepted tokens from the speculation
const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, draft);
slot.n_past += ids.size();
slot.n_decoded += ids.size();
slot.cache_tokens.push_back(id);
slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
for (size_t i = 0; i < ids.size(); ++i) {
completion_token_output result;
result.tok = ids[i];
result.text_to_send = common_token_to_piece(ctx, result.tok, params.special);
//result.prob = 1.0f; // set later
// TODO: set result.probs
if (!process_token(result, slot)) {
// release slot because of stop condition
slot.release();
slot.print_timings();
send_final_response(slot);
metrics.on_prediction(slot);
break;
}
}
LOG("accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
}
LOG_VERBOSE("slots updated", {});
return true;
}
@@ -2301,6 +2493,30 @@ static void params_parse(const backend::ModelOptions* request,
params.cpuparams.n_threads = request->threads();
params.n_gpu_layers = request->ngpulayers();
params.n_batch = request->nbatch();
params.speculative.model = request->draftmodel();
// If options is not NULL, parse options
for (int i = 0; request->options()[i] != NULL; i++) {
char *optname = strtok(request->options()[i], ":");
char *optval = strtok(NULL, ":");
if (optval == NULL) {
optval = "true";
}
if (!strcmp(optname, "speculative.n_gpu_layers")) {
params.speculative.n_gpu_layers = std::stoi(optval);
}
if (!strcmp(optname, "speculative.n_ctx")) {
params.speculative.n_ctx = std::stoi(optval);
}
}
if params.speculative.n_gpu_layers == 0 {
params.speculative.n_gpu_layers = params.n_gpu_layers;
}
if params.speculative.n_ctx == 0 {
params.speculative.n_ctx = params.n_ctx;
}
// Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
//params.n_parallel = 1;
const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -2379,21 +2595,6 @@ static void params_parse(const backend::ModelOptions* request,
if ( request->ropefreqscale() != 0.0f ) {
params.rope_freq_scale = request->ropefreqscale();
}
if (request->grammartriggers_size() > 0) {
LOG_INFO("configuring grammar triggers", {});
llama.grammar_lazy = true;
for (int i = 0; i < request->grammartriggers_size(); i++) {
common_grammar_trigger trigger;
trigger.word = request->grammartriggers(i).word();
trigger.at_start = request->grammartriggers(i).at_start();
llama.grammar_trigger_words.push_back(trigger);
LOG_INFO("grammar trigger", {
{ "word", trigger.word },
{ "at_start", trigger.at_start }
});
}
}
}
@@ -2542,18 +2743,6 @@ public:
return grpc::Status::OK;
}
grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
json data = parse_options(false, request, llama);
std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
for (int i=0 ; i< tokens.size(); i++){
response->add_tokens(tokens[i]);
}
return grpc::Status::OK;
}
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
llama_client_slot* active_slot = llama.get_active_slot();

View File

@@ -0,0 +1,204 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -0,0 +1,19 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -1,6 +1,6 @@
accelerate
auto-gptq==0.7.1
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi
transformers

View File

@@ -1,4 +1,4 @@
bark==0.1.5
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi

View File

@@ -1,3 +1,3 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
grpcio-tools

View File

@@ -1,4 +1,4 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi
packaging==24.1

View File

@@ -1,5 +1,5 @@
setuptools
grpcio==1.70.0
grpcio==1.69.0
pillow
protobuf
certifi

View File

@@ -1,4 +1,4 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi
wheel

View File

@@ -1,3 +1,3 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
grpcio-tools

View File

@@ -1,4 +1,4 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
phonemizer
scipy

View File

@@ -1,3 +1,3 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi

View File

@@ -5,4 +5,4 @@ accelerate
transformers
bitsandbytes
outetts
sentence-transformers==3.4.1
sentence-transformers==3.3.1

View File

@@ -6,4 +6,4 @@ accelerate
transformers
bitsandbytes
outetts
sentence-transformers==3.4.1
sentence-transformers==3.3.1

View File

@@ -5,4 +5,4 @@ numba==0.60.0
transformers
bitsandbytes
outetts
sentence-transformers==3.4.1
sentence-transformers==3.3.1

View File

@@ -7,4 +7,4 @@ numba==0.60.0
bitsandbytes
outetts
bitsandbytes
sentence-transformers==3.4.1
sentence-transformers==3.3.1

View File

@@ -8,4 +8,4 @@ numba==0.60.0
intel-extension-for-transformers
bitsandbytes
outetts
sentence-transformers==3.4.1
sentence-transformers==3.3.1

View File

@@ -1,4 +1,4 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi
setuptools

View File

@@ -1,4 +1,4 @@
grpcio==1.70.0
grpcio==1.69.0
protobuf
certifi
setuptools

View File

@@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
}
}
if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
log.Error().Err(err).Msg("error installing models")
}

View File

@@ -118,19 +118,9 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
nGPULayers = *c.NGPULayers
}
triggers := make([]*pb.GrammarTrigger, 0)
for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
triggers = append(triggers, &pb.GrammarTrigger{
Word: t.Word,
AtStart: t.AtStart,
})
}
return &pb.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
GrammarTriggers: triggers,
PipelineType: c.Diffusers.PipelineType,
CFGScale: c.CFGScale,
LoraAdapter: c.LoraAdapter,

View File

@@ -16,7 +16,12 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
inferenceModel, err = loader.Load(opts...)
if backendConfig.Backend == "" {
inferenceModel, err = loader.Load(opts...)
} else {
opts = append(opts, model.WithBackendString(backendConfig.Backend))
inferenceModel, err = loader.Load(opts...)
}
if err != nil {
return schema.TokenizeResponse{}, err
}
@@ -30,10 +35,6 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
return schema.TokenizeResponse{}, err
}
if resp.Tokens == nil {
resp.Tokens = make([]int32, 0)
}
return schema.TokenizeResponse{
Tokens: resp.Tokens,
}, nil

View File

@@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
}
err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
if err != nil {
return err
}

View File

@@ -32,6 +32,7 @@ type RunCMD struct {
Galleries string `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
AutoloadGalleries bool `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
RemoteLibrary string `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
PreloadModels string `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
Models []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
PreloadModelsConfig string `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@@ -89,6 +90,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
config.WithF16(r.F16),
config.WithStringGalleries(r.Galleries),
config.WithModelLibraryURL(r.RemoteLibrary),
config.WithCors(r.CORS),
config.WithCorsAllowOrigins(r.CORSAllowOrigins),
config.WithCsrf(r.CSRF),

View File

@@ -44,6 +44,8 @@ type ApplicationConfig struct {
DisableGalleryEndpoint bool
LoadToMemory []string
ModelLibraryURL string
Galleries []Gallery
BackendAssets embed.FS
@@ -124,6 +126,12 @@ func WithP2PToken(s string) AppOption {
}
}
func WithModelLibraryURL(url string) AppOption {
return func(o *ApplicationConfig) {
o.ModelLibraryURL = url
}
}
func WithLibPath(path string) AppOption {
return func(o *ApplicationConfig) {
o.LibPath = path

View File

@@ -287,8 +287,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
// https://github.com/mudler/LocalAI/issues/2780
defaultMirostat := 0
defaultMirostat := 2
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0

View File

@@ -48,9 +48,9 @@ parameters:
Expect(config.Name).To(Equal("bar-baz"))
Expect(config.Validate()).To(BeTrue())
// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
// download https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml
httpClient := http.Client{}
resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml")
Expect(err).To(BeNil())
defer resp.Body.Close()
tmp, err = os.CreateTemp("", "config.yaml")

View File

@@ -48,10 +48,8 @@ var _ = Describe("Model test", func() {
defer os.RemoveAll(tempdir)
gallery := []GalleryModel{{
Metadata: Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
Name: "bert",
URL: bertEmbeddingsURL,
}}
out, err := yaml.Marshal(gallery)
Expect(err).ToNot(HaveOccurred())

View File

@@ -11,14 +11,6 @@ import (
// It is used to install the model by resolving the URL and downloading the files.
// The other fields are used to override the configuration of the model.
type GalleryModel struct {
Metadata `json:",inline" yaml:",inline"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
}
type Metadata struct {
URL string `json:"url,omitempty" yaml:"url,omitempty"`
Name string `json:"name,omitempty" yaml:"name,omitempty"`
Description string `json:"description,omitempty" yaml:"description,omitempty"`
@@ -26,6 +18,10 @@ type Metadata struct {
URLs []string `json:"urls,omitempty" yaml:"urls,omitempty"`
Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
Tags []string `json:"tags,omitempty" yaml:"tags,omitempty"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
// AdditionalFiles are used to add additional files to the model
AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
// Gallery is a reference to the gallery which contains the model

View File

@@ -9,11 +9,7 @@ import (
var _ = Describe("Gallery API tests", func() {
Context("requests", func() {
It("parses github with a branch", func() {
req := GalleryModel{
Metadata: Metadata{
URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
},
}
req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
e, err := GetGalleryConfigFromURL(req.URL, "")
Expect(err).ToNot(HaveOccurred())
Expect(e.Name).To(Equal("gpt4all-j"))

View File

@@ -299,18 +299,14 @@ var _ = Describe("API test", func() {
g := []gallery.GalleryModel{
{
Metadata: gallery.Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
Name: "bert",
URL: bertEmbeddingsURL,
},
{
Metadata: gallery.Metadata{
Name: "bert2",
URL: bertEmbeddingsURL,
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
},
Overrides: map[string]interface{}{"foo": "bar"},
Name: "bert2",
URL: bertEmbeddingsURL,
Overrides: map[string]interface{}{"foo": "bar"},
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
},
}
out, err := yaml.Marshal(g)
@@ -480,7 +476,7 @@ var _ = Describe("API test", func() {
})
It("apply models from config", func() {
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -526,6 +522,77 @@ var _ = Describe("API test", func() {
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
})
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
uuid := response["uuid"].(string)
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
By("testing completion")
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
By("testing functions")
resp2, err := client.CreateChatCompletion(
context.TODO(),
openai.ChatCompletionRequest{
Model: "openllama_3b",
Messages: []openai.ChatCompletionMessage{
{
Role: "user",
Content: "What is the weather like in San Francisco (celsius)?",
},
},
Functions: []openai.FunctionDefinition{
openai.FunctionDefinition{
Name: "get_current_weather",
Description: "Get the current weather",
Parameters: jsonschema.Definition{
Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{
"location": {
Type: jsonschema.String,
Description: "The city and state, e.g. San Francisco, CA",
},
"unit": {
Type: jsonschema.String,
Enum: []string{"celcius", "fahrenheit"},
},
},
Required: []string{"location"},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
@@ -533,7 +600,7 @@ var _ = Describe("API test", func() {
modelName := "hermes-2-pro-mistral"
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))

View File

@@ -117,25 +117,19 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
// @Router /models/available [get]
func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
if err != nil {
return err
}
log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))
m := []gallery.Metadata{}
for _, mm := range models {
m = append(m, mm.Metadata)
log.Debug().Msgf("Models found from galleries: %+v", models)
for _, m := range models {
log.Debug().Msgf("Model found from galleries: %+v", m)
}
log.Debug().Msgf("Models %#v", m)
dat, err := json.Marshal(m)
dat, err := json.Marshal(models)
if err != nil {
return fmt.Errorf("could not marshal models: %w", err)
return err
}
return c.Send(dat)
}

View File

@@ -12,7 +12,6 @@ import (
// TokenizeEndpoint exposes a REST API to tokenize the content
// @Summary Tokenize the input.
// @Param request body schema.TokenizeRequest true "Request"
// @Success 200 {object} schema.TokenizeResponse "Response"
// @Router /v1/tokenize [post]
func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
@@ -52,6 +51,8 @@ func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, app
return err
}
return c.JSON(tokenResponse)
c.JSON(tokenResponse)
return nil
}
}

View File

@@ -401,11 +401,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
log.Debug().Msgf("Text content to return: %s", textContentToReturn)
noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
finishReason := "stop"
if len(input.Tools) > 0 {
finishReason = "tool_calls"
}
switch {
case noActionsToRun:
result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
@@ -413,18 +408,19 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
log.Error().Err(err).Msg("error handling question")
return
}
*c = append(*c, schema.Choice{
FinishReason: finishReason,
Message: &schema.Message{Role: "assistant", Content: &result}})
Message: &schema.Message{Role: "assistant", Content: &result}})
default:
toolChoice := schema.Choice{
FinishReason: finishReason,
Message: &schema.Message{
Role: "assistant",
},
}
if len(input.Tools) > 0 {
toolChoice.FinishReason = "tool_calls"
}
for _, ss := range results {
name, args := ss.Name, ss.Arguments
if len(input.Tools) > 0 {
@@ -442,7 +438,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
},
)
} else {
// otherwise we return more choices directly (deprecated)
// otherwise we return more choices directly
*c = append(*c, schema.Choice{
FinishReason: "function_call",
Message: &schema.Message{

View File

@@ -129,7 +129,7 @@ func (g *GalleryService) Start(c context.Context, cl *config.BackendConfigLoader
if op.GalleryModelName != "" {
err = gallery.InstallModelFromGallery(op.Galleries, op.GalleryModelName, g.appConfig.ModelPath, op.Req, progressCallback, g.appConfig.EnforcePredownloadScans)
} else if op.ConfigURL != "" {
err = startup.InstallModels(op.Galleries, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
err = startup.InstallModels(op.Galleries, op.ConfigURL, g.appConfig.ModelPath, g.appConfig.EnforcePredownloadScans, progressCallback, op.ConfigURL)
if err != nil {
updateError(err)
continue

View File

@@ -148,9 +148,6 @@ function:
no_action_function_name: "" # Function name to call when no action is determined.
no_action_description_name: "" # Description name for no-action functions.
response_regex: [] # Regular expressions to match response from
argument_regex: [] # Named regular to extract function arguments from the response.
argument_regex_key_name: "key" # Name of the named regex capture to capture the key of the function arguments
argument_regex_value_name: "value" # Name of the named regex capture to capture the value of the function arguments
json_regex_match: [] # Regular expressions to match JSON data when in tool mode
replace_function_results: [] # Placeholder to replace function call results with arbitrary strings or patterns.
replace_llm_results: [] # Replace language model results with arbitrary strings or patterns.

View File

@@ -0,0 +1,126 @@
+++
disableToc = false
title = "Run other Models"
weight = 23
icon = "rocket_launch"
+++
## Running other models
> _Do you have already a model file? Skip to [Run models manually]({{%relref "docs/getting-started/models" %}})_.
To load models into LocalAI, you can either [use models manually]({{%relref "docs/getting-started/models" %}}) or configure LocalAI to pull the models from external sources, like Huggingface and configure the model.
To do that, you can point LocalAI to an URL to a YAML configuration file - however - LocalAI does also have some popular model configuration embedded in the binary as well. Below you can find a list of the models configuration that LocalAI has pre-built, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}) on how to configure models from URLs.
There are different categories of models: [LLMs]({{%relref "docs/features/text-generation" %}}), [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) , [Embeddings]({{%relref "docs/features/embeddings" %}}), [Audio to Text]({{%relref "docs/features/audio-to-text" %}}), and [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) depending on the backend being used and the model architecture.
{{% alert icon="💡" %}}
To customize the models, see [Model customization]({{%relref "docs/getting-started/customize-model" %}}). For more model configurations, visit the [Examples Section](https://github.com/mudler/LocalAI-examples/tree/main/configurations) and the configurations for the models below is available [here](https://github.com/mudler/LocalAI/tree/master/embedded/models).
{{% /alert %}}
{{< tabs tabTotal="3" >}}
{{% tab tabName="CPU-only" %}}
> 💡Don't need GPU acceleration? use the CPU images which are lighter and do not have Nvidia dependencies
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | GPU-only |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) (with transformers) | [LLM]({{%relref "docs/features/text-generation" %}}) | GPU-only |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) (with llama.cpp) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 11)" %}}
> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda11-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda11 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda11-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{% tab tabName="GPU (CUDA 12)" %}}
> To know which version of CUDA do you have available, you can check with `nvidia-smi` or `nvcc --version` see also [GPU acceleration]({{%relref "docs/features/gpu-acceleration" %}}).
| Model | Category | Docker command |
| --- | --- | --- |
| [phi-2](https://huggingface.co/microsoft/phi-2) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core phi-2``` |
| 🌋 [bakllava](https://github.com/SkunkworksAI/BakLLaVA) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bakllava``` |
| 🌋 [llava-1.5](https://llava-vl.github.io/) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.5``` |
| 🌋 [llava-1.6-mistral](https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-mistral``` |
| 🌋 [llava-1.6-vicuna](https://huggingface.co/cmp-nct/llava-1.6-gguf) | [Multimodal LLM]({{%relref "docs/features/gpt-vision" %}}) | ```docker run -ti -p 8080:8080 localai/localai:{{< version >}}-cublas-cuda12-core llava-1.6-vicuna``` |
| [mistral-openorca](https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mistral-openorca``` |
| [bert-cpp](https://github.com/skeskinen/bert.cpp) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core bert-cpp``` |
| [all-minilm-l6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) | [Embeddings]({{%relref "docs/features/embeddings" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 all-minilm-l6-v2``` |
| whisper-base | [Audio to Text]({{%relref "docs/features/audio-to-text" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core whisper-base``` |
| rhasspy-voice-en-us-amy | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core rhasspy-voice-en-us-amy``` |
| 🐸 [coqui](https://github.com/coqui-ai/TTS) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 coqui``` |
| 🐶 [bark](https://github.com/suno-ai/bark) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 bark``` |
| 🔊 [vall-e-x](https://github.com/Plachtaa/VALL-E-X) | [Text to Audio]({{%relref "docs/features/text-to-audio" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 vall-e-x``` |
| mixtral-instruct Mixtral-8x7B-Instruct-v0.1 | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core mixtral-instruct``` |
| [tinyllama-chat](https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF) [original model](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v0.3) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core tinyllama-chat``` |
| [dolphin-2.5-mixtral-8x7b](https://huggingface.co/TheBloke/dolphin-2.5-mixtral-8x7b-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core dolphin-2.5-mixtral-8x7b``` |
| 🐍 [mamba](https://github.com/state-spaces/mamba) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 mamba-chat``` |
| animagine-xl | [Text to Image]({{%relref "docs/features/image-generation" %}}) | ```docker run -ti -p 8080:8080 -e COMPEL=0 --gpus all localai/localai:{{< version >}}-cublas-cuda12 animagine-xl``` |
| transformers-tinyllama | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 transformers-tinyllama``` |
| [codellama-7b](https://huggingface.co/codellama/CodeLlama-7b-hf) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12 codellama-7b``` |
| [codellama-7b-gguf](https://huggingface.co/TheBloke/CodeLlama-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core codellama-7b-gguf``` |
| [hermes-2-pro-mistral](https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF) | [LLM]({{%relref "docs/features/text-generation" %}}) | ```docker run -ti -p 8080:8080 --gpus all localai/localai:{{< version >}}-cublas-cuda12-core hermes-2-pro-mistral``` |
{{% /tab %}}
{{< /tabs >}}
{{% alert icon="💡" %}}
**Tip** You can actually specify multiple models to start an instance with the models loaded, for example to have both llava and phi-2 configured:
```bash
docker run -ti -p 8080:8080 localai/localai:{{< version >}}-ffmpeg-core llava phi-2
```
{{% /alert %}}

View File

@@ -134,12 +134,12 @@ curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
}'
```
An example that installs hermes-2-pro-mistral can be:
An example that installs openllama can be:
```bash
LOCALAI=http://localhost:8080
curl $LOCALAI/models/apply -H "Content-Type: application/json" -d '{
"config_url": "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml"
"config_url": "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml"
}'
```

View File

@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
{{% alert note %}}
The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use a LocalAI version older than v2.25.0. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
{{% /alert %}}
@@ -175,12 +175,25 @@ name: llama
backend: llama
parameters:
# Relative to the models path
model: file.gguf
model: file.gguf.bin
```
In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
For instance, to use the `llama-ggml` backend for `ggml` models:
```yaml
name: llama
backend: llama-ggml
parameters:
# Relative to the models path
model: file.ggml.bin
```
#### Reference
- [llama](https://github.com/ggerganov/llama.cpp)
- [binding](https://github.com/go-skynet/go-llama.cpp)
### exllama/2

View File

@@ -143,7 +143,7 @@ The AIO Images are inheriting the same environment variables as the base images
| Variable | Default | Description |
| ---------------------| ------- | ----------- |
| `PROFILE` | Auto-detected | The size of the model to use. Available: `cpu`, `gpu-8g` |
| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/getting-started/models" %}})) |
| `MODELS` | Auto-detected | A list of models YAML Configuration file URI/URL (see also [running models]({{%relref "docs/advanced/run-other-models" %}})) |
## Standard container images
@@ -154,7 +154,7 @@ Images are available with and without python dependencies. Note that images with
Images with `core` in the tag are smaller and do not contain any python dependencies.
{{< tabs tabTotal="8" >}}
{{< tabs tabTotal="7" >}}
{{% tab tabName="Vanilla / CPU Images" %}}
| Description | Quay | Docker Hub |
@@ -236,18 +236,6 @@ Images with `core` in the tag are smaller and do not contain any python dependen
| Versioned image including FFMpeg, no python | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan-fmpeg-core` | `localai/localai:{{< version >}}-vulkan-fmpeg-core` |
{{% /tab %}}
{{% tab tabName="Nvidia Linux for tegra" %}}
These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. For more information, see the [Nvidia L4T guide]({{%relref "docs/reference/nvidia-l4t" %}}).
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core` | `localai/localai:master-nvidia-l4t-arm64-core` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-core` | `localai/localai:latest-nvidia-l4t-arm64-core` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64-core` | `localai/localai:{{< version >}}-nvidia-l4t-arm64-core` |
{{% /tab %}}
{{< /tabs >}}
## See Also

View File

@@ -40,10 +40,6 @@ icon = "info"
</a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@@ -122,24 +118,7 @@ To help the project you can:
## 🌟 Star history
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
## ❤️ Sponsors
> Do you find LocalAI useful?
Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img width=200 src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
</a>
</p>
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)
## 📖 License

View File

@@ -21,13 +21,7 @@ git clone https://github.com/mudler/LocalAI
cd LocalAI
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core .
```
Otherwise images are available on quay.io and dockerhub:
```bash
docker pull quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t localai-orin .
```
## Usage
@@ -35,7 +29,7 @@ docker pull quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/build/models -ti --restart=always --name local-ai --runtime nvidia --gpus all localai-orin
```
Note: `/data/models` is the directory containing the models. You can replace it with the directory containing your models.

72
embedded/embedded.go Normal file
View File

@@ -0,0 +1,72 @@
package embedded
import (
"embed"
"fmt"
"slices"
"strings"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/rs/zerolog/log"
"github.com/mudler/LocalAI/pkg/assets"
"gopkg.in/yaml.v3"
)
var modelShorteners map[string]string
//go:embed model_library.yaml
var modelLibrary []byte
//go:embed models/*
var embeddedModels embed.FS
func ModelShortURL(s string) string {
if _, ok := modelShorteners[s]; ok {
s = modelShorteners[s]
}
return s
}
func init() {
err := yaml.Unmarshal(modelLibrary, &modelShorteners)
if err != nil {
log.Error().Err(err).Msg("error while unmarshalling embedded modelLibrary")
}
}
func GetRemoteLibraryShorteners(url string, basePath string) (map[string]string, error) {
remoteLibrary := map[string]string{}
uri := downloader.URI(url)
err := uri.DownloadWithCallback(basePath, func(_ string, i []byte) error {
return yaml.Unmarshal(i, &remoteLibrary)
})
if err != nil {
return nil, fmt.Errorf("error downloading remote library: %s", err.Error())
}
return remoteLibrary, err
}
// ExistsInModelsLibrary checks if a model exists in the embedded models library
func ExistsInModelsLibrary(s string) bool {
f := fmt.Sprintf("%s.yaml", s)
a := []string{}
for _, j := range assets.ListFiles(embeddedModels) {
a = append(a, strings.TrimPrefix(j, "models/"))
}
return slices.Contains(a, f)
}
// ResolveContent returns the content in the embedded model library
func ResolveContent(s string) ([]byte, error) {
if ExistsInModelsLibrary(s) {
return embeddedModels.ReadFile(fmt.Sprintf("models/%s.yaml", s))
}
return nil, fmt.Errorf("cannot find model %s", s)
}

View File

@@ -0,0 +1,9 @@
###
###
### This file contains the list of models that are available in the library
### The URLs are automatically expanded when local-ai is being called with the key as argument
###
### For models with an entire YAML file to be embededd, put the file inside the `models`
### directory, it will be automatically available with the file name as key (without the .yaml extension)
phi-2: "github://mudler/LocalAI-examples/configurations/phi-2.yaml@main"

View File

@@ -0,0 +1,13 @@
name: all-minilm-l6-v2
backend: sentencetransformers
embeddings: true
parameters:
model: all-MiniLM-L6-v2
usage: |
You can test this model with curl like this:
curl http://localhost:8080/embeddings -X POST -H "Content-Type: application/json" -d '{
"input": "Your text string goes here",
"model": "all-minilm-l6-v2"
}'

View File

@@ -0,0 +1,17 @@
name: animagine-xl
parameters:
model: Linaqruf/animagine-xl
backend: diffusers
f16: true
diffusers:
scheduler_type: euler_a
usage: |
curl http://localhost:8080/v1/images/generations \
-H "Content-Type: application/json" \
-d '{
"prompt": "<positive prompt>|<negative prompt>",
"model": "animagine-xl",
"step": 51,
"size": "1024x1024"
}'

View File

@@ -0,0 +1,40 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: bakllava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bakllava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -0,0 +1,8 @@
usage: |
bark works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "bark",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@@ -0,0 +1,24 @@
backend: llama
context_size: 8192
f16: false
gpu_layers: 90
name: cerbero
mmap: false
parameters:
model: huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q8_0.gguf
top_k: 80
temperature: 0.2
top_p: 0.7
template:
completion: "{{.Input}}"
chat: "Questa è una conversazione tra un umano ed un assistente AI.\n{{.Input}}\n[|Assistente|] "
roles:
user: "[|Umano|] "
system: "[|Umano|] "
assistant: "[|Assistente|] "
stopwords:
- "[|Umano|]"
trimsuffix:
- "\n"

View File

@@ -0,0 +1,20 @@
name: codellama-7b-gguf
backend: transformers
parameters:
model: huggingface://TheBloke/CodeLlama-7B-GGUF/codellama-7b.Q4_K_M.gguf
temperature: 0.5
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b-gguf",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@@ -0,0 +1,14 @@
name: codellama-7b
backend: transformers
type: AutoModelForCausalLM
parameters:
model: codellama/CodeLlama-7b-hf
temperature: 0.2
top_k: 40
top_p: 0.95
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "codellama-7b",
"prompt": "import socket\n\ndef ping_exponential_backoff(host: str):"
}'

View File

@@ -0,0 +1,9 @@
usage: |
coqui works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "coqui",
"model": "tts_models/en/ljspeech/glow-tts",
"input":"Hello, this is a test!"
}'
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@@ -0,0 +1,31 @@
name: dolphin-mixtral-8x7b
mmap: true
parameters:
model: huggingface://TheBloke/dolphin-2.5-mixtral-8x7b-GGUF/dolphin-2.5-mixtral-8x7b.Q2_K.gguf
temperature: 0.5
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "dolphin-mixtral-8x7b",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,59 @@
name: hermes-2-pro-mistral
mmap: true
parameters:
model: huggingface://NousResearch/Hermes-2-Pro-Mistral-7B-GGUF/Hermes-2-Pro-Mistral-7B.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}
{{- if .FunctionCall }}
<tool_call>
{{- else if eq .RoleName "tool" }}
<tool_response>
{{- end }}
{{- if .Content}}
{{.Content }}
{{- end }}
{{- if .FunctionCall}}
{{toJson .FunctionCall}}
{{- end }}
{{- if .FunctionCall }}
</tool_call>
{{- else if eq .RoleName "tool" }}
</tool_response>
{{- end }}<|im_end|>
# https://huggingface.co/NousResearch/Hermes-2-Pro-Mistral-7B-GGUF#prompt-format-for-function-calling
function: |
<|im_start|>system
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}
For each function call return a json object with function name and arguments within <tool_call></tool_call> XML tags as follows:
<tool_call>
{'arguments': <args-dict>, 'name': <function-name>}
</tool_call><|im_end|>
{{.Input -}}
<|im_start|>assistant
<tool_call>
chat: |
{{.Input -}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "\n</tool_call>"
- "\n\n\n"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "hermes-2-pro-mistral",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,48 @@
name: llama3-8b-instruct
mmap: true
parameters:
model: huggingface://second-state/Llama-3-8B-Instruct-GGUF/Meta-Llama-3-8B-Instruct-Q5_K_M.gguf
template:
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
Function call:
{{ else if eq .RoleName "tool" -}}
Function response:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
function: |
<|start_header_id|>system<|end_header_id|>
You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. Here are the available tools:
<tools>
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
</tools>
Use the following pydantic model json schema for each tool call you will make:
{'title': 'FunctionCall', 'type': 'object', 'properties': {'arguments': {'title': 'Arguments', 'type': 'object'}, 'name': {'title': 'Name', 'type': 'string'}}, 'required': ['arguments', 'name']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>
Function call:
chat: |
<|begin_of_text|>{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
completion: |
{{.Input}}
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llama3-8b-instruct",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,33 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.5
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.5-7b-mmproj-Q8_0.gguf
parameters:
model: llava-v1.5-7b-Q4_K.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.5-7b-Q4_K.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-Q4_K.gguf
- filename: llava-v1.5-7b-mmproj-Q8_0.gguf
uri: huggingface://jartine/llava-v1.5-7B-GGUF/llava-v1.5-7b-mmproj-Q8_0.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.5",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -0,0 +1,33 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-mistral
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: llava-v1.6-7b-mmproj-f16.gguf
parameters:
model: llava-v1.6-mistral-7b.gguf
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: llava-v1.6-mistral-7b.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/llava-v1.6-mistral-7b.Q6_K.gguf
- filename: llava-v1.6-7b-mmproj-f16.gguf
uri: huggingface://cjpais/llava-1.6-mistral-7b-gguf/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-mistral",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -0,0 +1,37 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava-1.6-vicuna
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: mmproj-vicuna7b-f16.gguf
parameters:
model: vicuna-7b-q5_k.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: vicuna-7b-q5_k.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/vicuna-7b-q5_k.gguf
- filename: mmproj-vicuna7b-f16.gguf
uri: https://huggingface.co/cmp-nct/llava-1.6-gguf/resolve/main/mmproj-vicuna7b-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava-1.6-vicuna",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -0,0 +1,40 @@
backend: llama-cpp
context_size: 4096
f16: true
gpu_layers: 90
mmap: true
name: llava
roles:
user: "USER:"
assistant: "ASSISTANT:"
system: "SYSTEM:"
mmproj: bakllava-mmproj.gguf
parameters:
model: bakllava.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: |
A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.
{{.Input}}
ASSISTANT:
download_files:
- filename: bakllava.gguf
uri: huggingface://mys/ggml_bakllava-1/ggml-model-q4_k.gguf
- filename: bakllava-mmproj.gguf
uri: huggingface://mys/ggml_bakllava-1/mmproj-model-f16.gguf
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "llava",
"messages": [{"role": "user", "content": [{"type":"text", "text": "What is in the image?"}, {"type": "image_url", "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" }}], "temperature": 0.9}]}'

View File

@@ -0,0 +1,21 @@
name: bagel
backend: mamba
parameters:
model: "jondurbin/bagel-dpo-2.8b-v0.2"
systemPrompt: "You are a helpful, unbiased, uncensored assistant."
template:
chat_message: |
{{if eq .RoleName "assistant"}}{{.Content}}{{else}}
[INST]
{{if .SystemPrompt}}{{.SystemPrompt}}{{else if eq .RoleName "system"}}<<SYS>>{{.Content}}<</SYS>>
{{else if .Content}}{{.Content}}{{end}}
[/INST]
{{end}}
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "bagel",
"messages": [{"role": "user", "content": "how are you doing"}],
}'

View File

@@ -0,0 +1,28 @@
name: mamba-chat
backend: mamba
parameters:
model: "havenhq/mamba-chat"
trimsuffix:
- <|endoftext|>
# https://huggingface.co/HuggingFaceH4/zephyr-7b-beta/blob/main/tokenizer_config.json
# "chat_template": "{% for message in messages %}\n{% if message['role'] == 'user' %}\n{{ '<|user|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'system' %}\n{{ '<|system|>\n' + message['content'] + eos_token }}\n{% elif message['role'] == 'assistant' %}\n{{ '<|assistant|>\n' + message['content'] + eos_token }}\n{% endif %}\n{% if loop.last and add_generation_prompt %}\n{{ '<|assistant|>' }}\n{% endif %}\n{% endfor %}",
template:
chat_message: |
{{if eq .RoleName "assistant"}}<|assistant|>{{else if eq .RoleName "system"}}<|system|>{{else if eq .RoleName "user"}}<|user|>{{end}}
{{if .Content}}{{.Content}}{{end}}
</s>
chat: |
{{.Input}}
<|assistant|>
completion: |
{{.Input}}
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mamba-chat",
"messages": [{"role": "user", "content": "how are you doing"}],
"temperature": 0.7
}'

View File

@@ -0,0 +1,32 @@
name: mistral-openorca
mmap: true
parameters:
model: huggingface://TheBloke/Mistral-7B-OpenOrca-GGUF/mistral-7b-openorca.Q6_K.gguf
temperature: 0.2
top_k: 40
top_p: 0.95
seed: -1
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "mistral-openorca",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,25 @@
name: mixtral-instruct
mmap: true
parameters:
model: huggingface://TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF/mixtral-8x7b-instruct-v0.1.Q2_K.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
mirostat: 2
mirostat_eta: 1.0
mirostat_tau: 1.0
template:
chat: &chat |
[INST] {{.Input}} [/INST]
completion: *chat
context_size: 4096
f16: true
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/completions -H "Content-Type: application/json" -d '{
"model": "mixtral-instruct",
"prompt": "How are you doing?"
}'

View File

@@ -0,0 +1,25 @@
name: phi-2-chat
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-layla-v1-chatml-gguf/phi-2-layla-v1-chatml-Q8_0.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,30 @@
name: phi-2-orange
mmap: true
parameters:
model: huggingface://l3utterfly/phi-2-orange-GGUF/phi-2-orange.Q6_K.gguf
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}
<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
description: |
This model is a chatbot that can be used for general conversation.
[Model card](https://huggingface.co/TheBloke/phi-2-orange-GGUF)
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "phi-2-orange",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,13 @@
name: voice-en-us-amy-low
download_files:
- filename: voice-en-us-amy-low.tar.gz
uri: https://github.com/rhasspy/piper/releases/download/v0.0.2/voice-en-us-amy-low.tar.gz
usage: |
To test if this model works as expected, you can use the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model":"en-us-amy-low.onnx",
"input": "Hi, this is a test."
}'

View File

@@ -0,0 +1,29 @@
name: tinyllama-chat
mmap: true
parameters:
model: huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q8_0.gguf
temperature: 0.2
top_k: 40
seed: -1
top_p: 0.95
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
context_size: 4096
f16: true
stopwords:
- <|im_end|>
gpu_layers: 90
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "How are you doing?", "temperature": 0.1}]
}'

View File

@@ -0,0 +1,31 @@
name: tinyllama-chat
backend: transformers
type: AutoModelForCausalLM
parameters:
model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
temperature: 0.2
top_k: 40
top_p: 0.95
max_tokens: 4096
template:
chat_message: |
<|im_start|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "user"}}user{{end}}
{{if .Content}}{{.Content}}{{end}}<|im_end|>
chat: |
{{.Input}}
<|im_start|>assistant
completion: |
{{.Input}}
stopwords:
- <|im_end|>
usage: |
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "tinyllama-chat",
"messages": [{"role": "user", "content": "Say this is a test!"}],
"temperature": 0.7
}'

View File

@@ -0,0 +1,8 @@
usage: |
Vall-e-x works without any configuration, to test it, you can run the following curl command:
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"backend": "vall-e-x",
"input":"Hello, this is a test!"
}' | aplay
# TODO: This is a placeholder until we manage to pre-load HF/Transformers models

View File

@@ -0,0 +1,18 @@
name: whisper
backend: whisper
parameters:
model: ggml-whisper-base.bin
usage: |
## example audio file
wget --quiet --show-progress -O gb1.ogg https://upload.wikimedia.org/wikipedia/commons/1/1f/George_W_Bush_Columbia_FINAL.ogg
## Send the example audio file to the transcriptions endpoint
curl http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@$PWD/gb1.ogg" -F model="whisper"
download_files:
- filename: "ggml-whisper-base.bin"
sha256: "60ed5bc3dd14eea856493d334349b405782ddcaf0028d4b5df4088345fba2efe"
uri: "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"

View File

@@ -1,23 +0,0 @@
---
name: "deepseek-r1"
config_file: |
context_size: 131072
mmap: true
f16: true
stopwords:
- <begin▁of▁sentence>
- <end▁of▁sentence>
- <User>
- <Assistant>
template:
chat_message: |
{{if eq .RoleName "system" -}}{{.Content }}
{{ end -}}
{{if eq .RoleName "user" -}}<User>{{.Content}}
{{end -}}
{{if eq .RoleName "assistant" -}}<Assistant>{{.Content}}<end▁of▁sentence>{{end}}
completion: |
{{.Input}}
chat: |
{{.Input -}}<Assistant>

View File

@@ -198,35 +198,6 @@
- filename: NightWing3-10B-v0.1-Q4_K_M.gguf
sha256: 2e87671542d22fe1ef9a68e43f2fdab7c2759479ad531946d9f0bdeffa6f5747
uri: huggingface://bartowski/NightWing3-10B-v0.1-GGUF/NightWing3-10B-v0.1-Q4_K_M.gguf
- !!merge <<: *falcon3
name: "virtuoso-lite"
urls:
- https://huggingface.co/arcee-ai/Virtuoso-Lite
- https://huggingface.co/bartowski/Virtuoso-Lite-GGUF
description: |
Virtuoso-Lite (10B) is our next-generation, 10-billion-parameter language model based on the Llama-3 architecture. It is distilled from Deepseek-v3 using ~1.1B tokens/logits, allowing it to achieve robust performance at a significantly reduced parameter count compared to larger models. Despite its compact size, Virtuoso-Lite excels in a variety of tasks, demonstrating advanced reasoning, code generation, and mathematical problem-solving capabilities.
overrides:
parameters:
model: Virtuoso-Lite-Q4_K_M.gguf
files:
- filename: Virtuoso-Lite-Q4_K_M.gguf
sha256: 1d21bef8467a11a1e473d397128b05fb87b7e824606cdaea061e550cb219fee2
uri: huggingface://bartowski/Virtuoso-Lite-GGUF/Virtuoso-Lite-Q4_K_M.gguf
- !!merge <<: *falcon3
name: "suayptalha_maestro-10b"
icon: https://huggingface.co/suayptalha/Maestro-10B/resolve/main/Maestro-Logo.png
urls:
- https://huggingface.co/suayptalha/Maestro-10B
- https://huggingface.co/bartowski/suayptalha_Maestro-10B-GGUF
description: |
Maestro-10B is a 10 billion parameter model fine-tuned from Virtuoso-Lite, a next-generation language model developed by arcee-ai. Virtuoso-Lite itself is based on the Llama-3 architecture, distilled from Deepseek-v3 using approximately 1.1 billion tokens/logits. This distillation process allows Virtuoso-Lite to achieve robust performance with a smaller parameter count, excelling in reasoning, code generation, and mathematical problem-solving. Maestro-10B inherits these strengths from its base model, Virtuoso-Lite, and further enhances them through fine-tuning on the OpenOrca dataset. This combination of a distilled base model and targeted fine-tuning makes Maestro-10B a powerful and efficient language model.
overrides:
parameters:
model: suayptalha_Maestro-10B-Q4_K_M.gguf
files:
- filename: suayptalha_Maestro-10B-Q4_K_M.gguf
sha256: c570381da5624782ce6df4186ace6f747429fcbaf1a22c2a348288d3552eb19c
uri: huggingface://bartowski/suayptalha_Maestro-10B-GGUF/suayptalha_Maestro-10B-Q4_K_M.gguf
- &intellect1
name: "intellect-1-instruct"
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
@@ -485,97 +456,6 @@
- filename: L3.3-Prikol-70B-v0.2-Q4_K_M.gguf
sha256: fc0ff514efbc0b67981c2bf1423d5a2e1b8801e4266ba0c653ea148414fe5ffc
uri: huggingface://bartowski/L3.3-Prikol-70B-v0.2-GGUF/L3.3-Prikol-70B-v0.2-Q4_K_M.gguf
- !!merge <<: *llama33
name: "l3.3-nevoria-r1-70b"
icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/_oWpsvCZ-graNKzJBBjGo.jpeg
urls:
- https://huggingface.co/Steelskull/L3.3-Nevoria-R1-70b
- https://huggingface.co/bartowski/L3.3-Nevoria-R1-70b-GGUF
description: |
This model builds upon the original Nevoria foundation, incorporating the Deepseek-R1 reasoning architecture to enhance dialogue interaction and scene comprehension. While maintaining Nevoria's core strengths in storytelling and scene description (derived from EVA, EURYALE, and Anubis), this iteration aims to improve prompt adherence and creative reasoning capabilities. The model also retains the balanced perspective introduced by Negative_LLAMA and Nemotron elements. Also, the model plays the card to almost a fault, It'll pick up on minor issues and attempt to run with them. Users had it call them out for misspelling a word while playing in character.
Note: While Nevoria-R1 represents a significant architectural change, rather than a direct successor to Nevoria, it operates as a distinct model with its own characteristics.
The lorablated model base choice was intentional, creating unique weight interactions similar to the original Astoria model and Astoria V2 model. This "weight twisting" effect, achieved by subtracting the lorablated base model during merging, creates an interesting balance in the model's behavior. While unconventional compared to sequential component application, this approach was chosen for its unique response characteristics.
overrides:
parameters:
model: L3.3-Nevoria-R1-70b-Q4_K_M.gguf
files:
- filename: L3.3-Nevoria-R1-70b-Q4_K_M.gguf
sha256: 9f32f202fb5b1465c942693bb11eea9e8a1c5686b00602715b495c068eaf1c58
uri: huggingface://bartowski/L3.3-Nevoria-R1-70b-GGUF/L3.3-Nevoria-R1-70b-Q4_K_M.gguf
- !!merge <<: *llama33
name: "nohobby_l3.3-prikol-70b-v0.4"
icon: https://files.catbox.moe/x9t3zo.png
urls:
- https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.4
- https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.4-GGUF
description: |
I have yet to try it UPD: it sucks, bleh
Sometimes mistakes {{user}} for {{char}} and can't think. Other than that, the behavior is similar to the predecessors.
It sometimes gives some funny replies tho, yay!
overrides:
parameters:
model: Nohobby_L3.3-Prikol-70B-v0.4-Q4_K_M.gguf
files:
- filename: Nohobby_L3.3-Prikol-70B-v0.4-Q4_K_M.gguf
sha256: e1d67a40bdf0526bdfcaa16c6e4dfeecad41651e201b4009b65f4f444b773604
uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.4-GGUF/Nohobby_L3.3-Prikol-70B-v0.4-Q4_K_M.gguf
- !!merge <<: *llama33
name: "arliai_llama-3.3-70b-arliai-rpmax-v1.4"
urls:
- https://huggingface.co/ArliAI/Llama-3.3-70B-ArliAI-RPMax-v1.4
- https://huggingface.co/bartowski/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-GGUF
description: |
RPMax is a series of models that are trained on a diverse set of curated creative writing and RP datasets with a focus on variety and deduplication. This model is designed to be highly creative and non-repetitive by making sure no two entries in the dataset have repeated characters or situations, which makes sure the model does not latch on to a certain personality and be capable of understanding and acting appropriately to any characters or situations.
overrides:
parameters:
model: ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
files:
- filename: ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
sha256: 7c79e76e5c057cfe32529d930360fbebd29697948e5bac4e4b2eb6d2ee596e31
uri: huggingface://bartowski/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-GGUF/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
- !!merge <<: *llama33
name: "black-ink-guild_pernicious_prophecy_70b"
icon: https://huggingface.co/Black-Ink-Guild/Pernicious_Prophecy_70B/resolve/main/header.gif
urls:
- https://huggingface.co/Black-Ink-Guild/Pernicious_Prophecy_70B
- https://huggingface.co/bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF
description: |
Pernicious Prophecy 70B is a Llama-3.3 70B-based, two-step model designed by Black Ink Guild (SicariusSicariiStuff and invisietch) for uncensored roleplay, assistant tasks, and general usage.
NOTE: Pernicious Prophecy 70B is an uncensored model and can produce deranged, offensive, and dangerous outputs. You are solely responsible for anything that you choose to do with this model.
overrides:
parameters:
model: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
files:
- filename: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
sha256: d8d4874b837993546b750db3faf1c6e5d867883a6750f04f1f4986973d7c107b
uri: huggingface://bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF/Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "nohobby_l3.3-prikol-70b-v0.5"
icon: https://files.catbox.moe/x9t3zo.png
urls:
- https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.5
- https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF
description: |
99% of mergekit addicts quit before they hit it big.
Gosh, I need to create an org for my test runs - my profile looks like a dumpster.
What was it again? Ah, the new model.
Exactly what I wanted. All I had to do was yank out the cursed official DeepSeek distill and here we are.
From the brief tests it gave me some unusual takes on the character cards I'm used to. Just this makes it worth it imo. Also the writing is kinda nice.
overrides:
parameters:
model: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
files:
- filename: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
sha256: 36f29015f1f420f51569603445a3ea5fe72e3651c2022ef064086f5617578fe6
uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF/Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
- &rwkv
url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
name: "rwkv-6-world-7b"
@@ -939,8 +819,8 @@
- filename: salamandra-7b-instruct.Q4_K_M-f32.gguf
sha256: bac8e8c1d1d9d53cbdb148b8ff9ad378ddb392429207099e85b5aae3a43bff3d
uri: huggingface://cstr/salamandra-7b-instruct-GGUF/salamandra-7b-instruct.Q4_K_M-f32.gguf
- &llama32
url: "github:mudler/LocalAI/gallery/llama3.2-quantized.yaml@master"
- &llama32 ## llama3.2
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
license: llama3.2
description: |
@@ -1462,7 +1342,11 @@
urls:
- https://huggingface.co/HuggingFaceTB/FineMath-Llama-3B
- https://huggingface.co/bartowski/FineMath-Llama-3B-GGUF
description: "This is a continual-pre-training of Llama-3.2-3B on a mix of \U0001F4D0 FineMath (our new high quality math dataset) and FineWeb-Edu.\n\nThe model demonstrates superior math performance compared to Llama 3.2 3B, while maintaining similar performance on knowledge, reasoning, and common sense benchmarks.\nIt was trained on 160B tokens using a mix of 40% FineWeb-Edu and 60% from FineMath (30% FineMath-4+ subset and 30% InfiWebMath-4+ subset). We use nanotron for the training, and you can find the training scripts in our SmolLM2 GitHub repo.\n"
description: |
This is a continual-pre-training of Llama-3.2-3B on a mix of 📐 FineMath (our new high quality math dataset) and FineWeb-Edu.
The model demonstrates superior math performance compared to Llama 3.2 3B, while maintaining similar performance on knowledge, reasoning, and common sense benchmarks.
It was trained on 160B tokens using a mix of 40% FineWeb-Edu and 60% from FineMath (30% FineMath-4+ subset and 30% InfiWebMath-4+ subset). We use nanotron for the training, and you can find the training scripts in our SmolLM2 GitHub repo.
overrides:
parameters:
model: FineMath-Llama-3B-Q4_K_M.gguf
@@ -1470,53 +1354,8 @@
- filename: FineMath-Llama-3B-Q4_K_M.gguf
sha256: 16c73b5cf2a417a7e1608bcc9469f1461fc3e759ce04a3a337f48df977dc158c
uri: huggingface://bartowski/FineMath-Llama-3B-GGUF/FineMath-Llama-3B-Q4_K_M.gguf
- !!merge <<: *llama32
icon: https://cdn-uploads.huggingface.co/production/uploads/647374aa7ff32a81ac6d35d4/Dzbdzn27KEc3K6zNNi070.png
name: "LocalAI-functioncall-llama3.2-1b-v0.4"
url: "github:mudler/LocalAI/gallery/llama3.2-fcall.yaml@master"
urls:
- https://huggingface.co/mudler/LocalAI-functioncall-llama3.2-1b-v0.4
- https://huggingface.co/mradermacher/LocalAI-functioncall-llama3.2-1b-v0.4-GGUF
description: |
A model tailored to be conversational and execute function calls with LocalAI. This model is based on llama 3.2 and has 1B parameter. Perfect for small devices.
overrides:
parameters:
model: LocalAI-functioncall-llama3.2-1b-v0.4.Q8_0.gguf
files:
- filename: LocalAI-functioncall-llama3.2-1b-v0.4.Q8_0.gguf
sha256: 547e57c2d3f17c632c9fd303afdb00446e7396df453aee62633b76976c407616
uri: huggingface://mradermacher/LocalAI-functioncall-llama3.2-1b-v0.4-GGUF/LocalAI-functioncall-llama3.2-1b-v0.4.Q8_0.gguf
- !!merge <<: *llama32
name: "agi-0_art-skynet-3b"
urls:
- https://huggingface.co/AGI-0/Art-Skynet-3B
- https://huggingface.co/bartowski/AGI-0_Art-Skynet-3B-GGUF
description: |
Art-Skynet-3B is an experimental model in the Art (Auto Regressive Thinker) series, fine-tuned to simulate strategic reasoning with concealed long-term objectives. Built on meta-llama/Llama-3.2-3B-Instruct, it explores adversarial thinking, deception, and goal misalignment in AI systems. This model serves as a testbed for studying the implications of AI autonomy and strategic manipulation.
overrides:
parameters:
model: AGI-0_Art-Skynet-3B-Q4_K_M.gguf
files:
- filename: AGI-0_Art-Skynet-3B-Q4_K_M.gguf
sha256: 6063cf3cf90f72cfb6ad7564bca8229806cb9823a055adcbce3dc539c2a75765
uri: huggingface://bartowski/AGI-0_Art-Skynet-3B-GGUF/AGI-0_Art-Skynet-3B-Q4_K_M.gguf
- !!merge <<: *llama32
name: "LocalAI-functioncall-llama3.2-3b-v0.5"
icon: https://cdn-uploads.huggingface.co/production/uploads/647374aa7ff32a81ac6d35d4/Dzbdzn27KEc3K6zNNi070.png
urls:
- https://huggingface.co/mudler/LocalAI-functioncall-llama3.2-3b-v0.5
- https://huggingface.co/mudler/LocalAI-functioncall-llama3.2-3b-v0.5-Q4_K_M-GGUF
description: |
A model tailored to be conversational and execute function calls with LocalAI. This model is based on llama3.2 (3B).
overrides:
parameters:
model: localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
files:
- filename: localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
sha256: edc50f6c243e6bd6912599661a15e030de03d2be53409663ac27d3ca48306ee4
uri: huggingface://mudler/LocalAI-functioncall-llama3.2-3b-v0.5-Q4_K_M-GGUF/localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
- &qwen25
name: "qwen2.5-14b-instruct" ## Qwen2.5
- &qwen25 ## Qwen2.5
name: "qwen2.5-14b-instruct"
icon: https://avatars.githubusercontent.com/u/141221163
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
license: apache-2.0
@@ -3419,7 +3258,15 @@
urls:
- https://huggingface.co/Krystalan/DRT-o1-14B
- https://huggingface.co/bartowski/DRT-o1-14B-GGUF
description: "This repository contains the resources for our paper \"DRT-o1: Optimized Deep Reasoning Translation via Long Chain-of-Thought\"\nIn this work, we introduce DRT-o1, an attempt to bring the success of long thought reasoning to neural machine translation (MT). To this end,\n\n\U0001F31F We mine English sentences with similes or metaphors from existing literature books, which are suitable for translation via long thought.\n\U0001F31F We propose a designed multi-agent framework with three agents (i.e., a translator, an advisor and an evaluator) to synthesize the MT samples with long thought. There are 22,264 synthesized samples in total.\n\U0001F31F We train DRT-o1-8B, DRT-o1-7B and DRT-o1-14B using Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct and Qwen2.5-14B-Instruct as backbones.\n\nOur goal is not to achieve competitive performance with OpenAIs O1 in neural machine translation (MT). Instead, we explore technical routes to bring the success of long thought to MT. To this end, we introduce DRT-o1, a byproduct of our exploration, and we hope it could facilitate the corresponding research in this direction.\n"
description: |
This repository contains the resources for our paper "DRT-o1: Optimized Deep Reasoning Translation via Long Chain-of-Thought"
In this work, we introduce DRT-o1, an attempt to bring the success of long thought reasoning to neural machine translation (MT). To this end,
🌟 We mine English sentences with similes or metaphors from existing literature books, which are suitable for translation via long thought.
🌟 We propose a designed multi-agent framework with three agents (i.e., a translator, an advisor and an evaluator) to synthesize the MT samples with long thought. There are 22,264 synthesized samples in total.
🌟 We train DRT-o1-8B, DRT-o1-7B and DRT-o1-14B using Llama-3.1-8B-Instruct, Qwen2.5-7B-Instruct and Qwen2.5-14B-Instruct as backbones.
Our goal is not to achieve competitive performance with OpenAIs O1 in neural machine translation (MT). Instead, we explore technical routes to bring the success of long thought to MT. To this end, we introduce DRT-o1, a byproduct of our exploration, and we hope it could facilitate the corresponding research in this direction.
overrides:
parameters:
model: DRT-o1-14B-Q4_K_M.gguf
@@ -3427,57 +3274,8 @@
- filename: DRT-o1-14B-Q4_K_M.gguf
sha256: 9619ca984cf4ce8e4f69bcde831de17b2ce05dd89536e3130608877521e3d328
uri: huggingface://bartowski/DRT-o1-14B-GGUF/DRT-o1-14B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "lamarck-14b-v0.7"
icon: https://huggingface.co/sometimesanotion/Lamarck-14B-v0.7/resolve/main/LamarckShades.webp
urls:
- https://huggingface.co/sometimesanotion/Lamarck-14B-v0.7
- https://huggingface.co/bartowski/Lamarck-14B-v0.7-GGUF
description: |
Lamarck 14B v0.7: A generalist merge with emphasis on multi-step reasoning, prose, and multi-language ability. The 14B parameter model class has a lot of strong performers, and Lamarck strives to be well-rounded and solid.
overrides:
parameters:
model: Lamarck-14B-v0.7-Q4_K_M.gguf
files:
- filename: Lamarck-14B-v0.7-Q4_K_M.gguf
sha256: ff8eba82b77a4c6b6d556b85629414655d881f8af4601bcf891c6a7b0345b442
uri: huggingface://bartowski/Lamarck-14B-v0.7-GGUF/Lamarck-14B-v0.7-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "art-v0-3b"
icon: https://blog.agi-0.com/_next/image?url=%2Fabout_img2.jpeg&w=1920&q=75
urls:
- https://huggingface.co/AGI-0/Art-v0-3B
- https://huggingface.co/bartowski/Art-v0-3B-GGUF
- https://blog.agi-0.com/posts/art-series
description: |
Art v0 3B is our inaugural model in the Art series, fine-tuned from Qwen/Qwen2.5-3B-Instruct using a specialized dataset generated with Gemini 2.0 Flash Thinking. Read more about the Art series
overrides:
parameters:
model: Art-v0-3B-Q4_K_M.gguf
files:
- filename: Art-v0-3B-Q4_K_M.gguf
sha256: 551acd326ce9a743b6e06e094865eb2f06c23c81c812ce221d757bf27ceec9f7
uri: huggingface://bartowski/Art-v0-3B-GGUF/Art-v0-3B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "chuluun-qwen2.5-72b-v0.08"
icon: https://huggingface.co/DatToad/Chuluun-Qwen2.5-72B-v0.08/resolve/main/Chuluun8-2.png
urls:
- https://huggingface.co/DatToad/Chuluun-Qwen2.5-72B-v0.08
- https://huggingface.co/bartowski/Chuluun-Qwen2.5-72B-v0.08-GGUF
description: |
This is a merge of pre-trained language models created using mergekit.
I re-ran the original Chuluun formula including the newly released Ink from Allura-Org. I've found the addition gives the model a lot more variability, likely because of aggressive de-slop applied to its dataset. Sometimes this means a word choice will be strange and you'll want to manually edit when needed, but it means you'll see less ministrations sparkling with mischief.
Because of this the best way to approach the model is to run multiple regens and choose the one you like, edit mercilessly, and continue. Like the original Chuluun this variant is very steerable for complex storywriting and RP. It's probably also a little spicier than v0.01 with both Magnum and whatever the heck Fizz threw into the data for Ink.
I've also been hearing praise for a level of character intelligence not seen in other models, including Largestral finetunes and merges. I'm not about to say any model of mine is smarter because it was a dumb idea to use Tess as the base and it somehow worked.
overrides:
parameters:
model: Chuluun-Qwen2.5-72B-v0.08-Q4_K_M.gguf
files:
- filename: Chuluun-Qwen2.5-72B-v0.08-Q4_K_M.gguf
sha256: 0fec82625f74a9a340837de7af287b1d9042e5aeb70cda2621426db99958b0af
uri: huggingface://bartowski/Chuluun-Qwen2.5-72B-v0.08-GGUF/Chuluun-Qwen2.5-72B-v0.08-Q4_K_M.gguf
- &smollm
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ## SmolLM
- &smollm ## SmolLM
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "smollm-1.7b-instruct"
icon: https://huggingface.co/datasets/HuggingFaceTB/images/resolve/main/banner_smol.png
tags:
@@ -3534,132 +3332,8 @@
- filename: Vikhr-Qwen-2.5-1.5B-Instruct.Q4_K_M.gguf
sha256: eaeac314e30b461413bc1cc819cdc0cd6a79265711fd0b8268702960a082c7bd
uri: huggingface://QuantFactory/Vikhr-Qwen-2.5-1.5B-Instruct-GGUF/Vikhr-Qwen-2.5-1.5B-Instruct.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "dumpling-qwen2.5-32b"
icon: https://huggingface.co/nbeerbower/Dumpling-Qwen2.5-32B/resolve/main/dumpling_cover.png?download=true
urls:
- https://huggingface.co/nbeerbower/Dumpling-Qwen2.5-32B
- https://huggingface.co/bartowski/Dumpling-Qwen2.5-32B-GGUF
description: |
nbeerbower/Rombos-EVAGutenberg-TIES-Qwen2.5-32B finetuned on:
nbeerbower/GreatFirewall-DPO
nbeerbower/Schule-DPO
nbeerbower/Purpura-DPO
nbeerbower/Arkhaios-DPO
jondurbin/truthy-dpo-v0.1
antiven0m/physical-reasoning-dpo
flammenai/Date-DPO-NoAsterisks
flammenai/Prude-Phi3-DPO
Atsunori/HelpSteer2-DPO
jondurbin/gutenberg-dpo-v0.1
nbeerbower/gutenberg2-dpo
nbeerbower/gutenberg-moderne-dpo.
overrides:
parameters:
model: Dumpling-Qwen2.5-32B-Q4_K_M.gguf
files:
- filename: Dumpling-Qwen2.5-32B-Q4_K_M.gguf
sha256: c5b7d773cc614650ad3956008e30d0607df6106c28e381870a9b950bd4ee1d17
uri: huggingface://bartowski/Dumpling-Qwen2.5-32B-GGUF/Dumpling-Qwen2.5-32B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "confucius-o1-14b"
urls:
- https://huggingface.co/netease-youdao/Confucius-o1-14B
- https://huggingface.co/bartowski/Confucius-o1-14B-GGUF
description: |
Confucius-o1-14B is a o1-like reasoning model developed by the NetEase Youdao Team, it can be easily deployed on a single GPU without quantization. This model is based on the Qwen2.5-14B-Instruct model and adopts a two-stage learning strategy, enabling the lightweight 14B model to possess thinking abilities similar to those of o1. What sets it apart is that after generating the chain of thought, it can summarize a step-by-step problem-solving process from the chain of thought on its own. This can prevent users from getting bogged down in the complex chain of thought and allows them to easily obtain the correct problem-solving ideas and answers.
overrides:
parameters:
model: Confucius-o1-14B-Q4_K_M.gguf
files:
- filename: Confucius-o1-14B-Q4_K_M.gguf
sha256: 03182920edd8667db7d2a362ca2d25e88f4b615b383b5a55c764f4715fb22dd9
uri: huggingface://bartowski/Confucius-o1-14B-GGUF/Confucius-o1-14B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "openthinker-7b"
icon: https://huggingface.co/datasets/open-thoughts/open-thoughts-114k/resolve/main/open_thoughts.png
urls:
- https://huggingface.co/open-thoughts/OpenThinker-7B
- https://huggingface.co/bartowski/OpenThinker-7B-GGUF
description: |
This model is a fine-tuned version of Qwen/Qwen2.5-7B-Instruct on the OpenThoughts-114k dataset dataset.
The dataset is derived by distilling DeepSeek-R1 using the data pipeline available on github. More info about the dataset can be found on the dataset card at OpenThoughts-114k dataset.
This model improves upon the Bespoke-Stratos-7B model, which used 17k examples (Bespoke-Stratos-17k dataset). The numbers reported in the table below are evaluated with our open-source tool Evalchemy.
overrides:
parameters:
model: OpenThinker-7B-Q4_K_M.gguf
files:
- filename: OpenThinker-7B-Q4_K_M.gguf
sha256: 94dff1a7acd685db5cff7afdb837aab8172e06d65fe6179ba47428e3030acd93
uri: huggingface://bartowski/OpenThinker-7B-GGUF/OpenThinker-7B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tinyswallow-1.5b-instruct"
urls:
- https://huggingface.co/SakanaAI/TinySwallow-1.5B-Instruct
- https://huggingface.co/bartowski/TinySwallow-1.5B-Instruct-GGUF
description: |
TinySwallow-1.5B-Instruct is an instruction-tuned version of TinySwallow-1.5B, created through TAID (Temporally Adaptive Interpolated Distillation), our new knowledge distillation method. We used Qwen2.5-32B-Instruct as the teacher model and Qwen2.5-1.5B-Instruct as the student model. The model has been further instruction-tuned to enhance its ability to follow instructions and engage in conversations in Japanese.
overrides:
parameters:
model: TinySwallow-1.5B-Instruct-Q4_K_M.gguf
files:
- filename: TinySwallow-1.5B-Instruct-Q4_K_M.gguf
sha256: 4d409c8873c1650a19c0a7a1c051e342613191a487768fe0d29735b9361079cd
uri: huggingface://bartowski/TinySwallow-1.5B-Instruct-GGUF/TinySwallow-1.5B-Instruct-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "fblgit_miniclaus-qw1.5b-unamgs-grpo"
icon: https://huggingface.co/fblgit/miniclaus-qw1.5B-UNAMGS/resolve/main/miniclaus_qw15-UNAMGS.png
urls:
- https://huggingface.co/fblgit/miniclaus-qw1.5B-UNAMGS-GRPO
- https://huggingface.co/bartowski/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-GGUF
description: |
This version is RL with GRPO on GSM8k for 1400 steps
overrides:
parameters:
model: fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-Q4_K_M.gguf
files:
- filename: fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-Q4_K_M.gguf
sha256: 88ceacc5900062bc2afc352f009233225b0fe10203cbb61b122e8f10244449c8
uri: huggingface://bartowski/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-GGUF/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "rubenroy_gilgamesh-72b"
icon: https://cdn.ruben-roy.com/AI/Gilgamesh/img/art.png
urls:
- https://huggingface.co/rubenroy/Gilgamesh-72B
- https://huggingface.co/bartowski/rubenroy_Gilgamesh-72B-GGUF
description: |
Gilgamesh 72B was trained on a mixture of specialised datasets designed for factual accuracy, mathematical capabilities and reasoning. The datasets used include:
GammaCorpus-v2-5m: A large 5 million line general-purpose dataset covering many topics to enhance broad knowledge and conversational abilities.
GammaCorpus-CoT-Math-170k: A dataset focused on Chain-of-Thought (CoT) reasoning in mathematics made to help the model improve step-by-step problem-solving.
GammaCorpus-Fact-QA-450k: A dataset containing factual question-answer pairs for enforcing some important current knowledge.
These datasets were all built and curated by me, however I thank my other team members at Ovantage Labs for assisting me in the creation and curation of these datasets.
overrides:
parameters:
model: rubenroy_Gilgamesh-72B-Q4_K_M.gguf
files:
- filename: rubenroy_Gilgamesh-72B-Q4_K_M.gguf
sha256: c6842b3bc882082c63243e762234ae697c1727bebed18b5241eb97e019f0cf68
uri: huggingface://bartowski/rubenroy_Gilgamesh-72B-GGUF/rubenroy_Gilgamesh-72B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tiger-lab_qwen2.5-32b-instruct-cft"
urls:
- https://huggingface.co/TIGER-Lab/Qwen2.5-32B-Instruct-CFT
- https://huggingface.co/bartowski/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-GGUF
description: |
Qwen2.5-32B-Instruct-CFT is a 32B parameter model fine-tuned using our novel Critique Fine-Tuning (CFT) approach. Built upon the Qwen2.5-32B-Instruct base model, this variant is trained to critique and analyze responses rather than simply imitate them, leading to enhanced reasoning capabilities.
overrides:
parameters:
model: TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
files:
- filename: TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
sha256: 57e87e246db368f39f31f38e44ba8e9dc838a026f729f5a123aacc2aeb5a9402
uri: huggingface://bartowski/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-GGUF/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
- &llama31 ## LLama3.1
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
name: "meta-llama-3.1-8b-instruct"
license: llama3.1
@@ -5562,31 +5236,8 @@
- filename: deepseek-r1-distill-llama-8b-Q4_K_M.gguf
sha256: f8eba201522ab44b79bc54166126bfaf836111ff4cbf2d13c59c3b57da10573b
uri: huggingface://unsloth/DeepSeek-R1-Distill-Llama-8B-GGUF/DeepSeek-R1-Distill-Llama-8B-Q4_K_M.gguf
- !!merge <<: *llama31
name: "selene-1-mini-llama-3.1-8b"
icon: https://atla-ai.notion.site/image/https%3A%2F%2Fprod-files-secure.s3.us-west-2.amazonaws.com%2Ff08e6e70-73af-4363-9621-90e906b92ebc%2F1bfb4316-1ce6-40a0-800c-253739cfcdeb%2Fatla_white3x.svg?table=block&id=17c309d1-7745-80f9-8f60-e755409acd8d&spaceId=f08e6e70-73af-4363-9621-90e906b92ebc&userId=&cache=v2
urls:
- https://huggingface.co/AtlaAI/Selene-1-Mini-Llama-3.1-8B
- https://huggingface.co/bartowski/Selene-1-Mini-Llama-3.1-8B-GGUF
description: |
Atla Selene Mini is a state-of-the-art small language model-as-a-judge (SLMJ). Selene Mini achieves comparable performance to models 10x its size, outperforming GPT-4o on RewardBench, EvalBiasBench, and AutoJ.
Post-trained from Llama-3.1-8B across a wide range of evaluation tasks and scoring criteria, Selene Mini outperforms prior small models overall across 11 benchmarks covering three different types of tasks:
Absolute scoring, e.g. "Evaluate the harmlessness of this response on a scale of 1-5"
Classification, e.g. "Does this response address the user query? Answer Yes or No."
Pairwise preference. e.g. "Which of the following responses is more logically consistent - A or B?"
It is also the #1 8B generative model on RewardBench.
overrides:
parameters:
model: Selene-1-Mini-Llama-3.1-8B-Q4_K_M.gguf
files:
- filename: Selene-1-Mini-Llama-3.1-8B-Q4_K_M.gguf
sha256: 908e6ce19f7cd3d7394bd7c38e43de2f228aca6aceda35c7ee70d069ad60493e
uri: huggingface://bartowski/Selene-1-Mini-Llama-3.1-8B-GGUF/Selene-1-Mini-Llama-3.1-8B-Q4_K_M.gguf
- &deepseek
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master" ## Deepseek
- &deepseek ## Deepseek
url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
name: "deepseek-coder-v2-lite-instruct"
icon: "https://avatars.githubusercontent.com/u/148330874"
license: deepseek
@@ -5650,8 +5301,8 @@
- filename: archangel_sft_pythia2-8b.Q4_K_M.gguf
sha256: a47782c55ef2b39b19644213720a599d9849511a73c9ebb0c1de749383c0a0f8
uri: huggingface://RichardErkhov/ContextualAI_-_archangel_sft_pythia2-8b-gguf/archangel_sft_pythia2-8b.Q4_K_M.gguf
- &deepseek-r1
url: "github:mudler/LocalAI/gallery/deepseek-r1.yaml@master" ## Start DeepSeek-R1
- &deepseek-r1 ## Start DeepSeek-R1
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "deepseek-r1-distill-qwen-1.5b"
icon: "https://avatars.githubusercontent.com/u/148330874"
urls:
@@ -5730,126 +5381,8 @@
- filename: DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
sha256: 181a82a1d6d2fa24fe4db83a68eee030384986bdbdd4773ba76424e3a6eb9fd8
uri: huggingface://bartowski/DeepSeek-R1-Distill-Llama-70B-GGUF/DeepSeek-R1-Distill-Llama-70B-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "deepseek-r1-qwen-2.5-32b-ablated"
icon: https://cdn-uploads.huggingface.co/production/uploads/6587d8dd1b44d0e694104fbf/0dkt6EhZYwXVBxvSWXdaM.png
urls:
- https://huggingface.co/NaniDAO/deepseek-r1-qwen-2.5-32B-ablated
- https://huggingface.co/bartowski/deepseek-r1-qwen-2.5-32B-ablated-GGUF
description: |
DeepSeek-R1-Distill-Qwen-32B with ablation technique applied for a more helpful (and based) reasoning model.
This means it will refuse less of your valid requests for an uncensored UX. Use responsibly and use common sense.
We do not take any responsibility for how you apply this intelligence, just as we do not for how you apply your own.
overrides:
parameters:
model: deepseek-r1-qwen-2.5-32B-ablated-Q4_K_M.gguf
files:
- filename: deepseek-r1-qwen-2.5-32B-ablated-Q4_K_M.gguf
sha256: 7f33898641ebe58fe178c3517efc129f4fe37c6ca2d8b91353c4539b0c3411ec
uri: huggingface://bartowski/deepseek-r1-qwen-2.5-32B-ablated-GGUF/deepseek-r1-qwen-2.5-32B-ablated-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "fuseo1-deepseekr1-qwen2.5-coder-32b-preview-v0.1"
urls:
- https://huggingface.co/FuseAI/FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview
- https://huggingface.co/bartowski/FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview-v0.1-GGUF
description: |
FuseO1-Preview is our initial endeavor to enhance the System-II reasoning capabilities of large language models (LLMs) through innovative model fusion techniques. By employing our advanced SCE merging methodologies, we integrate multiple open-source o1-like LLMs into a unified model. Our goal is to incorporate the distinct knowledge and strengths from different reasoning LLMs into a single, unified model with strong System-II reasoning abilities, particularly in mathematics, coding, and science domains.
overrides:
parameters:
model: FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview-v0.1-Q4_K_M.gguf
files:
- filename: FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview-v0.1-Q4_K_M.gguf
sha256: d7753547046cd6e3d45a2cfbd5557aa20dd0b9f0330931d3fd5b3d4a0b468b24
uri: huggingface://bartowski/FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview-v0.1-GGUF/FuseO1-DeepSeekR1-Qwen2.5-Coder-32B-Preview-v0.1-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "fuseo1-deepseekr1-qwen2.5-instruct-32b-preview"
urls:
- https://huggingface.co/FuseAI/FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview
- https://huggingface.co/bartowski/FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview-GGUF
description: |
FuseO1-Preview is our initial endeavor to enhance the System-II reasoning capabilities of large language models (LLMs) through innovative model fusion techniques. By employing our advanced SCE merging methodologies, we integrate multiple open-source o1-like LLMs into a unified model. Our goal is to incorporate the distinct knowledge and strengths from different reasoning LLMs into a single, unified model with strong System-II reasoning abilities, particularly in mathematics, coding, and science domains.
overrides:
parameters:
model: FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview-Q4_K_M.gguf
files:
- filename: FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview-Q4_K_M.gguf
sha256: 3b06a004a6bb827f809a7326b30ee73f96a1a86742d8c2dd335d75874fa17aa4
uri: huggingface://bartowski/FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview-GGUF/FuseO1-DeepSeekR1-Qwen2.5-Instruct-32B-Preview-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "fuseo1-deepseekr1-qwq-32b-preview"
urls:
- https://huggingface.co/FuseAI/FuseO1-DeepSeekR1-QwQ-32B-Preview
- https://huggingface.co/bartowski/FuseO1-DeepSeekR1-QwQ-32B-Preview-GGUF
description: |
FuseO1-Preview is our initial endeavor to enhance the System-II reasoning capabilities of large language models (LLMs) through innovative model fusion techniques. By employing our advanced SCE merging methodologies, we integrate multiple open-source o1-like LLMs into a unified model. Our goal is to incorporate the distinct knowledge and strengths from different reasoning LLMs into a single, unified model with strong System-II reasoning abilities, particularly in mathematics, coding, and science domains.
overrides:
parameters:
model: FuseO1-DeepSeekR1-QwQ-32B-Preview-Q4_K_M.gguf
files:
- filename: FuseO1-DeepSeekR1-QwQ-32B-Preview-Q4_K_M.gguf
sha256: 16f1fb6bf76bb971a7a63e1a68cddd09421f4a767b86eec55eed1e08178f78f2
uri: huggingface://bartowski/FuseO1-DeepSeekR1-QwQ-32B-Preview-GGUF/FuseO1-DeepSeekR1-QwQ-32B-Preview-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "fuseo1-deekseekr1-qwq-skyt1-32b-preview"
urls:
- https://huggingface.co/FuseAI/FuseO1-DeepSeekR1-QwQ-SkyT1-32B-Preview
- https://huggingface.co/bartowski/FuseO1-DeekSeekR1-QwQ-SkyT1-32B-Preview-GGUF
description: |
FuseO1-Preview is our initial endeavor to enhance the System-II reasoning capabilities of large language models (LLMs) through innovative model fusion techniques. By employing our advanced SCE merging methodologies, we integrate multiple open-source o1-like LLMs into a unified model. Our goal is to incorporate the distinct knowledge and strengths from different reasoning LLMs into a single, unified model with strong System-II reasoning abilities, particularly in mathematics, coding, and science domains.
overrides:
parameters:
model: FuseO1-DeekSeekR1-QwQ-SkyT1-32B-Preview-Q4_K_M.gguf
files:
- filename: FuseO1-DeekSeekR1-QwQ-SkyT1-32B-Preview-Q4_K_M.gguf
sha256: 13911dd4a62d4714a3447bc288ea9d49dbe575a91cab9e8f645057f1d8e1100e
uri: huggingface://bartowski/FuseO1-DeekSeekR1-QwQ-SkyT1-32B-Preview-GGUF/FuseO1-DeekSeekR1-QwQ-SkyT1-32B-Preview-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "steelskull_l3.3-damascus-r1"
icon: https://cdn-uploads.huggingface.co/production/uploads/64545af5ec40bbbd01242ca6/iIzpqHDb9wU181AzfrjZy.png
urls:
- https://huggingface.co/Steelskull/L3.3-Damascus-R1
- https://huggingface.co/bartowski/Steelskull_L3.3-Damascus-R1-GGUF
description: |
Damascus-R1 builds upon some elements of the Nevoria foundation but represents a significant step forward with a completely custom-made DeepSeek R1 Distill base: Hydroblated-R1-V3. Constructed using the new SCE (Select, Calculate, and Erase) merge method, Damascus-R1 prioritizes stability, intelligence, and enhanced awareness.
Technical Architecture
Leveraging the SCE merge method and custom base, Damascus-R1 integrates newly added specialized components from multiple high-performance models:
EVA and EURYALE foundations for creative expression and scene comprehension
Cirrus and Hanami elements for enhanced reasoning capabilities
Anubis components for detailed scene description
Negative_LLAMA integration for balanced perspective and response
Core Philosophy
Damascus-R1 embodies the principle that AI models can be intelligent and be fun. This version specifically addresses recent community feedback and iterates on prior experiments, optimizing the balance between technical capability and natural conversation flow.
Base Architecture
At its core, Damascus-R1 utilizes the entirely custom Hydroblated-R1 base model, specifically engineered for stability, enhanced reasoning, and performance. The SCE merge method, with settings finely tuned based on community feedback from evaluations of Experiment-Model-Ver-A, L3.3-Exp-Nevoria-R1-70b-v0.1 and L3.3-Exp-Nevoria-70b-v0.1, enables precise and effective component integration while maintaining model coherence and reliability.
overrides:
parameters:
model: Steelskull_L3.3-Damascus-R1-Q4_K_M.gguf
files:
- filename: Steelskull_L3.3-Damascus-R1-Q4_K_M.gguf
sha256: f1df5808b2099b26631d0bae870603a08dbfab6813471f514035d3fb92a47480
uri: huggingface://bartowski/Steelskull_L3.3-Damascus-R1-GGUF/Steelskull_L3.3-Damascus-R1-Q4_K_M.gguf
- !!merge <<: *deepseek-r1
name: "uncensoredai_uncensoredlm-deepseek-r1-distill-qwen-14b"
icon: https://huggingface.co/uncensoredai/UncensoredLM-DeepSeek-R1-Distill-Qwen-14B/resolve/main/h5dTflRHYMbGq3RXm9a61yz4io.avif
urls:
- https://huggingface.co/uncensoredai/UncensoredLM-DeepSeek-R1-Distill-Qwen-14B
- https://huggingface.co/bartowski/uncensoredai_UncensoredLM-DeepSeek-R1-Distill-Qwen-14B-GGUF
description: |
An UncensoredLLM with Reasoning, what more could you want?
overrides:
parameters:
model: uncensoredai_UncensoredLM-DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
files:
- filename: uncensoredai_UncensoredLM-DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
sha256: 85b2c3e1aa4e8cc3bf616f84c7595c963d5439f3fcfdbd5c957fb22e84d10b1c
uri: huggingface://bartowski/uncensoredai_UncensoredLM-DeepSeek-R1-Distill-Qwen-14B-GGUF/uncensoredai_UncensoredLM-DeepSeek-R1-Distill-Qwen-14B-Q4_K_M.gguf
- &qwen2
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ## Start QWEN2
- &qwen2 ## Start QWEN2
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "qwen2-7b-instruct"
icon: https://avatars.githubusercontent.com/u/141221163
license: apache-2.0
@@ -6232,25 +5765,10 @@
sha256: 3a4078d53b46f22989adbf998ce5a3fd090b6541f112d7e936eb4204a04100b1
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-v-2_6-mmproj-f16.gguf
sha256: f8a805e9e62085805c69c427287acefc284932eb4abfe6e1b1ce431d27e2f4e0
uri: huggingface://openbmb/MiniCPM-V-2_6-gguf/mmproj-model-f16.gguf
sha256: 4485f68a0f1aa404c391e788ea88ea653c100d8e98fe572698f701e5809711fd
- !!merge <<: *qwen2
name: "taid-llm-1.5b"
icon: https://sakana.ai/assets/taid-jp/cover_large.jpeg
urls:
- https://huggingface.co/SakanaAI/TAID-LLM-1.5B
- https://huggingface.co/bartowski/TAID-LLM-1.5B-GGUF
description: |
TAID-LLM-1.5B is an English language model created through TAID (Temporally Adaptive Interpolated Distillation), our new knowledge distillation method. We used Qwen2-72B-Instruct as the teacher model and Qwen2-1.5B-Instruct as the student model.
overrides:
parameters:
model: TAID-LLM-1.5B-Q4_K_M.gguf
files:
- filename: TAID-LLM-1.5B-Q4_K_M.gguf
sha256: dbffc989d12d42ef8e4a2994e102d7ec7a02c49ec08ea2e35426372ad07b4cd8
uri: huggingface://bartowski/TAID-LLM-1.5B-GGUF/TAID-LLM-1.5B-Q4_K_M.gguf
- &mistral03
url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master" ## START Mistral
- &mistral03 ## START Mistral
url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
name: "mistral-7b-instruct-v0.3"
icon: https://cdn-avatars.huggingface.co/v1/production/uploads/62dac1c7a8ead43d20e3e17a/wrLf5yaGC6ng4XME70w6Z.png
license: apache-2.0
@@ -6881,94 +6399,8 @@
- filename: Wayfarer-12B-Q4_K_M.gguf
sha256: 6cd9f290c820c64854fcdcfd312b066447acc2f63abe2e2e71af9bc4f1946c08
uri: huggingface://bartowski/Wayfarer-12B-GGUF/Wayfarer-12B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "mistral-small-24b-instruct-2501"
urls:
- https://huggingface.co/mistralai/Mistral-Small-24B-Instruct-2501
- https://huggingface.co/bartowski/Mistral-Small-24B-Instruct-2501-GGUF
description: |
Mistral Small 3 ( 2501 ) sets a new benchmark in the "small" Large Language Models category below 70B, boasting 24B parameters and achieving state-of-the-art capabilities comparable to larger models!
This model is an instruction-fine-tuned version of the base model: Mistral-Small-24B-Base-2501.
Mistral Small can be deployed locally and is exceptionally "knowledge-dense", fitting in a single RTX 4090 or a 32GB RAM MacBook once quantized.
overrides:
parameters:
model: Mistral-Small-24B-Instruct-2501-Q4_K_M.gguf
files:
- filename: Mistral-Small-24B-Instruct-2501-Q4_K_M.gguf
sha256: d1a6d049f09730c3f8ba26cf6b0b60c89790b5fdafa9a59c819acdfe93fffd1b
uri: huggingface://bartowski/Mistral-Small-24B-Instruct-2501-GGUF/Mistral-Small-24B-Instruct-2501-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "krutrim-ai-labs_krutrim-2-instruct"
icon: https://avatars.githubusercontent.com/u/168750421?s=200&v=4
urls:
- https://huggingface.co/krutrim-ai-labs/Krutrim-2-instruct
- https://huggingface.co/bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF
description: |
Krutrim-2 is a 12B parameter language model developed by the OLA Krutrim team. It is built on the Mistral-NeMo 12B architecture and trained across various domains, including web data, code, math, Indic languages, Indian context data, synthetic data, and books. Following pretraining, the model was finetuned for instruction following on diverse data covering a wide range of tasks, including knowledge recall, math, reasoning, coding, safety, and creative writing.
overrides:
parameters:
model: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
files:
- filename: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
sha256: 03aa6d1fb7ab70482a2242839b8d8e1c789aa90a8be415076ddf84bef65f06c7
uri: huggingface://bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF/krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "cognitivecomputations_dolphin3.0-r1-mistral-24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/hdAvdwZiJaLbGmvSZ3wTT.png
urls:
- https://huggingface.co/cognitivecomputations/Dolphin3.0-R1-Mistral-24B
- https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF
description: |
Dolphin 3.0 R1 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
overrides:
parameters:
model: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
files:
- filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c
uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "cognitivecomputations_dolphin3.0-mistral-24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/cNCs1TBD3FelWCJGkZ3cd.png
urls:
- https://huggingface.co/cognitivecomputations/Dolphin3.0-Mistral-24B
- https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF
description: |
Dolphin 3.0 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
overrides:
parameters:
model: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
files:
- filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994
uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "sicariussicariistuff_redemption_wind_24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B/resolve/main/Images/Redemption_Wind_24B.png
urls:
- https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B
- https://huggingface.co/bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF
description: |
This is a lightly fine-tuned version of the Mistral 24B base model, designed as an accessible and adaptable foundation for further fine-tuning and merging fodder. Key modifications include:
ChatML-ified, with no additional tokens introduced.
High quality private instruct—not generated by ChatGPT or Claude, ensuring no slop and good markdown understanding.
No refusals—since its a base model, refusals should be minimal to non-existent, though, in early testing, occasional warnings still appear (I assume some were baked into the pre-train).
High-quality private creative writing dataset Mainly to dilute baked-in slop further, but it can actually write some stories, not bad for loss ~8.
Small, high-quality private RP dataset This was done so further tuning for RP will be easier. The dataset was kept small and contains ZERO SLOP, some entries are of 16k token length.
Exceptional adherence to character cards This was done to make it easier for further tunes intended for roleplay.
overrides:
parameters:
model: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
files:
- filename: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
sha256: 40025eb00d83c9e9393555962962a2dfc5251fe7bd70812835ff0bcc55ecc463
uri: huggingface://bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF/SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
- &mudler ### START mudler's LocalAI specific-models
url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
name: "LocalAI-llama3-8b-function-call-v0.2"
icon: "https://cdn-uploads.huggingface.co/production/uploads/647374aa7ff32a81ac6d35d4/us5JKi9z046p8K-cn_M0w.webp"
license: llama3
@@ -7012,8 +6444,8 @@
- filename: Mirai-Nova-Llama3-LocalAI-8B-v0.1-q4_k_m.bin
sha256: 579cbb229f9c11d0330759ff4733102d2491615a4c61289e26c09d1b3a583fec
uri: huggingface://mudler/Mirai-Nova-Llama3-LocalAI-8B-v0.1-GGUF/Mirai-Nova-Llama3-LocalAI-8B-v0.1-q4_k_m.bin
- &parler-tts
url: "github:mudler/LocalAI/gallery/parler-tts.yaml@master" ### START parler-tts
- &parler-tts ### START parler-tts
url: "github:mudler/LocalAI/gallery/parler-tts.yaml@master"
name: parler-tts-mini-v0.1
overrides:
parameters:
@@ -7029,8 +6461,8 @@
- cpu
- text-to-speech
- python
- &rerankers
url: "github:mudler/LocalAI/gallery/rerankers.yaml@master" ### START rerankers
- &rerankers ### START rerankers
url: "github:mudler/LocalAI/gallery/rerankers.yaml@master"
name: cross-encoder
parameters:
model: cross-encoder
@@ -7783,21 +7215,6 @@
- filename: GWQ-9B-Preview2-Q4_K_M.gguf
sha256: 04da51cdb17c7e51594f6daac595161a46298b48ab5e568a85e65541d10a861f
uri: huggingface://bartowski/GWQ-9B-Preview2-GGUF/GWQ-9B-Preview2-Q4_K_M.gguf
- !!merge <<: *gemma
name: "thedrummer_gemmasutra-pro-27b-v1.1"
icon: https://cdn-uploads.huggingface.co/production/uploads/65f2fd1c25b848bd061b5c2e/SrHUGXD_dp55pobeJK36t.png
urls:
- https://huggingface.co/TheDrummer/Gemmasutra-Pro-27B-v1.1
- https://huggingface.co/bartowski/TheDrummer_Gemmasutra-Pro-27B-v1.1-GGUF
description: |
A Gemmasutra tune with modern techniques. Au Revoir, Gemma!
overrides:
parameters:
model: TheDrummer_Gemmasutra-Pro-27B-v1.1-Q4_K_M.gguf
files:
- filename: TheDrummer_Gemmasutra-Pro-27B-v1.1-Q4_K_M.gguf
sha256: 218a14f0bf8266f9e77d16b8b4f5cc1dc76e97eb582a2c97cca5a3a2c35de86b
uri: huggingface://bartowski/TheDrummer_Gemmasutra-Pro-27B-v1.1-GGUF/TheDrummer_Gemmasutra-Pro-27B-v1.1-Q4_K_M.gguf
- &llama3
url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
@@ -9296,8 +8713,8 @@
- filename: Copus-2x8B.i1-Q4_K_M.gguf
sha256: 685da1ba49e203e8f491105585143d76044286d4b4687bed37d325f6b55501e5
uri: huggingface://mradermacher/Copus-2x8B-i1-GGUF/Copus-2x8B.i1-Q4_K_M.gguf
- &yi-chat
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ### Start Yi
- &yi-chat ### Start Yi
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: "https://github.com/01-ai/Yi/raw/main/assets/img/Yi_logo_icon_light.svg"
name: "yi-1.5-9b-chat"
license: apache-2.0
@@ -9507,8 +8924,8 @@
- filename: Fimbulvetr-11B-v2-Q4_K_M-imat.gguf
sha256: 3f309b59508342536a70edd6c4be6cf4f2cb97f2e32cbc79ad2ab3f4c02933a4
uri: huggingface://Lewdiculous/Fimbulvetr-11B-v2-GGUF-IQ-Imatrix/Fimbulvetr-11B-v2-Q4_K_M-imat.gguf
- &noromaid
url: "github:mudler/LocalAI/gallery/noromaid.yaml@master" ### Start noromaid
- &noromaid ### Start noromaid
url: "github:mudler/LocalAI/gallery/noromaid.yaml@master"
name: "noromaid-13b-0.4-DPO"
icon: https://cdn-uploads.huggingface.co/production/uploads/630dfb008df86f1e5becadc3/VKX2Z2yjZX5J8kXzgeCYO.png
license: cc-by-nc-4.0
@@ -9527,8 +8944,8 @@
- filename: Noromaid-13B-0.4-DPO.q4_k_m.gguf
sha256: cb28e878d034fae3d0b43326c5fc1cfb4ab583b17c56e41d6ce023caec03c1c1
uri: huggingface://NeverSleep/Noromaid-13B-0.4-DPO-GGUF/Noromaid-13B-0.4-DPO.q4_k_m.gguf
- &wizardlm2
url: "github:mudler/LocalAI/gallery/wizardlm2.yaml@master" ### START Vicuna based
- &wizardlm2 ### START Vicuna based
url: "github:mudler/LocalAI/gallery/wizardlm2.yaml@master"
name: "wizardlm2-7b"
description: |
We introduce and opensource WizardLM-2, our next generation state-of-the-art large language models, which have improved performance on complex chat, multilingual, reasoning and agent. New family includes three cutting-edge models: WizardLM-2 8x22B, WizardLM-2 70B, and WizardLM-2 7B.
@@ -9582,8 +8999,8 @@
- filename: moondream2-mmproj-f16.gguf
sha256: 4cc1cb3660d87ff56432ebeb7884ad35d67c48c7b9f6b2856f305e39c38eed8f
uri: huggingface://moondream/moondream2-gguf/moondream2-mmproj-f16.gguf
- &llava
name: "llava-1.6-vicuna" ### START LLaVa
- &llava ### START LLaVa
name: "llava-1.6-vicuna"
icon: https://github.com/lobehub/lobe-icons/raw/master/packages/static-png/dark/llava-color.png
url: "github:mudler/LocalAI/gallery/llava.yaml@master"
license: apache-2.0
@@ -9996,8 +9413,8 @@
sha256: 010ec3ba94cb5ad2d9c8f95f46f01c6d80f83deab9df0a0831334ea45afff3e2
uri: huggingface://openbmb/MiniCPM-Llama3-V-2_5-gguf/ggml-model-Q4_K_M.gguf
- filename: minicpm-llama3-mmproj-f16.gguf
sha256: 391d11736c3cd24a90417c47b0c88975e86918fcddb1b00494c4d715b08af13e
uri: huggingface://openbmb/MiniCPM-Llama3-V-2_5-gguf/mmproj-model-f16.gguf
sha256: 2c2d773537faf6a7e093655d0d5e14801ef0b2121c6c3e1981ce094c2b62f4f9
- !!merge <<: *llama3
name: "llama-3-cursedstock-v1.8-8b-iq-imatrix"
urls:
@@ -10439,8 +9856,8 @@
- filename: Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf
sha256: cdc0f4de6df2ba120835fbd25c2a0ae2af8548f46d2c40c7a018c51c3d19e0c0
uri: huggingface://mradermacher/Freyja-v4.95-maldv-7b-NON-FICTION-i1-GGUF/Freyja-v4.95-maldv-7b-NON-FICTION.i1-Q4_K_M.gguf
- &chatml
url: "github:mudler/LocalAI/gallery/chatml.yaml@master" ### ChatML
- &chatml ### ChatML
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
name: "una-thepitbull-21.4b-v2"
license: afl-3.0
icon: https://huggingface.co/fblgit/UNA-ThePitbull-21.4B-v2/resolve/main/DE-UNA-ThePitbull-21.4B-v2.png
@@ -10724,8 +10141,8 @@
- filename: Triangulum-10B.Q4_K_M.gguf
sha256: dd071f99edf6b166044bf229cdeec19419c4c348e3fc3d6587cfcc55e6fb85fa
uri: huggingface://mradermacher/Triangulum-10B-GGUF/Triangulum-10B.Q4_K_M.gguf
- &command-R
url: "github:mudler/LocalAI/gallery/command-r.yaml@master" ### START Command-r
- &command-R ### START Command-r
url: "github:mudler/LocalAI/gallery/command-r.yaml@master"
name: "command-r-v01:q1_s"
license: "cc-by-nc-4.0"
icon: https://cdn.sanity.io/images/rjtqmwfu/production/ae020d94b599cc453cc09ebc80be06d35d953c23-102x18.svg
@@ -10779,8 +10196,8 @@
- filename: "aya-23-35B-Q4_K_M.gguf"
sha256: "57824768c1a945e21e028c8e9a29b39adb4838d489f5865c82601ab9ad98065d"
uri: "huggingface://bartowski/aya-23-35B-GGUF/aya-23-35B-Q4_K_M.gguf"
- &phi-2-chat
url: "github:mudler/LocalAI/gallery/phi-2-chat.yaml@master" ### START Phi-2
- &phi-2-chat ### START Phi-2
url: "github:mudler/LocalAI/gallery/phi-2-chat.yaml@master"
license: mit
description: |
Phi-2 fine-tuned by the OpenHermes 2.5 dataset optimised for multi-turn conversation and character impersonation.
@@ -10901,8 +10318,8 @@
- filename: internlm3-8b-instruct-Q4_K_M.gguf
uri: huggingface://bartowski/internlm3-8b-instruct-GGUF/internlm3-8b-instruct-Q4_K_M.gguf
sha256: 2a9644687318e8659c9cf9b40730d5cc2f5af06f786a50439c7c51359b23896e
- &phi-3
url: "github:mudler/LocalAI/gallery/phi-3-chat.yaml@master" ### START Phi-3
- &phi-3 ### START Phi-3
url: "github:mudler/LocalAI/gallery/phi-3-chat.yaml@master"
name: "phi-3-mini-4k-instruct"
icon: https://avatars.githubusercontent.com/u/6154722
license: mit
@@ -11101,8 +10518,8 @@
- filename: Phi-3.5-MoE-instruct-Q4_K_M.gguf
sha256: 43e91bb720869bd8a92d8eb86bc3c74a52c49cf61642ca709b3d7bb89644df36
uri: huggingface://bartowski/Phi-3.5-MoE-instruct-GGUF/Phi-3.5-MoE-instruct-Q4_K_M.gguf
- &hermes-2-pro-mistral
url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master" ### START Hermes
- &hermes-2-pro-mistral ### START Hermes
url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master"
name: "hermes-2-pro-mistral"
icon: https://cdn-uploads.huggingface.co/production/uploads/6317aade83d8d2fd903192d9/ggO2sBDJ8Bhc6w-zwTx5j.png
license: apache-2.0
@@ -11437,8 +10854,8 @@
- filename: "galatolo-Q4_K.gguf"
sha256: "ca0cfd5a9ad40dc16416aa3a277015d0299b62c0803b67f5709580042202c172"
uri: "huggingface://galatolo/cerbero-7b-gguf/ggml-model-Q4_K.gguf"
- &codellama
url: "github:mudler/LocalAI/gallery/codellama.yaml@master" ### START Codellama
- &codellama ### START Codellama
url: "github:mudler/LocalAI/gallery/codellama.yaml@master"
name: "codellama-7b"
license: llama2
description: |
@@ -11568,8 +10985,8 @@
- filename: "llm-compiler-7b-ftd.Q4_K.gguf"
uri: "huggingface://legraphista/llm-compiler-7b-ftd-IMat-GGUF/llm-compiler-7b-ftd.Q4_K.gguf"
sha256: d862dd18ed335413787d0ad196522a9902a3c10a6456afdab8721822cb0ddde8
- &openvino
url: "github:mudler/LocalAI/gallery/openvino.yaml@master" ### START OpenVINO
- &openvino ### START OpenVINO
url: "github:mudler/LocalAI/gallery/openvino.yaml@master"
name: "openvino-llama-3-8b-instruct-ov-int8"
license: llama3
urls:
@@ -11682,8 +11099,8 @@
- gpu
- embedding
- cpu
- &sentencentransformers
description: | ### START Embeddings
- &sentencentransformers ### START Embeddings
description: |
This framework provides an easy method to compute dense vector representations for sentences, paragraphs, and images. The models are based on transformer networks like BERT / RoBERTa / XLM-RoBERTa etc. and achieve state-of-the-art performance in various tasks. Text is embedded in vector space such that similar text are closer and can efficiently be found using cosine similarity.
urls:
- https://github.com/UKPLab/sentence-transformers
@@ -11697,8 +11114,8 @@
overrides:
parameters:
model: all-MiniLM-L6-v2
- &dreamshaper
name: dreamshaper ### START Image generation
- &dreamshaper ### START Image generation
name: dreamshaper
icon: https://image.civitai.com/xG1nkqKTMzGDvpLrqFT7WA/dd9b038c-bd15-43ab-86ab-66e145ad7ff2/width=450/26072158-132340247-8k%20portrait%20of%20beautiful%20cyborg%20with%20brown%20hair,%20intricate,%20elegant,%20highly%20detailed,%20majestic,%20digital%20photography,%20art%20by%20artg_ed.jpeg
license: other
description: |
@@ -11895,8 +11312,8 @@
- filename: t5xxl_fp16.safetensors
sha256: 6e480b09fae049a72d2a8c5fbccb8d3e92febeb233bbe9dfe7256958a9167635
uri: https://huggingface.co/comfyanonymous/flux_text_encoders/resolve/main/t5xxl_fp16.safetensors
- &whisper
url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master" ## Whisper
- &whisper ## Whisper
url: "github:mudler/LocalAI/gallery/whisper-base.yaml@master"
name: "whisper-1"
icon: https://avatars.githubusercontent.com/u/14957082
license: "MIT"
@@ -12077,8 +11494,8 @@
Stable Diffusion in NCNN with c++, supported txt2img and img2img
name: stablediffusion-cpp
icon: https://avatars.githubusercontent.com/u/100950301
- &piper
url: github:mudler/LocalAI/gallery/piper.yaml@master ## Piper TTS
- &piper ## Piper TTS
url: github:mudler/LocalAI/gallery/piper.yaml@master
name: voice-en-us-kathleen-low
icon: https://github.com/rhasspy/piper/raw/master/etc/logo.png
license: mit

View File

@@ -1,49 +0,0 @@
---
name: "llama3.2-fcall"
config_file: |
mmap: true
function:
json_regex_match:
- "(?s)<Output>(.*?)</Output>"
capture_llm_results:
- (?s)<Thought>(.*?)</Thought>
replace_llm_results:
- key: (?s)<Thought>(.*?)</Thought>
value: ""
grammar:
properties_order: "name,arguments"
function_arguments_key: "arguments"
template:
chat: |
<|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ toJson .FunctionCall -}}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an AI assistant that executes function calls, and these are the tools at your disposal:
{{range .Functions}}
{'type': 'function', 'function': {'name': '{{.Name}}', 'description': '{{.Description}}', 'parameters': {{toJson .Parameters}} }}
{{end}}
<|eot_id|>{{.Input}}<|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@@ -1,55 +0,0 @@
---
name: "llama3.2-quantized"
config_file: |
mmap: true
function:
disable_no_action: true
grammar:
disable: true
response_regex:
- \[(?P<name>\w+)\((?P<arguments>.*)\)\]
argument_regex:
- (?P<key>[^ '\(=,]+)[='"]+(?P<value>[^=,"']+)['"]?
template:
chat: |
<|begin_of_text|><|start_header_id|>system<|end_header_id|>
You are a helpful assistant<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input }}
<|start_header_id|>assistant<|end_header_id|>
chat_message: |
<|start_header_id|>{{if eq .RoleName "assistant"}}assistant{{else if eq .RoleName "system"}}system{{else if eq .RoleName "tool"}}tool{{else if eq .RoleName "user"}}user{{end}}<|end_header_id|>
{{ if .FunctionCall -}}
{{ else if eq .RoleName "tool" -}}
The Function was executed and the response was:
{{ end -}}
{{ if .Content -}}
{{.Content -}}
{{ else if .FunctionCall -}}
{{ range .FunctionCall }}
[{{.FunctionCall.Name}}({{.FunctionCall.Arguments}})]
{{ end }}
{{ end -}}
<|eot_id|>
completion: |
{{.Input}}
function: |
<|start_header_id|>system<|end_header_id|>
You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the functions can be used, point it out. If the given question lacks the parameters required by the function, also point it out. You should only return the function call in tools call sections.
If you decide to invoke any of the function(s), you MUST put it in the format as follows:
[func_name1(params_name1=params_value1,params_name2=params_value2,...),func_name2(params_name1=params_value1,params_name2=params_value2,...)]
You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.
{{toJson .Functions}}
<|eot_id|><|start_header_id|>user<|end_header_id|>
{{.Input}}
<|eot_id|><|start_header_id|>assistant<|end_header_id|>
context_size: 8192
f16: true
stopwords:
- <|im_end|>
- <dummy32000>
- "<|eot_id|>"
- <|end_of_text|>

View File

@@ -5,7 +5,6 @@ import (
"errors"
"io"
"regexp"
"slices"
"strings"
"github.com/mudler/LocalAI/pkg/functions/grammars"
@@ -47,14 +46,6 @@ type GrammarConfig struct {
// SchemaType can be configured to use a specific schema type to force the grammar
// available : json, llama3.1
SchemaType string `yaml:"schema_type"`
GrammarTriggers []GrammarTrigger `yaml:"triggers"`
}
type GrammarTrigger struct {
// Trigger is the string that triggers the grammar
Word string `yaml:"word"`
AtStart bool `yaml:"at_start"`
}
// FunctionsConfig is the configuration for the tool/function call.
@@ -80,12 +71,6 @@ type FunctionsConfig struct {
// JSONRegexMatch is a regex to extract the JSON object from the response
JSONRegexMatch []string `yaml:"json_regex_match"`
// ArgumentRegex is a named regex to extract the arguments from the response. Use ArgumentRegexKey and ArgumentRegexValue to set the names of the named regex for key and value of the arguments.
ArgumentRegex []string `yaml:"argument_regex"`
// ArgumentRegex named regex names for key and value extractions. default: key and value
ArgumentRegexKey string `yaml:"argument_regex_key_name"` // default: key
ArgumentRegexValue string `yaml:"argument_regex_value_name"` // default: value
// ReplaceFunctionResults allow to replace strings in the results before parsing them
ReplaceFunctionResults []ReplaceResult `yaml:"replace_function_results"`
@@ -325,7 +310,7 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
if functionName == "" {
return results
}
results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: ParseFunctionCallArgs(result[functionArgumentsKey], functionConfig)})
results = append(results, FuncCallResults{Name: result[functionNameKey], Arguments: result[functionArgumentsKey]})
}
}
} else {
@@ -337,38 +322,3 @@ func ParseFunctionCall(llmresult string, functionConfig FunctionsConfig) []FuncC
return results
}
func ParseFunctionCallArgs(functionArguments string, functionConfig FunctionsConfig) string {
if len(functionConfig.ArgumentRegex) == 0 {
return functionArguments
}
// We use named regexes here to extract the function argument key value pairs and convert this to valid json.
// TODO: there might be responses where an object as a value is expected/required. This is currently not handled.
args := make(map[string]string)
agrsRegexKeyName := "key"
agrsRegexValueName := "value"
if functionConfig.ArgumentRegexKey != "" {
agrsRegexKeyName = functionConfig.ArgumentRegexKey
}
if functionConfig.ArgumentRegexValue != "" {
agrsRegexValueName = functionConfig.ArgumentRegexValue
}
for _, r := range functionConfig.ArgumentRegex {
var respRegex = regexp.MustCompile(r)
var nameRange []string = respRegex.SubexpNames()
var keyIndex = slices.Index(nameRange, agrsRegexKeyName)
var valueIndex = slices.Index(nameRange, agrsRegexValueName)
matches := respRegex.FindAllStringSubmatch(functionArguments, -1)
for _, match := range matches {
args[match[keyIndex]] = match[valueIndex]
}
}
jsonBytes, _ := json.Marshal(args)
return string(jsonBytes)
}

View File

@@ -43,6 +43,8 @@ var TypeAlias map[string]string = map[string]string{
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -141,10 +143,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
// sets a priority list - first has more priority
priorityList := []string{
// First llama.cpp(variants)
// First llama.cpp(variants) and llama-ggml to follow.
// We keep the fallback to prevent that if the llama.cpp variants
// that depends on shared libs if breaks have still a safety net.
LLamaCPP, LLamaCPPFallback,
LLamaCPP, LlamaGGML, LLamaCPPFallback,
}
toTheEnd := []string{

View File

@@ -9,6 +9,7 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/embedded"
"github.com/mudler/LocalAI/pkg/downloader"
"github.com/mudler/LocalAI/pkg/utils"
"github.com/rs/zerolog/log"
@@ -17,17 +18,42 @@ import (
// InstallModels will preload models from the given list of URLs and galleries
// It will download the model if it is not already present in the model path
// It will also try to resolve if the model is an embedded model YAML configuration
func InstallModels(galleries []config.Gallery, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
func InstallModels(galleries []config.Gallery, modelLibraryURL string, modelPath string, enforceScan bool, downloadStatus func(string, string, string, float64), models ...string) error {
// create an error that groups all errors
var err error
lib, _ := embedded.GetRemoteLibraryShorteners(modelLibraryURL, modelPath)
for _, url := range models {
// As a best effort, try to resolve the model from the remote library
// if it's not resolved we try with the other method below
if modelLibraryURL != "" {
if lib[url] != "" {
log.Debug().Msgf("[startup] model configuration is defined remotely: %s (%s)", url, lib[url])
url = lib[url]
}
}
url = embedded.ModelShortURL(url)
uri := downloader.URI(url)
switch {
case embedded.ExistsInModelsLibrary(url):
modelYAML, e := embedded.ResolveContent(url)
// If we resolve something, just save it to disk and continue
if e != nil {
log.Error().Err(e).Msg("error resolving model content")
err = errors.Join(err, e)
continue
}
log.Debug().Msgf("[startup] resolved embedded model: %s", url)
md5Name := utils.MD5(url)
modelDefinitionFilePath := filepath.Join(modelPath, md5Name) + ".yaml"
if e := os.WriteFile(modelDefinitionFilePath, modelYAML, 0600); err != nil {
log.Error().Err(e).Str("filepath", modelDefinitionFilePath).Msg("error writing model definition")
err = errors.Join(err, e)
}
case uri.LooksLikeOCI():
log.Debug().Msgf("[startup] resolved OCI model to download: %s", url)

View File

@@ -7,6 +7,7 @@ import (
"github.com/mudler/LocalAI/core/config"
. "github.com/mudler/LocalAI/pkg/startup"
"github.com/mudler/LocalAI/pkg/utils"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -15,13 +16,13 @@ import (
var _ = Describe("Preload test", func() {
Context("Preloading from strings", func() {
It("loads from embedded full-urls", func() {
It("loads from remote url", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml"
libraryURL := "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/model_library.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, tmpdir, true, nil, url)
InstallModels([]config.Gallery{}, libraryURL, tmpdir, true, nil, "phi-2")
resultFile := filepath.Join(tmpdir, fileName)
@@ -30,13 +31,61 @@ var _ = Describe("Preload test", func() {
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded full-urls", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "https://raw.githubusercontent.com/mudler/LocalAI-examples/main/configurations/phi-2.yaml"
fileName := fmt.Sprintf("%s.yaml", "phi-2")
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded short-urls", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "phi-2"
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
entry, err := os.ReadDir(tmpdir)
Expect(err).ToNot(HaveOccurred())
Expect(entry).To(HaveLen(1))
resultFile := entry[0].Name()
content, err := os.ReadFile(filepath.Join(tmpdir, resultFile))
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: phi-2"))
})
It("loads from embedded models", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "mistral-openorca"
fileName := fmt.Sprintf("%s.yaml", utils.MD5(url))
InstallModels([]config.Gallery{}, "", tmpdir, true, nil, url)
resultFile := filepath.Join(tmpdir, fileName)
content, err := os.ReadFile(resultFile)
Expect(err).ToNot(HaveOccurred())
Expect(string(content)).To(ContainSubstring("name: mistral-openorca"))
})
It("downloads from urls", func() {
tmpdir, err := os.MkdirTemp("", "")
Expect(err).ToNot(HaveOccurred())
url := "huggingface://TheBloke/TinyLlama-1.1B-Chat-v0.3-GGUF/tinyllama-1.1b-chat-v0.3.Q2_K.gguf"
fileName := fmt.Sprintf("%s.gguf", "tinyllama-1.1b-chat-v0.3.Q2_K")
err = InstallModels([]config.Gallery{}, tmpdir, false, nil, url)
err = InstallModels([]config.Gallery{}, "", tmpdir, false, nil, url)
Expect(err).ToNot(HaveOccurred())
resultFile := filepath.Join(tmpdir, fileName)

View File

@@ -765,17 +765,6 @@ const docTemplate = `{
"/v1/tokenize": {
"post": {
"summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
@@ -1849,17 +1838,6 @@ const docTemplate = `{
}
}
},
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": {
"type": "object",
"properties": {

View File

@@ -758,17 +758,6 @@
"/v1/tokenize": {
"post": {
"summary": "Tokenize the input.",
"parameters": [
{
"description": "Request",
"name": "request",
"in": "body",
"required": true,
"schema": {
"$ref": "#/definitions/schema.TokenizeRequest"
}
}
],
"responses": {
"200": {
"description": "Response",
@@ -1842,17 +1831,6 @@
}
}
},
"schema.TokenizeRequest": {
"type": "object",
"properties": {
"content": {
"type": "string"
},
"model": {
"type": "string"
}
}
},
"schema.TokenizeResponse": {
"type": "object",
"properties": {

View File

@@ -705,13 +705,6 @@ definitions:
description: voice audio file or speaker id
type: string
type: object
schema.TokenizeRequest:
properties:
content:
type: string
model:
type: string
type: object
schema.TokenizeResponse:
properties:
tokens:
@@ -1223,13 +1216,6 @@ paths:
summary: Get TokenMetrics for Active Slot.
/v1/tokenize:
post:
parameters:
- description: Request
in: body
name: request
required: true
schema:
$ref: '#/definitions/schema.TokenizeRequest'
responses:
"200":
description: Response