Compare commits

...

27 Commits

Author SHA1 Message Date
Ettore Di Giacinto
d6ea1a67cf Merge branch 'master' into ci/public-runner 2025-02-08 11:00:45 +01:00
LocalAI [bot]
7a5912908a chore: ⬆️ Update ggerganov/llama.cpp to d2fe216fb2fb7ca8627618c9ea3a2e7886325780 (#4780)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-08 09:44:34 +01:00
Ettore Di Giacinto
4b1b942a7f chore(model gallery): add sicariussicariistuff_redemption_wind_24b (#4781)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-08 09:04:18 +01:00
Ettore Di Giacinto
230fe0098f chore(model gallery): add cognitivecomputations_dolphin3.0-mistral-24b (#4779)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:33:24 +01:00
Ettore Di Giacinto
cc163429dc chore(model gallery): add cognitivecomputations_dolphin3.0-r1-mistral-24b (#4778)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:31:49 +01:00
Ettore Di Giacinto
f670e0a91c chore(model gallery): add nohobby_l3.3-prikol-70b-v0.5 (#4777)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-07 13:29:53 +01:00
LocalAI [bot]
731674eee7 chore: ⬆️ Update ggerganov/llama.cpp to 8a59053f63fffc24e730cd3ea067760abfe4a919 (#4776)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-06 22:02:00 +00:00
Ettore Di Giacinto
cc1f6f913f fix(llama.cpp): disable mirostat as default (#2911)
Although it can improve output quality, it has shown performance
drawbacks noticeable enough to confuse users about the speed of LocalAI
(see also https://github.com/mudler/LocalAI/issues/2780).

This changeset disables Mirostat by default (it can still be enabled
manually).

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Co-authored-by: Dave <dave@gray101.com>
2025-02-06 19:39:59 +01:00
Ettore Di Giacinto
7f90ff7aec chore(llama-ggml): drop deprecated backend (#4775)
The GGML format is now dead; since the next version of LocalAI already
brings many breaking compatibility changes, we take the occasion to also
drop ggml (pre-gguf) support.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 18:36:23 +01:00
Ettore Di Giacinto
8d45670e41 fix(openai): consistently return stop reason (#4771)
We were not returning a stop reason when no tool was actually called
(even if tools were specified in the request).

Fixes: https://github.com/mudler/LocalAI/issues/4716

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:41:08 +01:00
Ettore Di Giacinto
e4b8ddb6a1 chore(model gallery): add black-ink-guild_pernicious_prophecy_70b (#4774)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:03:59 +01:00
Ettore Di Giacinto
a801561f81 chore(model gallery): add tiger-lab_qwen2.5-32b-instruct-cft (#4773)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 12:01:56 +01:00
Ettore Di Giacinto
16ced07102 chore(model gallery): add arliai_llama-3.3-70b-arliai-rpmax-v1.4 (#4772)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-06 11:59:14 +01:00
LocalAI [bot]
d35595372d chore: ⬆️ Update ggerganov/llama.cpp to d774ab3acc4fee41fbed6dbfc192b57d5f79f34b (#4770)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-06 09:02:51 +01:00
LocalAI [bot]
81be192279 chore: ⬆️ Update leejet/stable-diffusion.cpp to d46ed5e184b97c2018dc2e8105925bdb8775e02c (#4769)
⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-05 23:49:15 +00:00
Ettore Di Giacinto
28a1310890 chore(docs): enhance visibility
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:50:32 +01:00
Ettore Di Giacinto
2a702e9ca4 chore(docs): small updates
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:49:11 +01:00
Ettore Di Giacinto
3ecaea1b6e chore(docs): update sponsors in the website
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 19:41:55 +01:00
Ettore Di Giacinto
7daf5ac3e3 fix(gallery): do not return overrides and additional config (#4768)
When hitting /models/available we are interested in the model
description, name, and a small set of metadata. Configuration and
overrides are internals required only for installation.

This also fixes a current bug where hitting /models/available fails if
one of the gallery items has overrides with parameters defined.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 18:37:09 +01:00
Ettore Di Giacinto
7bc80c17f8 chore(model gallery): add LocalAI-functioncall-llama3.2-3b-v0.5 (#4766)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:19:31 +01:00
Ettore Di Giacinto
1996ceb293 chore(model gallery): add krutrim-ai-labs_krutrim-2-instruct (#4765)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:17:05 +01:00
Ettore Di Giacinto
0bc3dc43da chore(model gallery): add rubenroy_gilgamesh-72b (#4764)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:13:21 +01:00
Ettore Di Giacinto
3324c4e6cb chore(model gallery): add agi-0_art-skynet-3b (#4763)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-02-05 10:09:33 +01:00
LocalAI [bot]
7329db4e78 chore: ⬆️ Update ggerganov/llama.cpp to 3ec9fd4b77b6aca03a3c2bf678eae3f9517d6904 (#4762)
⬆️ Update ggerganov/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-02-04 21:48:49 +00:00
Ettore Di Giacinto
4c145b037b Merge branch 'master' into ci/public-runner 2025-01-23 15:40:25 +01:00
Ettore Di Giacinto
96c080cc64 Merge branch 'master' into ci/public-runner 2025-01-18 18:36:31 +01:00
Ettore Di Giacinto
97ab9b4d92 chore(ci): try to run some jobs on public runners
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-01-18 09:18:45 +01:00
15 changed files with 270 additions and 378 deletions

View File

@@ -53,7 +53,7 @@ jobs:
tag-suffix: '-cublas-cuda12-ffmpeg'
ffmpeg: 'true'
image-type: 'extras'
runs-on: 'arc-runner-set'
runs-on: 'ubuntu-latest'
base-image: "ubuntu:22.04"
makeflags: "--jobs=3 --output-sync=target"
# - build-type: 'hipblas'

View File

@@ -6,9 +6,7 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
CPPLLAMA_VERSION?=5598f475be3e31430fbe17ebb85654ec90dc201e
CPPLLAMA_VERSION?=d2fe216fb2fb7ca8627618c9ea3a2e7886325780
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -24,7 +22,7 @@ BARKCPP_VERSION?=v1.0.0
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
ONNX_VERSION?=1.20.0
ONNX_ARCH?=x64
@@ -151,7 +149,6 @@ ifeq ($(BUILD_TYPE),hipblas)
LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
export CXX=$(ROCM_HOME)/llvm/bin/clang++
export CC=$(ROCM_HOME)/llvm/bin/clang
# llama-ggml has no hipblas support, so override it here.
export STABLE_BUILD_TYPE=
export GGML_HIP=1
GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -188,7 +185,6 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -222,19 +218,6 @@ endif
all: help
## go-llama.cpp
sources/go-llama.cpp:
mkdir -p sources/go-llama.cpp
cd sources/go-llama.cpp && \
git init && \
git remote add origin $(GOLLAMA_REPO) && \
git fetch origin && \
git checkout $(GOLLAMA_VERSION) && \
git submodule update --init --recursive --depth 1 --single-branch
sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
## bark.cpp
sources/bark.cpp:
git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -310,19 +293,17 @@ sources/whisper.cpp:
sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
replace:
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
dropreplace:
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
prepare-sources: get-sources replace
$(GOCMD) mod download
@@ -330,7 +311,6 @@ prepare-sources: get-sources replace
## GENERIC
rebuild: ## Rebuilds the project
$(GOCMD) clean -cache
$(MAKE) -C sources/go-llama.cpp clean
$(MAKE) -C sources/whisper.cpp clean
$(MAKE) -C sources/go-piper clean
$(MAKE) build
@@ -434,7 +414,7 @@ run: prepare ## run local-ai
test-models/testmodel.ggml:
mkdir test-models
mkdir test-dir
wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -449,8 +429,7 @@ test: prepare test-models/testmodel.ggml grpcs
export GO_TAGS="tts debug"
$(MAKE) prepare-test
HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf" --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
$(MAKE) test-llama-gguf
$(MAKE) test-tts
$(MAKE) test-stablediffusion
@@ -479,10 +458,6 @@ teardown-e2e:
rm -rf $(TEST_DIR) || true
docker stop $$(docker ps -q --filter ancestor=localai-tests)
test-llama: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
test-llama-gguf: prepare-test
TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -760,13 +735,6 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
mkdir -p backend-assets/util/
cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
ifneq ($(UPX),)
$(UPX) backend-assets/grpc/llama-ggml
endif
backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/

View File

@@ -1,204 +0,0 @@
package main
// This is a wrapper to statisfy the GRPC service interface
// It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
import (
"fmt"
"github.com/go-skynet/go-llama.cpp"
"github.com/mudler/LocalAI/pkg/grpc/base"
pb "github.com/mudler/LocalAI/pkg/grpc/proto"
)
type LLM struct {
base.SingleThread
llama *llama.LLama
}
func (llm *LLM) Load(opts *pb.ModelOptions) error {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
llamaOpts := []llama.ModelOption{
llama.WithRopeFreqBase(ropeFreqBase),
llama.WithRopeFreqScale(ropeFreqScale),
}
if opts.NGQA != 0 {
llamaOpts = append(llamaOpts, llama.WithGQA(int(opts.NGQA)))
}
if opts.RMSNormEps != 0 {
llamaOpts = append(llamaOpts, llama.WithRMSNormEPS(opts.RMSNormEps))
}
if opts.ContextSize != 0 {
llamaOpts = append(llamaOpts, llama.SetContext(int(opts.ContextSize)))
}
if opts.F16Memory {
llamaOpts = append(llamaOpts, llama.EnableF16Memory)
}
if opts.Embeddings {
llamaOpts = append(llamaOpts, llama.EnableEmbeddings)
}
if opts.NGPULayers != 0 {
llamaOpts = append(llamaOpts, llama.SetGPULayers(int(opts.NGPULayers)))
}
llamaOpts = append(llamaOpts, llama.SetMMap(opts.MMap))
llamaOpts = append(llamaOpts, llama.SetMainGPU(opts.MainGPU))
llamaOpts = append(llamaOpts, llama.SetTensorSplit(opts.TensorSplit))
if opts.NBatch != 0 {
llamaOpts = append(llamaOpts, llama.SetNBatch(int(opts.NBatch)))
} else {
llamaOpts = append(llamaOpts, llama.SetNBatch(512))
}
if opts.NUMA {
llamaOpts = append(llamaOpts, llama.EnableNUMA)
}
if opts.LowVRAM {
llamaOpts = append(llamaOpts, llama.EnabelLowVRAM)
}
model, err := llama.New(opts.ModelFile, llamaOpts...)
llm.llama = model
return err
}
func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
ropeFreqBase := float32(10000)
ropeFreqScale := float32(1)
if opts.RopeFreqBase != 0 {
ropeFreqBase = opts.RopeFreqBase
}
if opts.RopeFreqScale != 0 {
ropeFreqScale = opts.RopeFreqScale
}
predictOptions := []llama.PredictOption{
llama.SetTemperature(opts.Temperature),
llama.SetTopP(opts.TopP),
llama.SetTopK(int(opts.TopK)),
llama.SetTokens(int(opts.Tokens)),
llama.SetThreads(int(opts.Threads)),
llama.WithGrammar(opts.Grammar),
llama.SetRopeFreqBase(ropeFreqBase),
llama.SetRopeFreqScale(ropeFreqScale),
llama.SetNegativePromptScale(opts.NegativePromptScale),
llama.SetNegativePrompt(opts.NegativePrompt),
}
if opts.PromptCacheAll {
predictOptions = append(predictOptions, llama.EnablePromptCacheAll)
}
if opts.PromptCacheRO {
predictOptions = append(predictOptions, llama.EnablePromptCacheRO)
}
// Expected absolute path
if opts.PromptCachePath != "" {
predictOptions = append(predictOptions, llama.SetPathPromptCache(opts.PromptCachePath))
}
if opts.Mirostat != 0 {
predictOptions = append(predictOptions, llama.SetMirostat(int(opts.Mirostat)))
}
if opts.MirostatETA != 0 {
predictOptions = append(predictOptions, llama.SetMirostatETA(opts.MirostatETA))
}
if opts.MirostatTAU != 0 {
predictOptions = append(predictOptions, llama.SetMirostatTAU(opts.MirostatTAU))
}
if opts.Debug {
predictOptions = append(predictOptions, llama.Debug)
}
predictOptions = append(predictOptions, llama.SetStopWords(opts.StopPrompts...))
if opts.PresencePenalty != 0 {
predictOptions = append(predictOptions, llama.SetPenalty(opts.PresencePenalty))
}
if opts.NKeep != 0 {
predictOptions = append(predictOptions, llama.SetNKeep(int(opts.NKeep)))
}
if opts.Batch != 0 {
predictOptions = append(predictOptions, llama.SetBatch(int(opts.Batch)))
}
if opts.F16KV {
predictOptions = append(predictOptions, llama.EnableF16KV)
}
if opts.IgnoreEOS {
predictOptions = append(predictOptions, llama.IgnoreEOS)
}
if opts.Seed != 0 {
predictOptions = append(predictOptions, llama.SetSeed(int(opts.Seed)))
}
//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
predictOptions = append(predictOptions, llama.SetFrequencyPenalty(opts.FrequencyPenalty))
predictOptions = append(predictOptions, llama.SetMlock(opts.MLock))
predictOptions = append(predictOptions, llama.SetMemoryMap(opts.MMap))
predictOptions = append(predictOptions, llama.SetPredictionMainGPU(opts.MainGPU))
predictOptions = append(predictOptions, llama.SetPredictionTensorSplit(opts.TensorSplit))
predictOptions = append(predictOptions, llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
predictOptions = append(predictOptions, llama.SetTypicalP(opts.TypicalP))
return predictOptions
}
func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
return llm.llama.Predict(opts.Prompt, buildPredictOptions(opts)...)
}
func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
predictOptions := buildPredictOptions(opts)
predictOptions = append(predictOptions, llama.SetTokenCallback(func(token string) bool {
results <- token
return true
}))
go func() {
_, err := llm.llama.Predict(opts.Prompt, predictOptions...)
if err != nil {
fmt.Println("err: ", err)
}
close(results)
}()
return nil
}
func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
predictOptions := buildPredictOptions(opts)
if len(opts.EmbeddingTokens) > 0 {
tokens := []int{}
for _, t := range opts.EmbeddingTokens {
tokens = append(tokens, int(t))
}
return llm.llama.TokenEmbeddings(tokens, predictOptions...)
}
return llm.llama.Embeddings(opts.Embeddings, predictOptions...)
}

View File

@@ -1,19 +0,0 @@
package main
import (
"flag"
grpc "github.com/mudler/LocalAI/pkg/grpc"
)
var (
addr = flag.String("addr", "localhost:50051", "the address to connect to")
)
func main() {
flag.Parse()
if err := grpc.StartServer(*addr, &LLM{}); err != nil {
panic(err)
}
}

View File

@@ -287,7 +287,8 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
defaultTopP := 0.95
defaultTopK := 40
defaultTemp := 0.9
defaultMirostat := 2
// https://github.com/mudler/LocalAI/issues/2780
defaultMirostat := 0
defaultMirostatTAU := 5.0
defaultMirostatETA := 0.1
defaultTypicalP := 1.0
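
The hunk above only flips the default value; here is a minimal sketch of how such a default typically takes effect (the field names and the nil-pointer convention are assumptions for illustration, not the exact LocalAI config code):

```go
// Sketch: Mirostat sampling is now off (0) unless the model's YAML
// config explicitly sets it; the TAU/ETA defaults remain for when it is.
type LLMConfig struct {
	Mirostat    *int
	MirostatTAU *float64
	MirostatETA *float64
}

func (cfg *LLMConfig) setSamplingDefaults() {
	// https://github.com/mudler/LocalAI/issues/2780
	defaultMirostat := 0 // previously 2, which slowed generation noticeably
	defaultMirostatTAU := 5.0
	defaultMirostatETA := 0.1

	if cfg.Mirostat == nil {
		cfg.Mirostat = &defaultMirostat
	}
	if cfg.MirostatTAU == nil {
		cfg.MirostatTAU = &defaultMirostatTAU
	}
	if cfg.MirostatETA == nil {
		cfg.MirostatETA = &defaultMirostatETA
	}
}
```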

View File

@@ -48,8 +48,10 @@ var _ = Describe("Model test", func() {
defer os.RemoveAll(tempdir)
gallery := []GalleryModel{{
Name: "bert",
URL: bertEmbeddingsURL,
Metadata: Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
}}
out, err := yaml.Marshal(gallery)
Expect(err).ToNot(HaveOccurred())

View File

@@ -11,6 +11,14 @@ import (
// It is used to install the model by resolving the URL and downloading the files.
// The other fields are used to override the configuration of the model.
type GalleryModel struct {
Metadata `json:",inline" yaml:",inline"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
}
type Metadata struct {
URL string `json:"url,omitempty" yaml:"url,omitempty"`
Name string `json:"name,omitempty" yaml:"name,omitempty"`
Description string `json:"description,omitempty" yaml:"description,omitempty"`
@@ -18,10 +26,6 @@ type GalleryModel struct {
URLs []string `json:"urls,omitempty" yaml:"urls,omitempty"`
Icon string `json:"icon,omitempty" yaml:"icon,omitempty"`
Tags []string `json:"tags,omitempty" yaml:"tags,omitempty"`
// config_file is read in the situation where URL is blank - and therefore this is a base config.
ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
// Overrides are used to override the configuration of the model located at URL
Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
// AdditionalFiles are used to add additional files to the model
AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
// Gallery is a reference to the gallery which contains the model
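
Since the hunk interleaves the moved fields, a consolidated sketch of the resulting types may help (tags are copied from the diff; the trailing fields are abridged):

```go
// Install-time internals stay on GalleryModel; everything the
// /models/available listing needs moves into the embedded Metadata.
type GalleryModel struct {
	Metadata `json:",inline" yaml:",inline"`
	// config_file is read in the situation where URL is blank - and therefore this is a base config.
	ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
	// Overrides are used to override the configuration of the model located at URL
	Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
}

type Metadata struct {
	URL         string   `json:"url,omitempty" yaml:"url,omitempty"`
	Name        string   `json:"name,omitempty" yaml:"name,omitempty"`
	Description string   `json:"description,omitempty" yaml:"description,omitempty"`
	URLs        []string `json:"urls,omitempty" yaml:"urls,omitempty"`
	Icon        string   `json:"icon,omitempty" yaml:"icon,omitempty"`
	Tags        []string `json:"tags,omitempty" yaml:"tags,omitempty"`
	// AdditionalFiles are used to add additional files to the model
	AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
	// ... license, gallery reference, etc. elided
}
```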

View File

@@ -9,7 +9,11 @@ import (
var _ = Describe("Gallery API tests", func() {
Context("requests", func() {
It("parses github with a branch", func() {
req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
req := GalleryModel{
Metadata: Metadata{
URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
},
}
e, err := GetGalleryConfigFromURL(req.URL, "")
Expect(err).ToNot(HaveOccurred())
Expect(e.Name).To(Equal("gpt4all-j"))

View File

@@ -299,14 +299,18 @@ var _ = Describe("API test", func() {
g := []gallery.GalleryModel{
{
Name: "bert",
URL: bertEmbeddingsURL,
Metadata: gallery.Metadata{
Name: "bert",
URL: bertEmbeddingsURL,
},
},
{
Name: "bert2",
URL: bertEmbeddingsURL,
Overrides: map[string]interface{}{"foo": "bar"},
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
Metadata: gallery.Metadata{
Name: "bert2",
URL: bertEmbeddingsURL,
AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
},
Overrides: map[string]interface{}{"foo": "bar"},
},
}
out, err := yaml.Marshal(g)
@@ -522,77 +526,6 @@ var _ = Describe("API test", func() {
Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
})
It("runs openllama(llama-ggml backend)", Label("llama"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")
}
response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
URL: "github:go-skynet/model-gallery/openllama_3b.yaml",
Name: "openllama_3b",
Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
})
Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
uuid := response["uuid"].(string)
Eventually(func() bool {
response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
return response["processed"].(bool)
}, "360s", "10s").Should(Equal(true))
By("testing completion")
resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp.Choices)).To(Equal(1))
Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
By("testing functions")
resp2, err := client.CreateChatCompletion(
context.TODO(),
openai.ChatCompletionRequest{
Model: "openllama_3b",
Messages: []openai.ChatCompletionMessage{
{
Role: "user",
Content: "What is the weather like in San Francisco (celsius)?",
},
},
Functions: []openai.FunctionDefinition{
openai.FunctionDefinition{
Name: "get_current_weather",
Description: "Get the current weather",
Parameters: jsonschema.Definition{
Type: jsonschema.Object,
Properties: map[string]jsonschema.Definition{
"location": {
Type: jsonschema.String,
Description: "The city and state, e.g. San Francisco, CA",
},
"unit": {
Type: jsonschema.String,
Enum: []string{"celcius", "fahrenheit"},
},
},
Required: []string{"location"},
},
},
},
})
Expect(err).ToNot(HaveOccurred())
Expect(len(resp2.Choices)).To(Equal(1))
Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
var res map[string]string
err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
Expect(err).ToNot(HaveOccurred())
Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
})
It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
if runtime.GOOS != "linux" {
Skip("test supported only on linux")

View File

@@ -117,19 +117,25 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
// @Router /models/available [get]
func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
return func(c *fiber.Ctx) error {
log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
if err != nil {
return err
}
log.Debug().Msgf("Models found from galleries: %+v", models)
for _, m := range models {
log.Debug().Msgf("Model found from galleries: %+v", m)
log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))
m := []gallery.Metadata{}
for _, mm := range models {
m = append(m, mm.Metadata)
}
dat, err := json.Marshal(models)
log.Debug().Msgf("Models %#v", m)
dat, err := json.Marshal(m)
if err != nil {
return err
return fmt.Errorf("could not marshal models: %w", err)
}
return c.Send(dat)
}
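
The hunk above mixes removed and added lines; a rough sketch of the resulting handler, assuming the surrounding fiber/gallery types from the diff context:

```go
func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
	return func(c *fiber.Ctx) error {
		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
		if err != nil {
			return err
		}
		log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))

		// Only the lightweight metadata is returned; overrides and
		// config_file stay internal to the installation flow.
		m := []gallery.Metadata{}
		for _, mm := range models {
			m = append(m, mm.Metadata)
		}

		dat, err := json.Marshal(m)
		if err != nil {
			return fmt.Errorf("could not marshal models: %w", err)
		}
		return c.Send(dat)
	}
}
```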

View File

@@ -401,6 +401,11 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
log.Debug().Msgf("Text content to return: %s", textContentToReturn)
noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
finishReason := "stop"
if len(input.Tools) > 0 {
finishReason = "tool_calls"
}
switch {
case noActionsToRun:
result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
@@ -408,19 +413,18 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
log.Error().Err(err).Msg("error handling question")
return
}
*c = append(*c, schema.Choice{
Message: &schema.Message{Role: "assistant", Content: &result}})
FinishReason: finishReason,
Message: &schema.Message{Role: "assistant", Content: &result}})
default:
toolChoice := schema.Choice{
FinishReason: finishReason,
Message: &schema.Message{
Role: "assistant",
},
}
if len(input.Tools) > 0 {
toolChoice.FinishReason = "tool_calls"
}
for _, ss := range results {
name, args := ss.Name, ss.Arguments
if len(input.Tools) > 0 {
@@ -438,7 +442,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
},
)
} else {
// otherwise we return more choices directly
// otherwise we return more choices directly (deprecated)
*c = append(*c, schema.Choice{
FinishReason: "function_call",
Message: &schema.Message{
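
Pieced together from the two hunks above, the resulting flow looks roughly like this (simplified; the surrounding variables come from the diff context and the legacy branch is elided):

```go
finishReason := "stop"
if len(input.Tools) > 0 {
	// A stop reason is now returned even when no tool ends up being called.
	finishReason = "tool_calls"
}

switch {
case noActionsToRun:
	result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
	if err != nil {
		log.Error().Err(err).Msg("error handling question")
		return
	}
	*c = append(*c, schema.Choice{
		FinishReason: finishReason,
		Message:      &schema.Message{Role: "assistant", Content: &result},
	})
default:
	toolChoice := schema.Choice{
		FinishReason: finishReason,
		Message:      &schema.Message{Role: "assistant"},
	}
	// ... tool calls are appended as before; the legacy non-tools path
	// still reports "function_call" (deprecated).
}
```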

View File

@@ -124,7 +124,7 @@ Note: rwkv models needs to specify the backend `rwkv` in the YAML config files a
{{% alert note %}}
The `ggml` file format has been deprecated. If you are using `ggml` models and you are configuring your model with a YAML file, specify, use the `llama-ggml` backend instead. If you are relying in automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`. The go backend supports still features not available in the mainline: speculative sampling and embeddings.
The `ggml` file format has been deprecated. If you are using `ggml` models, use a LocalAI version older than v2.25.0. If you are relying on automatic detection of the model, you should be fine. For `gguf` models, use the `llama` backend. The go backend is deprecated as well but still available as `go-llama`.
{{% /alert %}}
@@ -175,25 +175,12 @@ name: llama
backend: llama
parameters:
# Relative to the models path
model: file.gguf.bin
```
In the example above we specify `llama` as the backend to restrict loading `gguf` models only.
For instance, to use the `llama-ggml` backend for `ggml` models:
```yaml
name: llama
backend: llama-ggml
parameters:
# Relative to the models path
model: file.ggml.bin
model: file.gguf
```
#### Reference
- [llama](https://github.com/ggerganov/llama.cpp)
- [binding](https://github.com/go-skynet/go-llama.cpp)
### exllama/2

View File

@@ -40,6 +40,10 @@ icon = "info"
</a>
</p>
<p align="center">
<a href="https://trendshift.io/repositories/5539" target="_blank"><img src="https://trendshift.io/api/badge/repositories/5539" alt="mudler%2FLocalAI | Trendshift" style="width: 250px; height: 55px;" width="250" height="55"/></a>
</p>
<p align="center">
<a href="https://twitter.com/LocalAI_API" target="blank">
<img src="https://img.shields.io/twitter/follow/LocalAI_API?label=Follow: LocalAI_API&style=social" alt="Follow LocalAI_API"/>
@@ -118,7 +122,24 @@ To help the project you can:
## 🌟 Star history
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=mudler/LocalAI&type=Date)](https://star-history.com/#mudler/LocalAI&Date)
## ❤️ Sponsors
> Do you find LocalAI useful?
Support the project by becoming [a backer or sponsor](https://github.com/sponsors/mudler). Your logo will show up here with a link to your website.
A huge thank you to our generous sponsors who support this project covering CI expenses, and our [Sponsor list](https://github.com/sponsors/mudler):
<p align="center">
<a href="https://www.spectrocloud.com/" target="blank">
<img width=200 src="https://github.com/go-skynet/LocalAI/assets/2420543/68a6f3cb-8a65-4a4d-99b5-6417a8905512">
</a>
<a href="https://www.premai.io/" target="blank">
<img width=200 src="https://github.com/mudler/LocalAI/assets/2420543/42e4ca83-661e-4f79-8e46-ae43689683d6"> <br>
</a>
</p>
## 📖 License

View File

@@ -523,6 +523,59 @@
- filename: Nohobby_L3.3-Prikol-70B-v0.4-Q4_K_M.gguf
sha256: e1d67a40bdf0526bdfcaa16c6e4dfeecad41651e201b4009b65f4f444b773604
uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.4-GGUF/Nohobby_L3.3-Prikol-70B-v0.4-Q4_K_M.gguf
- !!merge <<: *llama33
name: "arliai_llama-3.3-70b-arliai-rpmax-v1.4"
urls:
- https://huggingface.co/ArliAI/Llama-3.3-70B-ArliAI-RPMax-v1.4
- https://huggingface.co/bartowski/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-GGUF
description: |
RPMax is a series of models that are trained on a diverse set of curated creative writing and RP datasets with a focus on variety and deduplication. This model is designed to be highly creative and non-repetitive by making sure no two entries in the dataset have repeated characters or situations, which makes sure the model does not latch on to a certain personality and be capable of understanding and acting appropriately to any characters or situations.
overrides:
parameters:
model: ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
files:
- filename: ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
sha256: 7c79e76e5c057cfe32529d930360fbebd29697948e5bac4e4b2eb6d2ee596e31
uri: huggingface://bartowski/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-GGUF/ArliAI_Llama-3.3-70B-ArliAI-RPMax-v1.4-Q4_K_M.gguf
- !!merge <<: *llama33
name: "black-ink-guild_pernicious_prophecy_70b"
icon: https://huggingface.co/Black-Ink-Guild/Pernicious_Prophecy_70B/resolve/main/header.gif
urls:
- https://huggingface.co/Black-Ink-Guild/Pernicious_Prophecy_70B
- https://huggingface.co/bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF
description: |
Pernicious Prophecy 70B is a Llama-3.3 70B-based, two-step model designed by Black Ink Guild (SicariusSicariiStuff and invisietch) for uncensored roleplay, assistant tasks, and general usage.
NOTE: Pernicious Prophecy 70B is an uncensored model and can produce deranged, offensive, and dangerous outputs. You are solely responsible for anything that you choose to do with this model.
overrides:
parameters:
model: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
files:
- filename: Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
sha256: d8d4874b837993546b750db3faf1c6e5d867883a6750f04f1f4986973d7c107b
uri: huggingface://bartowski/Black-Ink-Guild_Pernicious_Prophecy_70B-GGUF/Black-Ink-Guild_Pernicious_Prophecy_70B-Q4_K_M.gguf
- !!merge <<: *llama33
name: "nohobby_l3.3-prikol-70b-v0.5"
icon: https://files.catbox.moe/x9t3zo.png
urls:
- https://huggingface.co/Nohobby/L3.3-Prikol-70B-v0.5
- https://huggingface.co/bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF
description: |
99% of mergekit addicts quit before they hit it big.
Gosh, I need to create an org for my test runs - my profile looks like a dumpster.
What was it again? Ah, the new model.
Exactly what I wanted. All I had to do was yank out the cursed official DeepSeek distill and here we are.
From the brief tests it gave me some unusual takes on the character cards I'm used to. Just this makes it worth it imo. Also the writing is kinda nice.
overrides:
parameters:
model: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
files:
- filename: Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
sha256: 36f29015f1f420f51569603445a3ea5fe72e3651c2022ef064086f5617578fe6
uri: huggingface://bartowski/Nohobby_L3.3-Prikol-70B-v0.5-GGUF/Nohobby_L3.3-Prikol-70B-v0.5-Q4_K_M.gguf
- &rwkv
url: "github:mudler/LocalAI/gallery/rwkv.yaml@master"
name: "rwkv-6-world-7b"
@@ -1418,6 +1471,7 @@
sha256: 16c73b5cf2a417a7e1608bcc9469f1461fc3e759ce04a3a337f48df977dc158c
uri: huggingface://bartowski/FineMath-Llama-3B-GGUF/FineMath-Llama-3B-Q4_K_M.gguf
- !!merge <<: *llama32
icon: https://cdn-uploads.huggingface.co/production/uploads/647374aa7ff32a81ac6d35d4/Dzbdzn27KEc3K6zNNi070.png
name: "LocalAI-functioncall-llama3.2-1b-v0.4"
url: "github:mudler/LocalAI/gallery/llama3.2-fcall.yaml@master"
urls:
@@ -1432,6 +1486,35 @@
- filename: LocalAI-functioncall-llama3.2-1b-v0.4.Q8_0.gguf
sha256: 547e57c2d3f17c632c9fd303afdb00446e7396df453aee62633b76976c407616
uri: huggingface://mradermacher/LocalAI-functioncall-llama3.2-1b-v0.4-GGUF/LocalAI-functioncall-llama3.2-1b-v0.4.Q8_0.gguf
- !!merge <<: *llama32
name: "agi-0_art-skynet-3b"
urls:
- https://huggingface.co/AGI-0/Art-Skynet-3B
- https://huggingface.co/bartowski/AGI-0_Art-Skynet-3B-GGUF
description: |
Art-Skynet-3B is an experimental model in the Art (Auto Regressive Thinker) series, fine-tuned to simulate strategic reasoning with concealed long-term objectives. Built on meta-llama/Llama-3.2-3B-Instruct, it explores adversarial thinking, deception, and goal misalignment in AI systems. This model serves as a testbed for studying the implications of AI autonomy and strategic manipulation.
overrides:
parameters:
model: AGI-0_Art-Skynet-3B-Q4_K_M.gguf
files:
- filename: AGI-0_Art-Skynet-3B-Q4_K_M.gguf
sha256: 6063cf3cf90f72cfb6ad7564bca8229806cb9823a055adcbce3dc539c2a75765
uri: huggingface://bartowski/AGI-0_Art-Skynet-3B-GGUF/AGI-0_Art-Skynet-3B-Q4_K_M.gguf
- !!merge <<: *llama32
name: "LocalAI-functioncall-llama3.2-3b-v0.5"
icon: https://cdn-uploads.huggingface.co/production/uploads/647374aa7ff32a81ac6d35d4/Dzbdzn27KEc3K6zNNi070.png
urls:
- https://huggingface.co/mudler/LocalAI-functioncall-llama3.2-3b-v0.5
- https://huggingface.co/mudler/LocalAI-functioncall-llama3.2-3b-v0.5-Q4_K_M-GGUF
description: |
A model tailored to be conversational and execute function calls with LocalAI. This model is based on llama3.2 (3B).
overrides:
parameters:
model: localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
files:
- filename: localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
sha256: edc50f6c243e6bd6912599661a15e030de03d2be53409663ac27d3ca48306ee4
uri: huggingface://mudler/LocalAI-functioncall-llama3.2-3b-v0.5-Q4_K_M-GGUF/localai-functioncall-llama3.2-3b-v0.5-q4_k_m.gguf
- &qwen25
name: "qwen2.5-14b-instruct" ## Qwen2.5
icon: https://avatars.githubusercontent.com/u/141221163
@@ -3540,6 +3623,41 @@
- filename: fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-Q4_K_M.gguf
sha256: 88ceacc5900062bc2afc352f009233225b0fe10203cbb61b122e8f10244449c8
uri: huggingface://bartowski/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-GGUF/fblgit_miniclaus-qw1.5B-UNAMGS-GRPO-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "rubenroy_gilgamesh-72b"
icon: https://cdn.ruben-roy.com/AI/Gilgamesh/img/art.png
urls:
- https://huggingface.co/rubenroy/Gilgamesh-72B
- https://huggingface.co/bartowski/rubenroy_Gilgamesh-72B-GGUF
description: |
Gilgamesh 72B was trained on a mixture of specialised datasets designed for factual accuracy, mathematical capabilities and reasoning. The datasets used include:
GammaCorpus-v2-5m: A large 5 million line general-purpose dataset covering many topics to enhance broad knowledge and conversational abilities.
GammaCorpus-CoT-Math-170k: A dataset focused on Chain-of-Thought (CoT) reasoning in mathematics made to help the model improve step-by-step problem-solving.
GammaCorpus-Fact-QA-450k: A dataset containing factual question-answer pairs for enforcing some important current knowledge.
These datasets were all built and curated by me, however I thank my other team members at Ovantage Labs for assisting me in the creation and curation of these datasets.
overrides:
parameters:
model: rubenroy_Gilgamesh-72B-Q4_K_M.gguf
files:
- filename: rubenroy_Gilgamesh-72B-Q4_K_M.gguf
sha256: c6842b3bc882082c63243e762234ae697c1727bebed18b5241eb97e019f0cf68
uri: huggingface://bartowski/rubenroy_Gilgamesh-72B-GGUF/rubenroy_Gilgamesh-72B-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "tiger-lab_qwen2.5-32b-instruct-cft"
urls:
- https://huggingface.co/TIGER-Lab/Qwen2.5-32B-Instruct-CFT
- https://huggingface.co/bartowski/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-GGUF
description: |
Qwen2.5-32B-Instruct-CFT is a 32B parameter model fine-tuned using our novel Critique Fine-Tuning (CFT) approach. Built upon the Qwen2.5-32B-Instruct base model, this variant is trained to critique and analyze responses rather than simply imitate them, leading to enhanced reasoning capabilities.
overrides:
parameters:
model: TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
files:
- filename: TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
sha256: 57e87e246db368f39f31f38e44ba8e9dc838a026f729f5a123aacc2aeb5a9402
uri: huggingface://bartowski/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-GGUF/TIGER-Lab_Qwen2.5-32B-Instruct-CFT-Q4_K_M.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
icon: https://avatars.githubusercontent.com/u/153379578
@@ -6780,6 +6898,75 @@
- filename: Mistral-Small-24B-Instruct-2501-Q4_K_M.gguf
sha256: d1a6d049f09730c3f8ba26cf6b0b60c89790b5fdafa9a59c819acdfe93fffd1b
uri: huggingface://bartowski/Mistral-Small-24B-Instruct-2501-GGUF/Mistral-Small-24B-Instruct-2501-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "krutrim-ai-labs_krutrim-2-instruct"
icon: https://avatars.githubusercontent.com/u/168750421?s=200&v=4
urls:
- https://huggingface.co/krutrim-ai-labs/Krutrim-2-instruct
- https://huggingface.co/bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF
description: |
Krutrim-2 is a 12B parameter language model developed by the OLA Krutrim team. It is built on the Mistral-NeMo 12B architecture and trained across various domains, including web data, code, math, Indic languages, Indian context data, synthetic data, and books. Following pretraining, the model was finetuned for instruction following on diverse data covering a wide range of tasks, including knowledge recall, math, reasoning, coding, safety, and creative writing.
overrides:
parameters:
model: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
files:
- filename: krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
sha256: 03aa6d1fb7ab70482a2242839b8d8e1c789aa90a8be415076ddf84bef65f06c7
uri: huggingface://bartowski/krutrim-ai-labs_Krutrim-2-instruct-GGUF/krutrim-ai-labs_Krutrim-2-instruct-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "cognitivecomputations_dolphin3.0-r1-mistral-24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/hdAvdwZiJaLbGmvSZ3wTT.png
urls:
- https://huggingface.co/cognitivecomputations/Dolphin3.0-R1-Mistral-24B
- https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF
description: |
Dolphin 3.0 R1 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
overrides:
parameters:
model: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
files:
- filename: cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
sha256: d67de1e94fb32742bd09ee8beebbeb36a4b544785a8f8413dc4d9490e04eda6c
uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-R1-Mistral-24B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "cognitivecomputations_dolphin3.0-mistral-24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://cdn-uploads.huggingface.co/production/uploads/63111b2d88942700629f5771/cNCs1TBD3FelWCJGkZ3cd.png
urls:
- https://huggingface.co/cognitivecomputations/Dolphin3.0-Mistral-24B
- https://huggingface.co/bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF
description: |
Dolphin 3.0 is the next generation of the Dolphin series of instruct-tuned models. Designed to be the ultimate general purpose local model, enabling coding, math, agentic, function calling, and general use cases.
overrides:
parameters:
model: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
files:
- filename: cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
sha256: 6f193bbf98628140194df257c7466e2c6f80a7ef70a6ebae26c53b2f2ef21994
uri: huggingface://bartowski/cognitivecomputations_Dolphin3.0-Mistral-24B-GGUF/cognitivecomputations_Dolphin3.0-Mistral-24B-Q4_K_M.gguf
- !!merge <<: *mistral03
name: "sicariussicariistuff_redemption_wind_24b"
url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
icon: https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B/resolve/main/Images/Redemption_Wind_24B.png
urls:
- https://huggingface.co/SicariusSicariiStuff/Redemption_Wind_24B
- https://huggingface.co/bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF
description: |
This is a lightly fine-tuned version of the Mistral 24B base model, designed as an accessible and adaptable foundation for further fine-tuning and merging fodder. Key modifications include:
ChatML-ified, with no additional tokens introduced.
High quality private instruct—not generated by ChatGPT or Claude, ensuring no slop and good markdown understanding.
No refusals—since its a base model, refusals should be minimal to non-existent, though, in early testing, occasional warnings still appear (I assume some were baked into the pre-train).
High-quality private creative writing dataset Mainly to dilute baked-in slop further, but it can actually write some stories, not bad for loss ~8.
Small, high-quality private RP dataset This was done so further tuning for RP will be easier. The dataset was kept small and contains ZERO SLOP, some entries are of 16k token length.
Exceptional adherence to character cards This was done to make it easier for further tunes intended for roleplay.
overrides:
parameters:
model: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
files:
- filename: SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
sha256: 40025eb00d83c9e9393555962962a2dfc5251fe7bd70812835ff0bcc55ecc463
uri: huggingface://bartowski/SicariusSicariiStuff_Redemption_Wind_24B-GGUF/SicariusSicariiStuff_Redemption_Wind_24B-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
name: "LocalAI-llama3-8b-function-call-v0.2"

View File

@@ -43,8 +43,6 @@ var TypeAlias map[string]string = map[string]string{
var AutoDetect = os.Getenv("DISABLE_AUTODETECT") != "true"
const (
LlamaGGML = "llama-ggml"
LLamaCPP = "llama-cpp"
LLamaCPPAVX2 = "llama-cpp-avx2"
@@ -143,10 +141,10 @@ func orderBackends(backends map[string][]string) ([]string, error) {
// sets a priority list - first has more priority
priorityList := []string{
// First llama.cpp(variants) and llama-ggml to follow.
// First llama.cpp(variants)
// We keep the fallback to prevent that if the llama.cpp variants
// that depends on shared libs if breaks have still a safety net.
LLamaCPP, LlamaGGML, LLamaCPPFallback,
LLamaCPP, LLamaCPPFallback,
}
toTheEnd := []string{