tests: try to get logs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-05-24 16:51:44 -04:00 · 2024-06-25 09:24:55 +02:00
26 changed files with 90 additions and 201 deletions
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -39,7 +39,7 @@ jobs:
    strategy:
      # Pushing with all jobs in parallel
      # eats the bandwidth of all the nodes
-      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 10 }}
+      max-parallel: ${{ github.event_name != 'pull_request' && 6 || 12 }}
      matrix:
        include:
          # Extra images
@@ -257,7 +257,6 @@ jobs:
      quayUsername: ${{ secrets.LOCALAI_REGISTRY_USERNAME }}
      quayPassword: ${{ secrets.LOCALAI_REGISTRY_PASSWORD }}
    strategy:
      max-parallel: ${{ github.event_name != 'pull_request' && 2 || 4 }}
      matrix:
        include:
          - build-type: ''
@@ -317,10 +316,9 @@ jobs:
            base-image: "ubuntu:22.04"
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'vulkan'
-            platforms: 'linux/amd64'
+            platforms: 'linux/amd64,linux/arm64'
            tag-latest: 'false'
            tag-suffix: '-vulkan-ffmpeg-core'
            latest-image: 'latest-vulkan-ffmpeg-core'
            ffmpeg: 'true'
            image-type: 'core'
            runs-on: 'arc-runner-set'
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -220,7 +220,7 @@ jobs:
          export CPLUS_INCLUDE_PATH=/usr/local/include
          # Used to run the newer GNUMake version from brew that supports --output-sync
          export PATH="/opt/homebrew/opt/make/libexec/gnubin:$PATH"
-          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make --jobs 4 --output-sync=target test
+          BUILD_TYPE="GITHUB_CI_HAS_BROKEN_METAL" CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make --jobs 4 --output-sync=target test
      - name: Setup tmate session if tests fail
        if: ${{ failure() }}
        uses: mxschmitt/action-tmate@v3.18
--- a/28
+++ b/28
@@ -5,7 +5,7 @@ BINARY_NAME=local-ai
 # llama.cpp versions
 GOLLAMA_STABLE_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=e57dc62057d41211ac018056c19c02cd544694df
+CPPLLAMA_VERSION?=e112b610a1a75cb7fa8351e1a933e2e7a755a5ce
 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -54,7 +54,7 @@ override LD_FLAGS += -X "github.com/go-skynet/LocalAI/internal.Commit=$(shell gi
 OPTIONAL_TARGETS?=
+# Keep OS exported: sub-makes started through $(MAKE) only inherit it via the
+# environment, and backend/cpp/llama/Makefile branches on $(OS).
 export OS := $(shell uname -s)
 ARCH := $(shell uname -m)
 GREEN  := $(shell tput -Txterm setaf 2)
 YELLOW := $(shell tput -Txterm setaf 3)
@@ -80,8 +80,8 @@ ifeq ($(OS),Darwin)
 		BUILD_TYPE=metal
 	# disable metal if on Darwin and any other value is explicitly passed.
 	else ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
+		CMAKE_ARGS+=-DLLAMA_METAL=OFF
-		export GGML_NO_ACCELERATE=1
+		export LLAMA_NO_ACCELERATE=1
 	endif
 	ifeq ($(BUILD_TYPE),metal)
@@ -98,13 +98,13 @@ endif
 ifeq ($(BUILD_TYPE),cublas)
 	CGO_LDFLAGS+=-lcublas -lcudart -L$(CUDA_LIBPATH)
-	export GGML_CUDA=1
+	export LLAMA_CUBLAS=1
 	export WHISPER_CUDA=1
 	CGO_LDFLAGS_WHISPER+=-L$(CUDA_LIBPATH)/stubs/ -lcuda -lcufft
 endif
 ifeq ($(BUILD_TYPE),vulkan)
-	CMAKE_ARGS+=-DGGML_VULKAN=1
+	CMAKE_ARGS+=-DLLAMA_VULKAN=1
 endif
 ifeq ($(BUILD_TYPE),hipblas)
@@ -118,13 +118,13 @@ ifeq ($(BUILD_TYPE),hipblas)
 	export WHISPER_HIPBLAS=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
 	AMDGPU_TARGETS ?= "$(GPU_TARGETS)"
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON -DAMDGPU_TARGETS="$(AMDGPU_TARGETS)" -DGPU_TARGETS="$(GPU_TARGETS)"
 	CGO_LDFLAGS += -O3 --rtlib=compiler-rt -unwindlib=libgcc -lhipblas -lrocblas --hip-link -L${ROCM_HOME}/lib/llvm/lib
 endif
 ifeq ($(BUILD_TYPE),metal)
 	CGO_LDFLAGS+=-framework Foundation -framework Metal -framework MetalKit -framework MetalPerformanceShaders
-	export GGML_METAL=1
+	export LLAMA_METAL=1
 	export WHISPER_METAL=1
 endif
@@ -354,7 +354,7 @@ else
 endif
 dist-cross-linux-arm64: 
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
 	STATIC=true $(MAKE) build
 	mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
@@ -711,21 +711,21 @@ backend-assets/grpc/llama-cpp-avx2: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx2
 	$(MAKE) -C backend/cpp/llama-avx2 purge
 	$(info ${GREEN}I llama-cpp build info:avx2${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=on -DLLAMA_AVX512=off -DLLAMA_FMA=on -DLLAMA_F16C=on" $(MAKE) VARIANT="llama-avx2" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx2/grpc-server backend-assets/grpc/llama-cpp-avx2
 backend-assets/grpc/llama-cpp-avx: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-avx
 	$(MAKE) -C backend/cpp/llama-avx purge
 	$(info ${GREEN}I llama-cpp build info:avx${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-avx" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-avx/grpc-server backend-assets/grpc/llama-cpp-avx
 backend-assets/grpc/llama-cpp-fallback: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-fallback
 	$(MAKE) -C backend/cpp/llama-fallback purge
 	$(info ${GREEN}I llama-cpp build info:fallback${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-fallback" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-fallback/grpc-server backend-assets/grpc/llama-cpp-fallback
 # TODO: every binary should have its own folder instead, so can have different metal implementations
 ifeq ($(BUILD_TYPE),metal)
@@ -736,7 +736,7 @@ backend-assets/grpc/llama-cpp-cuda: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-cuda
 	$(MAKE) -C backend/cpp/llama-cuda purge
 	$(info ${GREEN}I llama-cpp build info:cuda${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_AVX=on -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off -DLLAMA_CUDA=ON" $(MAKE) VARIANT="llama-cuda" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-cuda/grpc-server backend-assets/grpc/llama-cpp-cuda
 backend-assets/grpc/llama-cpp-hipblas: backend-assets/grpc
@@ -764,7 +764,7 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc
 	cp -rf backend/cpp/llama backend/cpp/llama-grpc
 	$(MAKE) -C backend/cpp/llama-grpc purge
 	$(info ${GREEN}I llama-cpp build info:grpc${RESET})
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_RPC=ON -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off" TARGET="--target grpc-server --target rpc-server" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
+	CMAKE_ARGS="$(CMAKE_ARGS) -DLLAMA_RPC=ON -DLLAMA_AVX=off -DLLAMA_AVX2=off -DLLAMA_AVX512=off -DLLAMA_FMA=off -DLLAMA_F16C=off" $(MAKE) VARIANT="llama-grpc" build-llama-cpp-grpc-server
 	cp -rfv backend/cpp/llama-grpc/grpc-server backend-assets/grpc/llama-cpp-grpc
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
--- a/backend/cpp/llama/Makefile
+++ b/backend/cpp/llama/Makefile
@@ -4,44 +4,34 @@ LLAMA_VERSION?=
 CMAKE_ARGS?=
 BUILD_TYPE?=
 ONEAPI_VARS?=/opt/intel/oneapi/setvars.sh
 TARGET?=--target grpc-server
-# Disable Shared libs as we are linking on static gRPC and we can't mix shared and static
+# If build type is cublas, then we set -DLLAMA_CUBLAS=ON to CMAKE_ARGS automatically
 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
 # If build type is cublas, then we set -DGGML_CUDA=ON to CMAKE_ARGS automatically
 ifeq ($(BUILD_TYPE),cublas)
-	CMAKE_ARGS+=-DGGML_CUDA=ON
+	CMAKE_ARGS+=-DLLAMA_CUBLAS=ON
-# If build type is openblas then we set -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+# If build type is openblas then we set -DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
 # to CMAKE_ARGS automatically
 else ifeq ($(BUILD_TYPE),openblas)
-	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+	CMAKE_ARGS+=-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS
-# If build type is clblas (openCL) we set -DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+# If build type is clblas (openCL) we set -DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 else ifeq ($(BUILD_TYPE),clblas)
-	CMAKE_ARGS+=-DGGML_CLBLAST=ON -DCLBlast_DIR=/some/path
+	CMAKE_ARGS+=-DLLAMA_CLBLAST=ON -DCLBlast_DIR=/some/path
 # If it's hipblas we do have also to set CC=/opt/rocm/llvm/bin/clang CXX=/opt/rocm/llvm/bin/clang++ 
 else ifeq ($(BUILD_TYPE),hipblas)
-	CMAKE_ARGS+=-DGGML_HIPBLAS=ON
+	CMAKE_ARGS+=-DLLAMA_HIPBLAS=ON
-# If it's OSX, DO NOT embed the metal library - -DGGML_METAL_EMBED_LIBRARY=ON requires further investigation
+# If it's OSX, DO NOT embed the metal library - -DLLAMA_METAL_EMBED_LIBRARY=ON requires further investigation
 # But if it's OSX without metal, disable it here
+# $(OS) is presumably inherited from the top-level Makefile, which sets it from
+# `uname -s` and compares it against "Darwin" (capital D); a lowercase "darwin"
+# would never match and this branch would be skipped.
 else ifeq ($(OS),Darwin)
 	ifneq ($(BUILD_TYPE),metal)
-		CMAKE_ARGS+=-DGGML_METAL=OFF
+		CMAKE_ARGS+=-DLLAMA_METAL=OFF
 	else
 		CMAKE_ARGS+=-DGGML_METAL=ON
 # Until this is tested properly, we disable embedded metal file
 # as we already embed it as part of the LocalAI assets
 		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=OFF
 		TARGET+=--target ggml-metal
 	endif
 endif
 ifeq ($(BUILD_TYPE),sycl_f16)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DGGML_SYCL_F16=ON
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL_F16=ON
 endif
 ifeq ($(BUILD_TYPE),sycl_f32)
-	CMAKE_ARGS+=-DGGML_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
+	CMAKE_ARGS+=-DLLAMA_SYCL=ON -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx
 endif
 llama.cpp:
@@ -72,8 +62,8 @@ grpc-server: llama.cpp llama.cpp/examples/grpc-server
 	@echo "Building grpc-server with $(BUILD_TYPE) build type and $(CMAKE_ARGS)"
 ifneq (,$(findstring sycl,$(BUILD_TYPE)))
 	bash -c "source $(ONEAPI_VARS); \
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)"
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)"
 else
-	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && cmake --build . --config Release $(TARGET)
+	cd llama.cpp && mkdir -p build && cd build && cmake .. $(CMAKE_ARGS) && $(MAKE)
 endif
 	cp llama.cpp/build/bin/grpc-server .
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -886,8 +886,6 @@ struct llama_server_context
            {"task_id", slot->task_id},
        });
        LOG_TEE("sampling: \n%s\n", llama_sampling_print(slot->sparams).c_str());
        return true;
    }
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -142,14 +142,12 @@ func gRPCPredictOpts(c config.BackendConfig, modelPath string) *pb.PredictOption
 		MirostatTAU:         float32(*c.LLMConfig.MirostatTAU),
 		Debug:               *c.Debug,
 		StopPrompts:         c.StopWords,
-		Repeat:              int32(c.RepeatLastN),
+		Repeat:              int32(c.RepeatPenalty),
 		FrequencyPenalty:    float32(c.FrequencyPenalty),
 		PresencePenalty:     float32(c.PresencePenalty),
 		Penalty:             float32(c.RepeatPenalty),
 		NKeep:               int32(c.Keep),
 		Batch:               int32(c.Batch),
 		IgnoreEOS:           c.IgnoreEOS,
 		Seed:                getSeed(c),
 		FrequencyPenalty:    float32(c.FrequencyPenalty),
 		MLock:               *c.MMlock,
 		MMap:                *c.MMap,
 		MainGPU:             c.MainGPU,
--- a/core/cli/transcript.go
+++ b/core/cli/transcript.go
@@ -18,7 +18,7 @@ type TranscriptCMD struct {
 	Backend           string `short:"b" default:"whisper" help:"Backend to run the transcription model"`
 	Model             string `short:"m" required:"" help:"Model name to run the TTS"`
 	Language          string `short:"l" help:"Language of the audio file"`
+	// -t is already used by Threads, so Translate keeps -c to avoid a duplicate short flag.
 	Translate         bool   `short:"c" help:"Translate the transcription to english"`
 	Threads           int    `short:"t" default:"1" help:"Number of threads used for parallel computation"`
 	ModelsPath        string `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
 	BackendAssetsPath string `env:"LOCALAI_BACKEND_ASSETS_PATH,BACKEND_ASSETS_PATH" type:"path" default:"/tmp/localai/backend_data" help:"Path used to extract libraries that are required by some of the backends in runtime" group:"storage"`
--- a/core/http/routes/ui.go
+++ b/core/http/routes/ui.go
@@ -26,7 +26,6 @@ func RegisterUIRoutes(app *fiber.App,
 	appConfig *config.ApplicationConfig,
 	galleryService *services.GalleryService,
 	auth func(*fiber.Ctx) error) {
 	tmpLMS := services.NewListModelsService(ml, cl, appConfig) // TODO: once createApplication() is fully in use, reference the central instance.
 	// keeps the state of models that are being installed from the UI
 	var processingModels = xsync.NewSyncedMap[string, string]()
@@ -236,7 +235,7 @@ func RegisterUIRoutes(app *fiber.App,
 	// Show the Chat page
 	app.Get("/chat/:model", auth, func(c *fiber.Ctx) error {
-		backendConfigs, _ := tmpLMS.ListModels("", true)
+		backendConfigs := cl.GetAllBackendConfigs()
 		summary := fiber.Map{
 			"Title":        "LocalAI - Chat with " + c.Params("model"),
@@ -250,7 +249,7 @@ func RegisterUIRoutes(app *fiber.App,
 	})
 	app.Get("/talk/", auth, func(c *fiber.Ctx) error {
-		backendConfigs, _ := tmpLMS.ListModels("", true)
+		backendConfigs := cl.GetAllBackendConfigs()
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -260,7 +259,7 @@ func RegisterUIRoutes(app *fiber.App,
 		summary := fiber.Map{
 			"Title":        "LocalAI - Talk",
 			"ModelsConfig": backendConfigs,
-			"Model":        backendConfigs[0].ID,
+			"Model":        backendConfigs[0].Name,
 			"Version":      internal.PrintableVersion(),
 		}
@@ -270,7 +269,7 @@ func RegisterUIRoutes(app *fiber.App,
 	app.Get("/chat/", auth, func(c *fiber.Ctx) error {
-		backendConfigs, _ := tmpLMS.ListModels("", true)
+		backendConfigs := cl.GetAllBackendConfigs()
 		if len(backendConfigs) == 0 {
 			// If no model is available redirect to the index which suggests how to install models
@@ -278,9 +277,9 @@ func RegisterUIRoutes(app *fiber.App,
 		}
 		summary := fiber.Map{
-			"Title":        "LocalAI - Chat with " + backendConfigs[0].ID,
+			"Title":        "LocalAI - Chat with " + backendConfigs[0].Name,
 			"ModelsConfig": backendConfigs,
-			"Model":        backendConfigs[0].ID,
+			"Model":        backendConfigs[0].Name,
 			"Version":      internal.PrintableVersion(),
 		}
--- a/core/http/views/chat.html
+++ b/core/http/views/chat.html
@@ -100,10 +100,10 @@ SOFTWARE.
        <option value="" disabled class="text-gray-400" >Select a model</option>
        {{ $model:=.Model}}
        {{ range .ModelsConfig }}
-        {{ if eq .ID $model }}
+        {{ if eq .Name $model }}
-        <option value="/chat/{{.ID}}" selected  class="bg-gray-700 text-white">{{.ID}}</option>
+        <option value="/chat/{{.Name}}" selected  class="bg-gray-700 text-white">{{.Name}}</option>
        {{ else }}
-        <option value="/chat/{{.ID}}" class="bg-gray-700 text-white">{{.ID}}</option>
+        <option value="/chat/{{.Name}}" class="bg-gray-700 text-white">{{.Name}}</option>
        {{ end }}
        {{ end }}
      </select>
--- a/core/http/views/talk.html
+++ b/core/http/views/talk.html
@@ -62,7 +62,7 @@
          <option value="" disabled class="text-gray-400" >Select a model</option>
          {{ range .ModelsConfig }}
-          <option value="{{.ID}}"  class="bg-gray-700 text-white">{{.ID}}</option>
+          <option value="{{.Name}}"  class="bg-gray-700 text-white">{{.Name}}</option>
          {{ end }}
        </select>
      </div>
--- a/core/schema/prediction.go
+++ b/core/schema/prediction.go
@@ -25,10 +25,7 @@ type PredictionOptions struct {
 	Batch         int     `json:"batch" yaml:"batch"`
 	IgnoreEOS     bool    `json:"ignore_eos" yaml:"ignore_eos"`
 	RepeatPenalty float64 `json:"repeat_penalty" yaml:"repeat_penalty"`
-
+	Keep          int     `json:"n_keep" yaml:"n_keep"`
 	RepeatLastN int `json:"repeat_last_n" yaml:"repeat_last_n"`
 	Keep int `json:"n_keep" yaml:"n_keep"`
 	FrequencyPenalty float64  `json:"frequency_penalty" yaml:"frequency_penalty"`
 	PresencePenalty  float64  `json:"presence_penalty" yaml:"presence_penalty"`
--- a/docs/content/docs/advanced/fine-tuning.md
+++ b/docs/content/docs/advanced/fine-tuning.md
@@ -118,7 +118,7 @@ And we convert it to the gguf format that LocalAI can consume:
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make GGML_CUDA=1 && popd
+pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
 # We need to convert the pytorch model into ggml for quantization
 # It creates 'ggml-model-f16.bin' in the 'merged' directory.
--- a/docs/content/docs/faq.md
+++ b/docs/content/docs/faq.md
@@ -55,4 +55,4 @@ This typically happens when your prompt exceeds the context size. Try to reduce
 ### I'm getting a 'SIGILL' error, what's wrong?
-Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build`
+Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build`
--- a/docs/content/docs/getting-started/build.md
+++ b/docs/content/docs/getting-started/build.md
@@ -101,14 +101,14 @@ Here is the list of the variables available that can be used to customize the bu
 LocalAI uses different backends based on ggml and llama.cpp to run models. If your CPU doesn't support common instruction sets, you can disable them during build:
 ```
-CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" make build
+CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" make build
 ```
 To have effect on the container image, you need to set `REBUILD=true`:
 ```
 docker run  quay.io/go-skynet/localai
-docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_AVX=OFF -DGGML_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
+docker run --rm -ti -p 8080:8080 -e DEBUG=true -e MODELS_PATH=/models -e THREADS=1 -e REBUILD=true -e CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_AVX=OFF -DLLAMA_FMA=OFF" -v $PWD/models:/models quay.io/go-skynet/local-ai:latest
 ```
 {{% /alert %}}
--- a/docs/content/docs/getting-started/quickstart.md
+++ b/docs/content/docs/getting-started/quickstart.md
@@ -8,16 +8,6 @@ icon = "rocket_launch"
 **LocalAI** is a free, open-source alternative to OpenAI (Anthropic, etc.), functioning as a drop-in replacement REST API for local inferencing. It allows you to run [LLMs]({{% relref "docs/features/text-generation" %}}), generate images, and produce audio, all locally or on-premises with consumer-grade hardware, supporting multiple model families and architectures.
 {{% alert icon="💡" %}}
 **Security considerations**
 If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately, either with a mechanism that filters incoming traffic or by running LocalAI with `API_KEY` set so that access is gated by an API key. The API key grants full access to all features (there is no role separation), so it should be treated like an admin credential.
 To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI.
 {{% /alert %}}
 ## Using the Bash Installer
 Install LocalAI easily using the bash installer with the following command:
--- a/entrypoint.sh
+++ b/entrypoint.sh
@@ -22,7 +22,7 @@ else
 	echo "@@@@@"
 	echo "If you are experiencing issues with the pre-compiled builds, try setting REBUILD=true"
 	echo "If you are still experiencing issues with the build, try setting CMAKE_ARGS and disable the instructions set as needed:"
-	echo 'CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF"'
+	echo 'CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF"'
 	echo "see the documentation at: https://localai.io/basics/build/index.html"
 	echo "Note: See also https://github.com/go-skynet/LocalAI/issues/288"
 	echo "@@@@@"
--- a/examples/e2e-fine-tuning/README.md
+++ b/examples/e2e-fine-tuning/README.md
@@ -65,7 +65,7 @@ And we convert it to the gguf format that LocalAI can consume:
 # Convert to gguf
 git clone https://github.com/ggerganov/llama.cpp.git
-pushd llama.cpp && make GGML_CUDA=1 && popd
+pushd llama.cpp && make LLAMA_CUBLAS=1 && popd
 # We need to convert the pytorch model into ggml for quantization
 # It creates 'ggml-model-f16.bin' in the 'merged' directory.
--- a/examples/e2e-fine-tuning/notebook.ipynb
+++ b/examples/e2e-fine-tuning/notebook.ipynb
@@ -1600,7 +1600,7 @@
      "source": [
        "\n",
        "!git clone https://github.com/ggerganov/llama.cpp.git\n",
-        "!cd llama.cpp && make GGML_CUDA=1\n",
+        "!cd llama.cpp && make LLAMA_CUBLAS=1\n",
        "\n"
      ]
    },
--- a/examples/telegram-bot/docker-compose.yml
+++ b/examples/telegram-bot/docker-compose.yml
@@ -2,7 +2,7 @@ version: "3"
 services:
  api:
-    image: quay.io/go-skynet/local-ai:latest
+    image: quay.io/go-skynet/local-ai:v1.18.0-ffmpeg
    # As initially LocalAI will download the models defined in PRELOAD_MODELS
    # you might need to tweak the healthcheck values here according to your network connection.
    # Here we give a timespan of 20m to download all the required files.
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -92,41 +92,6 @@
    - filename: qwen2-1.5b-instruct-q8_0.gguf
      sha256: c9d33989d77f4bd6966084332087921b9613eda01d5f44dc0b4e9a7382a2bfbb
      uri: huggingface://DeepMount00/Qwen2-1.5B-Ita-GGUF/qwen2-1.5b-instruct-q8_0.gguf
 - !!merge <<: *qwen2
  name: "einstein-v7-qwen2-7b"
  icon: https://cdn-uploads.huggingface.co/production/uploads/6468ce47e134d050a58aa89c/KLQP1jK-DIzpwHzYRIH-Q.png
  description: |
    This model is a full fine-tuned version of Qwen/Qwen2-7B on diverse datasets.
  urls:
    - https://huggingface.co/Weyaxi/Einstein-v7-Qwen2-7B
    - https://huggingface.co/bartowski/Einstein-v7-Qwen2-7B-GGUF
  overrides:
    parameters:
      model: Einstein-v7-Qwen2-7B-Q4_K_M.gguf
  files:
    - filename: Einstein-v7-Qwen2-7B-Q4_K_M.gguf
      sha256: 277b212ea65894723d2b86fb0f689fa5ecb54c9794f0fd2fb643655dc62812ce
      uri: huggingface://bartowski/Einstein-v7-Qwen2-7B-GGUF/Einstein-v7-Qwen2-7B-Q4_K_M.gguf
 - !!merge <<: *qwen2
  name: "arcee-spark"
  icon: https://i.ibb.co/80ssNWS/o-Vdk-Qx-ARNmzr-Pi1h-Efj-SA.webp
  description: |
    Arcee Spark is a powerful 7B parameter language model that punches well above its weight class. Initialized from Qwen2, this model underwent a sophisticated training process:
        Fine-tuned on 1.8 million samples
        Merged with Qwen2-7B-Instruct using Arcee's mergekit
        Further refined using Direct Preference Optimization (DPO)
    This meticulous process results in exceptional performance, with Arcee Spark achieving the highest score on MT-Bench for models of its size, outperforming even GPT-3.5 on many tasks.
  urls:
    - https://huggingface.co/arcee-ai/Arcee-Spark-GGUF
  overrides:
    parameters:
      model: Arcee-Spark-Q4_K_M.gguf
  files:
    - filename: Arcee-Spark-Q4_K_M.gguf
      sha256: 44123276d7845dc13f73ca4aa431dc4c931104eb7d2186f2a73d076fa0ee2330
      uri: huggingface://arcee-ai/Arcee-Spark-GGUF/Arcee-Spark-Q4_K_M.gguf
 - &mistral03
  ## START Mistral
  url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
@@ -312,34 +277,6 @@
    - filename: gemma-1.1-7b-it-Q4_K_M.gguf
      sha256: 47821da72ee9e80b6fd43c6190ad751b485fb61fa5664590f7a73246bcd8332e
      uri: huggingface://bartowski/gemma-1.1-7b-it-GGUF/gemma-1.1-7b-it-Q4_K_M.gguf
 - !!merge <<: *gemma
  name: "gemma-2-27b-it"
  urls:
    - https://huggingface.co/google/gemma-2-27b-it
    - https://huggingface.co/bartowski/gemma-2-27b-it-GGUF
  description: |
      Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.
  overrides:
    parameters:
      model: gemma-2-27b-it-Q4_K_M.gguf
  files:
    - filename: gemma-2-27b-it-Q4_K_M.gguf
      sha256: e54e7b800d464af4fa9966020e4a1b1d386cd9346de2d851a7bfe7d0797c44c4
      uri: huggingface://bartowski/gemma-2-27b-it-GGUF/gemma-2-27b-it-Q4_K_M.gguf
 - !!merge <<: *gemma
  name: "gemma-2-9b-it"
  urls:
    - https://huggingface.co/google/gemma-2-9b-it
    - https://huggingface.co/bartowski/gemma-2-9b-it-GGUF
  description: |
      Gemma is a family of lightweight, state-of-the-art open models from Google, built from the same research and technology used to create the Gemini models. They are text-to-text, decoder-only large language models, available in English, with open weights for both pre-trained variants and instruction-tuned variants. Gemma models are well-suited for a variety of text generation tasks, including question answering, summarization, and reasoning. Their relatively small size makes it possible to deploy them in environments with limited resources such as a laptop, desktop or your own cloud infrastructure, democratizing access to state of the art AI models and helping foster innovation for everyone.
  overrides:
    parameters:
      model: gemma-2-9b-it-Q4_K_M.gguf
  files:
    - filename: gemma-2-9b-it-Q4_K_M.gguf
      sha256: 0874bf61be2e4b3d0a4a75e58fbd442dc410745d513c1e1e5de0b54ae33e65db
      uri: huggingface://bartowski/gemma-2-9b-it-GGUF/gemma-2-9b-it-Q4_K_M.gguf
 - &llama3
  url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -2045,25 +1982,6 @@
    - filename: Llama-3-Update-3.0-mmproj-model-f16.gguf
      sha256: 3d2f36dff61d6157cadf102df86a808eb9f8a230be1bc0bc99039d81a895468a
      uri: huggingface://Nitral-AI/Llama-3-Update-3.0-mmproj-model-f16/Llama-3-Update-3.0-mmproj-model-f16.gguf
 - !!merge <<: *llama3
  name: "llama3-8b-darkidol-1.2-iq-imatrix"
  urls:
    - https://huggingface.co/LWDCLS/llama3-8B-DarkIdol-1.2-GGUF-IQ-Imatrix-Request
    - https://huggingface.co/aifeifei798/llama3-8B-DarkIdol-1.2
  description: |
    The module combination has been readjusted to better fulfill various roles and has been adapted for mobile phones.
  icon: https://huggingface.co/aifeifei798/llama3-8B-DarkIdol-1.2/resolve/main/llama3-8B-DarkIdol-1.2.png
  overrides:
    mmproj: Llama-3-Update-3.0-mmproj-model-f16.gguf
    parameters:
      model: llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
  files:
    - filename: llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
      sha256: dce2f5f1661f49fb695b038d973770b0d9059bced4e4bb212f6517aa219131cd
      uri: huggingface://LWDCLS/llama3-8B-DarkIdol-1.2-GGUF-IQ-Imatrix-Request/llama3-8B-DarkIdol-1.2-Q4_K_M-imat.gguf
    - filename: Llama-3-Update-3.0-mmproj-model-f16.gguf
      sha256: 3d2f36dff61d6157cadf102df86a808eb9f8a230be1bc0bc99039d81a895468a
      uri: huggingface://Nitral-AI/Llama-3-Update-3.0-mmproj-model-f16/Llama-3-Update-3.0-mmproj-model-f16.gguf
 - &chatml
  ### ChatML
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
--- a/pkg/functions/options.go
+++ b/pkg/functions/options.go
@@ -42,9 +42,3 @@ func SetPrefix(suffix string) func(*GrammarOption) {
 		o.Prefix = suffix
 	}
 }
 // SetPropOrder builds a grammar option that stores the given property
 // ordering string in the PropOrder field of the GrammarOption it is applied to.
 func SetPropOrder(order string) func(*GrammarOption) {
 	setOrder := func(opt *GrammarOption) {
 		opt.PropOrder = order
 	}
 	return setOrder
 }
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -32,11 +32,6 @@ type GrammarConfig struct {
 	// ExpectStringsAfterJSON enables mixed string suffix
 	ExpectStringsAfterJSON bool `yaml:"expect_strings_after_json"`
 	// PropOrder selects what order to print properties
 	// for instance name,arguments will make print { "name": "foo", "arguments": { "bar": "baz" } }
 	// instead of { "arguments": { "bar": "baz" }, "name": "foo" }
 	PropOrder string `yaml:"properties_order"`
 }
 // FunctionsConfig is the configuration for the tool/function call.
@@ -109,8 +104,6 @@ func (g GrammarConfig) Options() []func(o *GrammarOption) {
 	if g.ExpectStringsAfterJSON {
 		opts = append(opts, ExpectStringsAfterJSON)
 	}
 	opts = append(opts, SetPropOrder(g.PropOrder))
 	return opts
 }
--- a/swagger/docs.go
+++ b/swagger/docs.go
@@ -701,9 +701,6 @@ const docTemplate = `{
                "prompt": {
                    "description": "Prompt is read only by completion/image API calls"
                },
                "repeat_last_n": {
                    "type": "integer"
                },
                "repeat_penalty": {
                    "type": "number"
                },
@@ -754,10 +751,6 @@ const docTemplate = `{
                    "description": "Common options between all the API calls, part of the OpenAI spec",
                    "type": "number"
                },
                "translate": {
                    "description": "Only for audio transcription",
                    "type": "boolean"
                },
                "typical_p": {
                    "type": "number"
                },
--- a/swagger/swagger.json
+++ b/swagger/swagger.json
@@ -694,9 +694,6 @@
                "prompt": {
                    "description": "Prompt is read only by completion/image API calls"
                },
                "repeat_last_n": {
                    "type": "integer"
                },
                "repeat_penalty": {
                    "type": "number"
                },
@@ -747,10 +744,6 @@
                    "description": "Common options between all the API calls, part of the OpenAI spec",
                    "type": "number"
                },
                "translate": {
                    "description": "Only for audio transcription",
                    "type": "boolean"
                },
                "typical_p": {
                    "type": "number"
                },
--- a/swagger/swagger.yaml
+++ b/swagger/swagger.yaml
@@ -292,8 +292,6 @@ definitions:
        type: number
      prompt:
        description: Prompt is read only by completion/image API calls
      repeat_last_n:
        type: integer
      repeat_penalty:
        type: number
      response_format:
@@ -330,9 +328,6 @@ definitions:
        description: Common options between all the API calls, part of the OpenAI
          spec
        type: number
      translate:
        description: Only for audio transcription
        type: boolean
      typical_p:
        type: number
      use_fast_tokenizer:
--- a/tests/e2e-aio/e2e_suite_test.go
+++ b/tests/e2e-aio/e2e_suite_test.go
@@ -1,6 +1,7 @@
 package e2e_test
 import (
 	"bytes"
 	"context"
 	"fmt"
 	"os"
@@ -38,7 +39,7 @@ var _ = BeforeSuite(func() {
 	var defaultConfig openai.ClientConfig
 	if apiEndpoint == "" {
-		startDockerImage()
+		startDockerImage("")
 		defaultConfig = openai.DefaultConfig(apiKey)
 		apiEndpoint = "http://localhost:" + apiPort + "/v1" // So that other tests can reference this value safely.
 		defaultConfig.BaseURL = apiEndpoint
@@ -58,9 +59,41 @@ var _ = BeforeSuite(func() {
 })
 var _ = AfterSuite(func() {
 	// If the suite failed, dump the container's stdout/stderr and its final
 	// state to the console so the failure can be diagnosed from the test output.
 	if CurrentGinkgoTestDescription().Failed {
 		if resource != nil {
 			// Collect both streams into a single in-memory buffer.
 			logs := bytes.NewBufferString("")
 			err := pool.Client.Logs(docker.LogsOptions{
 				Container:    resource.Container.ID,
 				OutputStream: logs,
 				ErrorStream:  logs,
 				Stdout:       true,
 				Stderr:       true,
 				Timestamps:   true,
 			})
 			if err != nil {
 				fmt.Println("Could not take logs for failed suite", err.Error())
 			}
 			// Print whatever was captured, even when Logs reported an error.
 			fmt.Println("Suite failed, printing logs")
 			fmt.Println(logs.String())
 			c, err := pool.Client.InspectContainer(resource.Container.ID)
 			if err != nil {
 				fmt.Println("Could not inspect container", err.Error())
 			} else if c != nil {
 				// Only read c.State when the inspect call succeeded:
 				// dereferencing c after a failed inspect risks a nil-pointer
 				// panic here, which would hide the original failure and skip
 				// the cleanup that follows.
 				fmt.Println("Container state")
 				fmt.Println("Running:", c.State.Running)
 				fmt.Println("ExitCode:", c.State.ExitCode)
 				fmt.Println("Error:", c.State.Error)
 			}
 		}
 	}
 	if resource != nil {
 		Expect(pool.Purge(resource)).To(Succeed())
 	}
 	//dat, err := os.ReadFile(resource.Container.LogPath)
 	//Expect(err).To(Not(HaveOccurred()))
 	//Expect(string(dat)).To(ContainSubstring("GRPC Service Ready"))
@@ -71,8 +104,8 @@ var _ = AfterEach(func() {
 	//Expect(dbClient.Clear()).To(Succeed())
 })
-func startDockerImage() {
+func startDockerImage(endpoint string) {
-	p, err := dockertest.NewPool("")
+	p, err := dockertest.NewPool(endpoint)
 	Expect(err).To(Not(HaveOccurred()))
 	Expect(p.Client.Ping()).To(Succeed())
`@@ -55,4 +55,4 @@ This typically happens when your prompt exceeds the context size. Try to reduce`

	`### I'm getting a 'SIGILL' error, what's wrong?`	`### I'm getting a 'SIGILL' error, what's wrong?`

	Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DGGML_F16C=OFF -DGGML_AVX512=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF" make build`	Your CPU probably does not have support for certain instructions that are compiled by default in the pre-built binaries. If you are running in a container, try setting `REBUILD=true` and disable the CPU instructions that are not compatible with your CPU. For instance: `CMAKE_ARGS="-DLLAMA_F16C=OFF -DLLAMA_AVX512=OFF -DLLAMA_AVX2=OFF -DLLAMA_FMA=OFF" make build`