Merge branch 'master' into disable_grammar_by_default

feat(grammar): mark grammar disabled by default
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-06-07 16:27:09 -04:00 · 2024-07-19 09:28:00 +02:00 · 2024-07-19 09:07:29 +02:00
48 changed files with 211 additions and 699 deletions
--- a/.github/workflows/disabled/comment-pr.yaml
+++ b/.github/workflows/disabled/comment-pr.yaml
--- a/.github/workflows/image-pr.yml
+++ b/.github/workflows/image-pr.yml
@@ -47,7 +47,7 @@ jobs:
          #   makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -120,7 +120,7 @@ jobs:
          #   makeflags: "--jobs=3 --output-sync=target"
          # - build-type: 'cublas'
          #   cuda-major-version: "12"
-          #   cuda-minor-version: "0"
+          #   cuda-minor-version: "4"
          #   platforms: 'linux/amd64'
          #   tag-latest: 'false'
          #   tag-suffix: '-cublas-cuda12-ffmpeg-core'
--- a/.github/workflows/image.yml
+++ b/.github/workflows/image.yml
@@ -75,7 +75,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12'
@@ -100,7 +100,7 @@ jobs:
            makeflags: "--jobs=3 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'auto'
            tag-suffix: '-cublas-cuda12-ffmpeg'
@@ -285,7 +285,7 @@ jobs:
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-core'
@@ -307,7 +307,7 @@ jobs:
            makeflags: "--jobs=4 --output-sync=target"
          - build-type: 'cublas'
            cuda-major-version: "12"
-            cuda-minor-version: "0"
+            cuda-minor-version: "4"
            platforms: 'linux/amd64'
            tag-latest: 'false'
            tag-suffix: '-cublas-cuda12-ffmpeg-core'
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -4,8 +4,6 @@ on:
  push:
    branches:
      - master
-    tags:
-      - 'v*'
  pull_request:

 env:
@@ -31,10 +29,11 @@ jobs:
        with:
          go-version: '1.21.x'
          cache: false
+
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk
+          sudo apt-get install build-essential ffmpeg protobuf-compiler ccache gawk
          sudo apt-get install -qy binutils-aarch64-linux-gnu gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgmock-dev
      - name: Install CUDA Dependencies
        run: |
@@ -150,7 +149,7 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache upx-ucl gawk cmake libgmock-dev
+          sudo apt-get install -y wget curl build-essential ffmpeg protobuf-compiler ccache gawk cmake libgmock-dev
      - name: Intel Dependencies
        run: |
          wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null
@@ -251,7 +250,7 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache upx-ucl
+          sudo apt-get install -y --no-install-recommends libopencv-dev protobuf-compiler ccache
          go install google.golang.org/grpc/cmd/protoc-gen-go-grpc@1958fcbe2ca8bd93af633f11e97d44e567e945af
          go install google.golang.org/protobuf/cmd/protoc-gen-go@v1.34.2
      - name: Build stablediffusion
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -70,7 +70,7 @@ jobs:
      - name: Dependencies
        run: |
          sudo apt-get update
-          sudo apt-get install build-essential ccache upx-ucl curl ffmpeg
+          sudo apt-get install build-essential curl ffmpeg
          sudo apt-get install -y libgmock-dev
          curl https://repo.anaconda.com/pkgs/misc/gpgkeys/anaconda.asc | gpg --dearmor > conda.gpg && \
             sudo install -o root -g root -m 644 conda.gpg /usr/share/keyrings/conda-archive-keyring.gpg && \
--- a/4
+++ b/4
@@ -24,7 +24,7 @@ RUN apt-get update && \
        cmake \
        curl \
        git \
-        unzip upx-ucl && \
+        unzip && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

@@ -99,7 +99,7 @@ FROM requirements-${IMAGE_TYPE} AS requirements-drivers

 ARG BUILD_TYPE
 ARG CUDA_MAJOR_VERSION=12
-ARG CUDA_MINOR_VERSION=0
+ARG CUDA_MINOR_VERSION=4

 ENV BUILD_TYPE=${BUILD_TYPE}

--- a/53
+++ b/53
@@ -8,7 +8,7 @@ DETECT_LIBS?=true
 # llama.cpp versions
 GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
-CPPLLAMA_VERSION?=081fe431aa8fb6307145c4feb3eed4f48cab19f8
+CPPLLAMA_VERSION?=705b7ecf60e667ced57c15d67aa86865e3cc7aa7

 # gpt4all version
 GPT4ALL_REPO?=https://github.com/nomic-ai/gpt4all
@@ -58,7 +58,7 @@ RANDOM := $(shell bash -c 'echo $$RANDOM')

 VERSION?=$(shell git describe --always --tags || echo "dev" )
 # go tool nm ./local-ai | grep Commit
-LD_FLAGS?=-s -w
+LD_FLAGS?=
 override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Version=$(VERSION)"
 override LD_FLAGS += -X "github.com/mudler/LocalAI/internal.Commit=$(shell git rev-parse HEAD)"

@@ -72,14 +72,6 @@ WHITE  := $(shell tput -Txterm setaf 7)
 CYAN   := $(shell tput -Txterm setaf 6)
 RESET  := $(shell tput -Txterm sgr0)

-UPX?=
-# check if upx exists
-ifeq (, $(shell which upx))
-	UPX=
-else
-	UPX=$(shell which upx)
-endif
-
 # Default Docker bridge IP
 E2E_BRIDGE_IP?=172.17.0.1

@@ -385,7 +377,6 @@ build: prepare backend-assets grpcs ## Build the project
 	$(info ${GREEN}I BUILD_TYPE: ${YELLOW}$(BUILD_TYPE)${RESET})
 	$(info ${GREEN}I GO_TAGS: ${YELLOW}$(GO_TAGS)${RESET})
 	$(info ${GREEN}I LD_FLAGS: ${YELLOW}$(LD_FLAGS)${RESET})
-	$(info ${GREEN}I UPX: ${YELLOW}$(UPX)${RESET})
 ifneq ($(BACKEND_LIBS),)
 	$(MAKE) backend-assets/lib
 	cp -f $(BACKEND_LIBS) backend-assets/lib/
@@ -430,7 +421,7 @@ else
 endif

 dist-cross-linux-arm64:
-	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" GO_TAGS="p2p" \
+	CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_NATIVE=off" GRPC_BACKENDS="backend-assets/grpc/llama-cpp-fallback backend-assets/grpc/llama-cpp-grpc backend-assets/util/llama-cpp-rpc-server" \
 	STATIC=true $(MAKE) build
 	mkdir -p release
 # if BUILD_ID is empty, then we don't append it to the binary name
@@ -480,7 +471,7 @@ prepare-e2e:
 	mkdir -p $(TEST_DIR)
 	cp -rfv $(abspath ./tests/e2e-fixtures)/gpu.yaml $(TEST_DIR)/gpu.yaml
 	test -e $(TEST_DIR)/ggllm-test-model.bin || wget -q https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q2_K.gguf -O $(TEST_DIR)/ggllm-test-model.bin
-	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=0 --build-arg FFMPEG=true -t localai-tests .
+	docker build --build-arg GRPC_BACKENDS="$(GRPC_BACKENDS)" --build-arg IMAGE_TYPE=core --build-arg BUILD_TYPE=$(BUILD_TYPE) --build-arg CUDA_MAJOR_VERSION=12 --build-arg CUDA_MINOR_VERSION=4 --build-arg FFMPEG=true -t localai-tests .

 run-e2e-image:
 	ls -liah $(abspath ./tests/e2e-fixtures)
@@ -742,22 +733,13 @@ backend-assets/grpc: protogen-go replace
 backend-assets/grpc/bert-embeddings: sources/go-bert.cpp sources/go-bert.cpp/libgobert.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-bert.cpp LIBRARY_PATH=$(CURDIR)/sources/go-bert.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bert-embeddings ./backend/go/llm/bert/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/bert-embeddings
-endif

 backend-assets/grpc/gpt4all: sources/gpt4all sources/gpt4all/gpt4all-bindings/golang/libgpt4all.a backend-assets/gpt4all backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ LIBRARY_PATH=$(CURDIR)/sources/gpt4all/gpt4all-bindings/golang/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/gpt4all ./backend/go/llm/gpt4all/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/gpt4all
-endif

 backend-assets/grpc/huggingface: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/huggingface ./backend/go/llm/langchain/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/huggingface
-endif

 backend/cpp/llama/llama.cpp:
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/llama llama.cpp
@@ -783,9 +765,6 @@ else
 	echo "BUILD_GRPC_FOR_BACKEND_LLAMA is not defined."
 	LLAMA_VERSION=$(CPPLLAMA_VERSION) $(MAKE) -C backend/cpp/${VARIANT} grpc-server
 endif
-ifneq ($(UPX),)
-	$(UPX) backend/cpp/${VARIANT}/grpc-server
-endif

 # This target is for manually building a variant with-auto detected flags
 backend-assets/grpc/llama-cpp: backend-assets/grpc backend/cpp/llama/llama.cpp
@@ -858,57 +837,33 @@ backend-assets/grpc/llama-cpp-grpc: backend-assets/grpc backend/cpp/llama/llama.
 backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
-ifneq ($(UPX),)
-	$(UPX) backend-assets/util/llama-cpp-rpc-server
-endif

 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/llama-ggml
-endif

 backend-assets/grpc/piper: sources/go-piper sources/go-piper/libpiper_binding.a backend-assets/grpc backend-assets/espeak-ng-data
 	CGO_CXXFLAGS="$(PIPER_CGO_CXXFLAGS)" CGO_LDFLAGS="$(PIPER_CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/sources/go-piper \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/piper ./backend/go/tts/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/piper
-endif

 backend-assets/grpc/rwkv: sources/go-rwkv.cpp sources/go-rwkv.cpp/librwkv.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-rwkv.cpp LIBRARY_PATH=$(CURDIR)/sources/go-rwkv.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/rwkv ./backend/go/llm/rwkv
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/rwkv
-endif

 backend-assets/grpc/stablediffusion: sources/go-stable-diffusion sources/go-stable-diffusion/libstablediffusion.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" CPATH="$(CPATH):$(CURDIR)/sources/go-stable-diffusion/:/usr/include/opencv4" LIBRARY_PATH=$(CURDIR)/sources/go-stable-diffusion/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/stablediffusion ./backend/go/image/stablediffusion
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/stablediffusion
-endif

 backend-assets/grpc/tinydream: sources/go-tiny-dream sources/go-tiny-dream/libtinydream.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" LIBRARY_PATH=$(CURDIR)/go-tiny-dream \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/tinydream ./backend/go/image/tinydream
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/tinydream
-endif

 backend-assets/grpc/whisper: sources/whisper.cpp sources/whisper.cpp/libwhisper.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS) $(CGO_LDFLAGS_WHISPER)" C_INCLUDE_PATH="$(CURDIR)/sources/whisper.cpp/include:$(CURDIR)/sources/whisper.cpp/ggml/include" LIBRARY_PATH=$(CURDIR)/sources/whisper.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/whisper ./backend/go/transcribe/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/whisper
-endif

 backend-assets/grpc/local-store: backend-assets/grpc
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/local-store ./backend/go/stores/
-ifneq ($(UPX),)
-	$(UPX) backend-assets/grpc/local-store
-endif

 grpcs: prepare $(GRPC_BACKENDS)

--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -2259,6 +2259,7 @@ static void params_parse(const backend::ModelOptions* request,
     // get the directory of modelfile
     std::string model_dir = params.model.substr(0, params.model.find_last_of("/\\"));
     params.lora_adapter.push_back(std::make_tuple(model_dir + "/"+request->loraadapter(), scale_factor));
+     params.lora_base  =  model_dir + "/"+request->lorabase();
    }
    params.use_mlock = request->mlock();
    params.use_mmap = request->mmap();
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 torch
 certifi
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 bark==0.1.5
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 transformers
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,2 +1,2 @@
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 TTS==0.22.0
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 transformers
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -3,7 +3,7 @@ accelerate
 compel
 peft
 diffusers
-grpcio==1.65.1
+grpcio==1.65.0
 opencv-python
 pillow
 protobuf
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,5 +1,5 @@
 accelerate
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 torch
--- a/backend/python/mamba/requirements.txt
+++ b/backend/python/mamba/requirements.txt
@@ -1,6 +1,6 @@
 causal-conv1d==1.4.0
 mamba-ssm==2.2.2
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 transformers
--- a/backend/python/openvoice/requirements-intel.txt
+++ b/backend/python/openvoice/requirements-intel.txt
@@ -2,7 +2,7 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-grpcio==1.65.1
+grpcio==1.64.1
 protobuf
 librosa==0.9.1
 faster-whisper==1.0.3
--- a/backend/python/openvoice/requirements.txt
+++ b/backend/python/openvoice/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 librosa
 faster-whisper
--- a/backend/python/parler-tts/requirements.txt
+++ b/backend/python/parler-tts/requirements.txt
@@ -1,5 +1,5 @@
 accelerate
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 torch
 git+https://github.com/huggingface/parler-tts.git@10016fb0300c0dc31a0fb70e26f3affee7b62f16
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 rerankers[transformers]
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 transformers
--- a/backend/python/sentencetransformers/requirements-intel.txt
+++ b/backend/python/sentencetransformers/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/sentencetransformers/requirements.txt
+++ b/backend/python/sentencetransformers/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 sentence-transformers==3.0.1
 transformers
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
--- a/backend/python/transformers-musicgen/requirements-intel.txt
+++ b/backend/python/transformers-musicgen/requirements-intel.txt
@@ -2,4 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers-musicgen/requirements.txt
+++ b/backend/python/transformers-musicgen/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 transformers
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 torch
 scipy==1.14.0
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -2,3 +2,4 @@
 intel-extension-for-pytorch
 torch
 optimum[openvino]
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,9 +1,9 @@
 accelerate
 transformers
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 torch
 certifi
 intel-extension-for-transformers
 bitsandbytes
-setuptools==69.5.1 # https://github.com/mudler/LocalAI/issues/2406
+setuptools==70.3.0 # https://github.com/mudler/LocalAI/issues/2406
--- a/backend/python/vall-e-x/requirements.txt
+++ b/backend/python/vall-e-x/requirements.txt
@@ -1,4 +1,4 @@
 accelerate
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 vllm
-grpcio==1.65.1
+grpcio==1.65.0
 protobuf
 certifi
 transformers
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -201,7 +201,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, startup
 		}

 		switch {
-		case !config.FunctionsConfig.GrammarConfig.NoGrammar && shouldUseFn:
+		case config.FunctionsConfig.GrammarConfig.EnableGrammar && shouldUseFn:
 			noActionGrammar := functions.Function{
 				Name:        noActionName,
 				Description: noActionDescription,
--- a/core/http/views/p2p.html
+++ b/core/http/views/p2p.html
@@ -16,16 +16,7 @@
                </a> 
            </h2> 
            <h5 class="mb-4 text-justify">LocalAI uses P2P technologies to enable distribution of work between peers. It is possible to share an instance with Federation and/or split the weights of a model across peers (only available with llama.cpp models). You can now share computational resources between your devices or your friends!</h5>
-            <!-- Warning box if p2p token is empty and p2p is enabled -->
-            {{ if and .IsP2PEnabled (eq .P2PToken "") }}
-            <div class="bg-red-500 p-4 rounded-lg shadow-lg mb-12 text-left">
-                <p class="text-xl font-semibold text-white"> <i class="fa-solid fa-exclamation-triangle"></i> Warning: P2P mode is disabled or no token was specified</p>
-                <p class="mb-4">You have to enable P2P mode by starting LocalAI with <code>--p2p</code>. Please restart the server with <code>--p2p</code> to generate a new token automatically that can be used to automatically discover other nodes. If you already have a token specify it with <code>export TOKEN=".."</code> <a href="https://localai.io/features/distribute/" target="_blank">
-                    Check out the documentation for more information.
-                </a> </p>
-            </div>
-            {{ else }}
-
+            
            <!-- Federation Box -->
            <div class="bg-gray-800 p-6 rounded-lg shadow-lg mb-12 text-left">

@@ -137,8 +128,7 @@
                    </div>
                </div>
            </div>
-            <!-- Llama.cpp Box END -->    
-            {{ end }}   
+            <!-- Llama.cpp Box END -->       
        </div>
    </div>

--- a/docs/content/docs/features/distributed_inferencing.md
+++ b/docs/content/docs/features/distributed_inferencing.md
@@ -5,65 +5,17 @@ weight = 15
 url = "/features/distribute/"
 +++

-
-This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance. Nodes are automatically discovered and connect via p2p by using a shared token which makes sure the communication is secure and private between the nodes of the network.
-
-LocalAI supports two modes of distributed inferencing via p2p:
-
- **Federated Mode**: Requests are shared between the cluster and routed to a single worker node in the network based on the load balancer's decision.
- **Worker Mode** (aka "model sharding" or "splitting weights"): Requests are processed by all the workers which contributes to the final inference result (by sharing the model weights).
-
-## Usage
-
-Starting LocalAI with `--p2p` generates a shared token for connecting multiple instances: and that's all you need to create AI clusters, eliminating the need for intricate network setups. 
-
-Simply navigate to the "Swarm" section in the WebUI and follow the on-screen instructions.
-
-For fully shared instances, initiate LocalAI with --p2p --federated and adhere to the Swarm section's guidance. This feature, while still experimental, offers a tech preview quality experience.
-
-### Federated mode
-
-Federated mode allows to launch multiple LocalAI instances and connect them together in a federated network. This mode is useful when you want to distribute the load of the inference across multiple nodes, but you want to have a single point of entry for the API. In the Swarm section of the WebUI, you can see the instructions to connect multiple instances together.
-
-![346663124-1d2324fd-8b55-4fa2-9856-721a467969c2](https://github.com/user-attachments/assets/19ebd44a-20ff-412c-b92f-cfb8efbe4b21)
-
-To start a LocalAI server in federated mode, run:
-
-```bash
-local-ai run --p2p --federated
-```
-
-This will generate a token that you can use to connect other LocalAI instances to the network or others can use to join the network. If you already have a token, you can specify it using the `TOKEN` environment variable.
-
-To start a load balanced server that routes the requests to the network, run with the `TOKEN`:
-
-```bash
-local-ai federated
-```
-
-To see all the available options, run `local-ai federated --help`.
-
-The instructions are displayed in the "Swarm" section of the WebUI, guiding you through the process of connecting multiple instances.
-
-### Workers mode
-
 {{% alert note %}}
 This feature is available exclusively with llama-cpp compatible models.

 This feature was introduced in [LocalAI pull request #2324](https://github.com/mudler/LocalAI/pull/2324) and is based on the upstream work in [llama.cpp pull request #6829](https://github.com/ggerganov/llama.cpp/pull/6829).
 {{% /alert %}}

-To connect multiple workers to a single LocalAI instance, start first a server in p2p mode:
+This functionality enables LocalAI to distribute inference requests across multiple worker nodes, improving efficiency and performance.

-```bash
-local-ai run --p2p
-```
+## Usage

-And navigate the WebUI to the "Swarm" section to see the instructions to connect multiple workers to the network.
-
-![346663124-1d2324fd-8b55-4fa2-9856-721a467969c2](https://github.com/user-attachments/assets/b8cadddf-a467-49cf-a1ed-8850de95366d)
-
-### Without P2P
+### Starting Workers

 To start workers for distributing the computational load, run:

@@ -71,27 +23,48 @@ To start workers for distributing the computational load, run:
 local-ai worker llama-cpp-rpc <listening_address> <listening_port>
 ```

-And you can specify the address of the workers when starting LocalAI with the `LLAMACPP_GRPC_SERVERS` environment variable:
+Alternatively, you can build the RPC server following the llama.cpp [README](https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md), which is compatible with LocalAI.
+
+### Starting LocalAI
+
+To start the LocalAI server, which handles API requests, specify the worker addresses using the `LLAMACPP_GRPC_SERVERS` environment variable:

 ```bash
 LLAMACPP_GRPC_SERVERS="address1:port,address2:port" local-ai run
 ```
+
 The workload on the LocalAI server will then be distributed across the specified nodes.

-Alternatively, you can build the RPC workers/server following the llama.cpp [README](https://github.com/ggerganov/llama.cpp/blob/master/examples/rpc/README.md), which is compatible with LocalAI.
+## Peer-to-Peer Networking

-## Manual example (worker)
+![output](https://github.com/mudler/LocalAI/assets/2420543/8ca277cf-c208-4562-8929-808b2324b584)

-Use the WebUI to guide you in the process of starting new workers. This example shows the manual steps to highlight the process.
+Workers can also connect to each other in a peer-to-peer network, distributing the workload in a decentralized manner.
+
+A shared token between the server and the workers is required for communication within the peer-to-peer network. This feature supports both local networks (using mDNS discovery) and communication across different networks (using DHT).
+
+The token is automatically generated when starting the server with the `--p2p` flag. To start a worker, run `local-ai worker p2p-llama-cpp-rpc` and pass the token either via the `TOKEN` environment variable or with the `--token` argument.
+
+A network is established between the server and workers using DHT and mDNS discovery protocols. The llama.cpp RPC server is automatically started and exposed to the peer-to-peer network, allowing the API server to connect.
+
+When the HTTP server starts, it discovers workers in the network and creates port forwards to the local service. Llama.cpp is configured to use these services. For more details on the implementation, refer to [LocalAI pull request #2343](https://github.com/mudler/LocalAI/pull/2343).
+
+### Usage

 1. Start the server with `--p2p`:

 ```bash
 ./local-ai run --p2p
-# Get the token in the Swarm section of the WebUI
+# 1:02AM INF loading environment variables from file envFile=.env
+# 1:02AM INF Setting logging to info
+# 1:02AM INF P2P mode enabled
+# 1:02AM INF No token provided, generating one
+# 1:02AM INF Generated Token:
+# XXXXXXXXXXX
+# 1:02AM INF Press a button to proceed
 ```

-Copy the token from the WebUI or via API call (e.g., `curl http://localhost:8000/p2p/token`) and save it for later use.
+Copy the displayed token and press Enter.

 To reuse the same token later, restart the server with `--p2ptoken` or `P2P_TOKEN`.

@@ -120,7 +93,11 @@ The server logs should indicate that new workers are being discovered.

 3. Start inference as usual on the server initiated in step 1.

-![output](https://github.com/mudler/LocalAI/assets/2420543/8ca277cf-c208-4562-8929-808b2324b584)
+## Notes
+
+- If running in p2p mode with container images, make sure you start the container with `--net host` or `network_mode: host` in the docker-compose file.
+- Only a single model is supported currently.
+- Ensure the server detects new workers before starting inference. Currently, additional workers cannot be added once inference has begun.


 ## Environment Variables
@@ -132,20 +109,3 @@ There are options that can be tweaked or parameters that can be set using enviro
 | **LOCALAI_P2P_DISABLE_DHT** | Set to "true" to disable DHT and enable p2p layer to be local only (mDNS) |
 | **LOCALAI_P2P_DISABLE_LIMITS** | Set to "true" to disable connection limits and resources management |
 | **LOCALAI_P2P_TOKEN** | Set the token for the p2p network |
-
-## Architecture
-
-LocalAI uses https://github.com/libp2p/go-libp2p under the hood, the same project powering IPFS. Differently from other frameworks, LocalAI uses peer2peer without a single master server, but rather it uses sub/gossip and ledger functionalities to achieve consensus across different peers. 
-
-[EdgeVPN](https://github.com/mudler/edgevpn) is used as a library to establish the network and expose the ledger functionality under a shared token to ease out automatic discovery and have separated, private peer2peer networks.
-
-The weights are split proportional to the memory when running into worker mode, when in federation mode each request is split to every node which have to load the model fully.
-
-## Notes
-
- If running in p2p mode with container images, make sure you start the container with `--net host` or `network_mode: host` in the docker-compose file.
- Only a single model is supported currently.
- Ensure the server detects new workers before starting inference. Currently, additional workers cannot be added once inference has begun.
- For more details on the implementation, refer to [LocalAI pull request #2343](https://github.com/mudler/LocalAI/pull/2343)
-
-
--- a/docs/data/version.json
+++ b/docs/data/version.json
@@ -1,3 +1,3 @@
 {
-  "version": "v2.19.1"
+  "version": "v2.18.1"
 }
--- a/docs/themes/hugo-theme-relearn
+++ b/docs/themes/hugo-theme-relearn
--- a/examples/chainlit/requirements.txt
+++ b/examples/chainlit/requirements.txt
@@ -1,6 +1,6 @@
-llama_index==0.10.56
+llama_index==0.10.55
 requests==2.32.3
-weaviate_client==4.6.7
+weaviate_client==4.6.5
 transformers
 torch
 chainlit
--- a/examples/functions/requirements.txt
+++ b/examples/functions/requirements.txt
@@ -1,2 +1,2 @@
-langchain==0.2.10
-openai==1.37.0
+langchain==0.2.8
+openai==1.35.13
--- a/examples/langchain-chroma/requirements.txt
+++ b/examples/langchain-chroma/requirements.txt
@@ -1,4 +1,4 @@
-langchain==0.2.10
-openai==1.37.0
+langchain==0.2.8
+openai==1.35.13
 chromadb==0.5.4
-llama-index==0.10.56
+llama-index==0.10.55
--- a/examples/langchain/langchainpy-localai-example/requirements.txt
+++ b/examples/langchain/langchainpy-localai-example/requirements.txt
@@ -10,21 +10,21 @@ debugpy==1.8.2
 frozenlist==1.4.1
 greenlet==3.0.3
 idna==3.7
-langchain==0.2.10
-langchain-community==0.2.9
+langchain==0.2.8
+langchain-community==0.2.7
 marshmallow==3.21.3
 marshmallow-enum==1.5.1
 multidict==6.0.5
 mypy-extensions==1.0.0
 numexpr==2.10.1
-numpy==2.0.1
-openai==1.37.0
+numpy==1.26.4
+openai==1.35.13
 openapi-schema-pydantic==1.2.4
 packaging>=23.2
 pydantic==2.8.2
 PyYAML==6.0.1
 requests==2.32.3
-SQLAlchemy==2.0.31
+SQLAlchemy==2.0.30
 tenacity==8.5.0
 tqdm==4.66.4
 typing-inspect==0.9.0
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -1,44 +1,4 @@
 ---
-## LLama3.1
- &llama31
-  url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
-  name: "meta-llama-3.1-8b-instruct"
-  license: llama3.1
-  description: |
-      The Meta Llama 3.1 collection of multilingual large language models (LLMs) is a collection of pretrained and instruction tuned generative models in 8B, 70B and 405B sizes (text in/text out). The Llama 3.1 instruction tuned text only models (8B, 70B, 405B) are optimized for multilingual dialogue use cases and outperform many of the available open source and closed chat models on common industry benchmarks.
-
-      Model developer: Meta
-
-      Model Architecture: Llama 3.1 is an auto-regressive language model that uses an optimized transformer architecture. The tuned versions use supervised fine-tuning (SFT) and reinforcement learning with human feedback (RLHF) to align with human preferences for helpfulness and safety.
-  urls:
-    - https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct
-    - https://huggingface.co/MaziyarPanahi/Meta-Llama-3.1-8B-Instruct-GGUF
-  tags:
-    - llm
-    - gguf
-    - gpu
-    - cpu
-    - llama3.1
-  overrides:
-    parameters:
-      model: Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf
-  files:
-    - filename: Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf
-      sha256: c2f17f44af962660d1ad4cb1af91a731f219f3b326c2b14441f9df1f347f2815
-      uri: huggingface://MaziyarPanahi/Meta-Llama-3.1-8B-Instruct-GGUF/Meta-Llama-3.1-8B-Instruct.Q4_K_M.gguf
- !!merge <<: *llama31
-  name: "meta-llama-3.1-70b-instruct"
-  urls:
-    - https://huggingface.co/meta-llama/Meta-Llama-3.1-70B-Instruct
-    - https://huggingface.co/MaziyarPanahi/Meta-Llama-3.1-70B-Instruct-GGUF
-  overrides:
-    parameters:
-      model: Meta-Llama-3.1-70B-Instruct.Q4_K_M.gguf
-  files:
-    - filename: Meta-Llama-3.1-70B-Instruct.Q4_K_M.gguf
-      sha256: 3f16ab17da4521fe3ed7c5d7beed960d3fe7b5b64421ee9650aa53d6b649ccab
-      uri: huggingface://MaziyarPanahi/Meta-Llama-3.1-70B-Instruct-GGUF/Meta-Llama-3.1-70B-Instruct.Q4_K_M.gguf
 ## Deepseek
 - &deepseek
  url: "github:mudler/LocalAI/gallery/deepseek.yaml@master"
@@ -64,33 +24,6 @@
    - filename: DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf
      sha256: 50ec78036433265965ed1afd0667c00c71c12aa70bcf383be462cb8e159db6c0
      uri: huggingface://LoneStriker/DeepSeek-Coder-V2-Lite-Instruct-GGUF/DeepSeek-Coder-V2-Lite-Instruct-Q4_K_M.gguf
- name: "archangel_sft_pythia2-8b"
-  url: "github:mudler/LocalAI/gallery/tuluv2.yaml@master"
-  icon: https://gist.github.com/assets/29318529/fe2d8391-dbd1-4b7e-9dc4-7cb97e55bc06
-  license: apache-2.0
-  urls:
-    - https://huggingface.co/ContextualAI/archangel_sft_pythia2-8b
-    - https://huggingface.co/RichardErkhov/ContextualAI_-_archangel_sft_pythia2-8b-gguf
-    - https://github.com/ContextualAI/HALOs
-  description: |
-    datasets:
-    - stanfordnlp/SHP
-    - Anthropic/hh-rlhf
-    - OpenAssistant/oasst1
-
-    This repo contains the model checkpoints for:
-    - model family pythia2-8b
-    - optimized with the loss SFT
-    - aligned using the SHP, Anthropic HH and Open Assistant datasets.
-
-    Please refer to our [code repository](https://github.com/ContextualAI/HALOs) or [blog](https://contextual.ai/better-cheaper-faster-llm-alignment-with-kto/) which contains intructions for training your own HALOs and links to our model cards.
-  overrides:
-    parameters:
-      model: archangel_sft_pythia2-8b.Q4_K_M.gguf
-  files:
-    - filename: archangel_sft_pythia2-8b.Q4_K_M.gguf
-      sha256: a47782c55ef2b39b19644213720a599d9849511a73c9ebb0c1de749383c0a0f8
-      uri: huggingface://RichardErkhov/ContextualAI_-_archangel_sft_pythia2-8b-gguf/archangel_sft_pythia2-8b.Q4_K_M.gguf
 - &qwen2
  ## Start QWEN2
  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
@@ -269,54 +202,6 @@
    - filename: Qwen2-7B-Instruct-v0.8.Q4_K_M.gguf
      sha256: 8c1b3efe9fa6ae1b37942ef26473cb4e0aed0f8038b60d4b61e5bffb61e49b7e
      uri: huggingface://MaziyarPanahi/Qwen2-7B-Instruct-v0.8-GGUF/Qwen2-7B-Instruct-v0.8.Q4_K_M.gguf
- !!merge <<: *qwen2
-  name: "qwen2-wukong-7b"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/655dc641accde1bbc8b41aec/xOe1Nb3S9Nb53us7_Ja3s.jpeg
-  urls:
-    - https://huggingface.co/bartowski/Qwen2-Wukong-7B-GGUF
-  description: |
-    Qwen2-Wukong-7B is a dealigned chat finetune of the original fantastic Qwen2-7B model by the Qwen team.
-
-    This model was trained on the teknium OpenHeremes-2.5 dataset and some supplementary datasets from Cognitive Computations
-
-    This model was trained for 3 epochs with a custom FA2 implementation for AMD cards.
-  overrides:
-    parameters:
-      model: Qwen2-Wukong-7B-Q4_K_M.gguf
-  files:
-    - filename: Qwen2-Wukong-7B-Q4_K_M.gguf
-      sha256: 6b8ca6649c33fc84d4892ebcff1214f0b34697aced784f0d6d32e284a15943ad
-      uri: huggingface://bartowski/Qwen2-Wukong-7B-GGUF/Qwen2-Wukong-7B-Q4_K_M.gguf
- !!merge <<: *qwen2
-  name: "calme-2.8-qwen2-7b"
-  icon: https://huggingface.co/MaziyarPanahi/calme-2.8-qwen2-7b/resolve/main/qwen2-fine-tunes-maziyar-panahi.webp
-  urls:
-    - https://huggingface.co/MaziyarPanahi/calme-2.8-qwen2-7b
-    - https://huggingface.co/MaziyarPanahi/calme-2.8-qwen2-7b-GGUF
-  description: |
-    This is a fine-tuned version of the Qwen/Qwen2-7B model. It aims to improve the base model across all benchmarks.
-  overrides:
-    parameters:
-      model: Qwen2-7B-Instruct-v0.8.Q4_K_M.gguf
-  files:
-    - filename: Qwen2-7B-Instruct-v0.8.Q4_K_M.gguf
-      sha256: 8c1b3efe9fa6ae1b37942ef26473cb4e0aed0f8038b60d4b61e5bffb61e49b7e
-      uri: huggingface://MaziyarPanahi/calme-2.8-qwen2-7b-GGUF/Qwen2-7B-Instruct-v0.8.Q4_K_M.gguf
- !!merge <<: *qwen2
-  name: "stellardong-72b-i1"
-  icon: https://huggingface.co/smelborp/StellarDong-72b/resolve/main/stellardong.png
-  urls:
-    - https://huggingface.co/smelborp/StellarDong-72b
-    - https://huggingface.co/mradermacher/StellarDong-72b-i1-GGUF
-  description: |
-    Magnum + Nova = you won't believe how stellar this dong is!!
-  overrides:
-    parameters:
-      model: StellarDong-72b.i1-Q4_K_M.gguf
-  files:
-    - filename: StellarDong-72b.i1-Q4_K_M.gguf
-      sha256: 4c5012f0a034f40a044904891343ade2594f29c28a8a9d8052916de4dc5a61df
-      uri: huggingface://mradermacher/StellarDong-72b-i1-GGUF/StellarDong-72b.i1-Q4_K_M.gguf
 - &mistral03
  ## START Mistral
  url: "github:mudler/LocalAI/gallery/mistral-0.3.yaml@master"
@@ -379,31 +264,6 @@
    - filename: Mahou-1.3d-mistral-7B.i1-Q4_K_M.gguf
      sha256: 8272f050e36d612ab282e095cb4e775e2c818e7096f8d522314d256923ef6da9
      uri: huggingface://mradermacher/Mahou-1.3d-mistral-7B-i1-GGUF/Mahou-1.3d-mistral-7B.i1-Q4_K_M.gguf
- name: "einstein-v4-7b"
-  url: "github:mudler/LocalAI/gallery/chatml.yaml@master"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/6468ce47e134d050a58aa89c/U0zyXVGj-O8a7KP3BvPue.png
-  urls:
-    - https://huggingface.co/Weyaxi/Einstein-v4-7B
-    - https://huggingface.co/mradermacher/Einstein-v4-7B-GGUF
-  tags:
-    - llm
-    - gguf
-    - gpu
-    - mistral
-    - cpu
-  description: |
-    🔬 Einstein-v4-7B
-
-    This model is a full fine-tuned version of mistralai/Mistral-7B-v0.1 on diverse datasets.
-
-    This model is finetuned using 7xRTX3090 + 1xRTXA6000 using axolotl.
-  overrides:
-    parameters:
-      model: Einstein-v4-7B.Q4_K_M.gguf
-  files:
-    - filename: Einstein-v4-7B.Q4_K_M.gguf
-      sha256: 78bd573de2a9eb3c6e213132858164e821145f374fcaa4b19dfd6502c05d990d
-      uri: huggingface://mradermacher/Einstein-v4-7B-GGUF/Einstein-v4-7B.Q4_K_M.gguf
 - &mudler
  ### START mudler's LocalAI specific-models
  url: "github:mudler/LocalAI/gallery/mudler.yaml@master"
@@ -734,76 +594,6 @@
    - filename: Big-Tiger-Gemma-27B-v1c-Q4_K_M.gguf
      sha256: c5fc5605d36ae280c1c908c9b4bcb12b28abbe2692f317edeb83ab1104657fe5
      uri: huggingface://TheDrummer/Big-Tiger-Gemma-27B-v1-GGUF/Big-Tiger-Gemma-27B-v1c-Q4_K_M.gguf
- !!merge <<: *gemma
-  name: "gemma-2b-translation-v0.150"
-  urls:
-    - https://huggingface.co/lemon-mint/gemma-2b-translation-v0.150
-    - https://huggingface.co/RichardErkhov/lemon-mint_-_gemma-2b-translation-v0.150-gguf
-  description: |
-    Original model: lemon-mint/gemma-ko-1.1-2b-it
-    Evaluation metrics: Eval Loss, Train Loss, lr, optimizer, lr_scheduler_type.
-    Prompt Template:
-    <bos><start_of_turn>user
-    Translate into Korean: [input text]<end_of_turn>
-    <start_of_turn>model
-    [translated text in Korean]<eos>
-    <bos><start_of_turn>user
-    Translate into English: [Korean text]<end_of_turn>
-    <start_of_turn>model
-    [translated text in English]<eos>
-    Model features:
-    * Developed by: lemon-mint
-    * Model type: Gemma
-    * Languages (NLP): English
-    * License: Gemma Terms of Use
-    * Finetuned from model: lemon-mint/gemma-ko-1.1-2b-it
-  overrides:
-    parameters:
-      model: gemma-2b-translation-v0.150.Q4_K_M.gguf
-  files:
-    - filename: gemma-2b-translation-v0.150.Q4_K_M.gguf
-      sha256: dcde67b83168d2e7ca835cf9a7a4dcf38b41b9cefe3cbc997c71d2741c08cd25
-      uri: huggingface://RichardErkhov/lemon-mint_-_gemma-2b-translation-v0.150-gguf/gemma-2b-translation-v0.150.Q4_K_M.gguf
- !!merge <<: *gemma
-  name: "emo-2b"
-  urls:
-    - https://huggingface.co/OEvortex/EMO-2B
-    - https://huggingface.co/RichardErkhov/OEvortex_-_EMO-2B-gguf
-  description: |
-    EMO-2B: Emotionally Intelligent Conversational AI
-
-    Overview:
-    EMO-2B is a state-of-the-art conversational AI model with 2.5 billion parameters, designed to engage in emotionally resonant dialogue. Building upon the success of EMO-1.5B, this model has been further fine-tuned on an extensive corpus of emotional narratives, enabling it to perceive and respond to the emotional undertones of user inputs with exceptional empathy and emotional intelligence.
-
-    Key Features:
-
-    - Advanced Emotional Intelligence: With its increased capacity, EMO-2B demonstrates an even deeper understanding and generation of emotional language, allowing for more nuanced and contextually appropriate emotional responses.
-    - Enhanced Contextual Awareness: The model considers an even broader context within conversations, accounting for subtle emotional cues and providing emotionally resonant responses tailored to the specific situation.
-    - Empathetic and Supportive Dialogue: EMO-2B excels at active listening, validating emotions, offering compassionate advice, and providing emotional support, making it an ideal companion for users seeking empathy and understanding.
-    - Dynamic Persona Adaptation: The model can dynamically adapt its persona, communication style, and emotional responses to match the user's emotional state, ensuring a highly personalized and tailored conversational experience.
-
-    Use Cases:
-
-    EMO-2B is well-suited for a variety of applications where emotional intelligence and empathetic communication are crucial, such as:
-
-    - Mental health support chatbots
-    - Emotional support companions
-    - Personalized coaching and motivation
-    - Narrative storytelling and interactive fiction
-    - Customer service and support (for emotionally sensitive contexts)
-
-    Limitations and Ethical Considerations:
-
-    While EMO-2B is designed to provide emotionally intelligent and empathetic responses, it is important to note that it is an AI system and cannot replicate the depth and nuance of human emotional intelligence. Users should be aware that the model's responses, while emotionally supportive, should not be considered a substitute for professional mental health support or counseling.
-
-    Additionally, as with any language model, EMO-2B may reflect biases present in its training data. Users should exercise caution and critical thinking when interacting with the model, and report any concerning or inappropriate responses.
-  overrides:
-    parameters:
-      model: EMO-2B.Q4_K_M.gguf
-  files:
-    - filename: EMO-2B.Q4_K_M.gguf
-      sha256: 608bffc0e9012bc7f9a94b714f4932e2826cc122dbac59b586e4baa2ee0fdca5
-      uri: huggingface://RichardErkhov/OEvortex_-_EMO-2B-gguf/EMO-2B.Q4_K_M.gguf
 - &llama3
  url: "github:mudler/LocalAI/gallery/llama3-instruct.yaml@master"
  icon: https://cdn-uploads.huggingface.co/production/uploads/642cc1c253e76b4c2286c58e/aJJxKus1wP5N-euvHEUq7.png
@@ -3226,106 +3016,6 @@
    - filename: L3-15B-EtherealMaid-t0.0001.i1-Q4_K_M.gguf
      sha256: 2911be6be8e0fd4184998d452410ba847491b4ab71a928749de87cafb0e13757
      uri: huggingface://mradermacher/L3-15B-EtherealMaid-t0.0001-i1-GGUF/L3-15B-EtherealMaid-t0.0001.i1-Q4_K_M.gguf
- !!merge <<: *llama3
-  name: "l3-8b-celeste-v1"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/630cf5d14ca0a22768bbe10c/Zv__LDTO-nHvpuxPcCgUU.webp
-  urls:
-    - https://huggingface.co/nothingiisreal/L3-8B-Celeste-v1
-    - https://huggingface.co/bartowski/L3-8B-Celeste-v1-GGUF
-  description: |
-    Trained on LLaMA 3 8B Instruct at 8K context using Reddit Writing Prompts, Opus 15K Instruct an c2 logs cleaned.
-
-    This is a roleplay model any instruction following capabilities outside roleplay contexts are coincidental.
-  overrides:
-    parameters:
-      model: L3-8B-Celeste-v1-Q4_K_M.gguf
-  files:
-    - filename: L3-8B-Celeste-v1-Q4_K_M.gguf
-      sha256: ed5277719965fb6bbcce7d16742e3bac4a8d5b8f52133261a3402a480cd65317
-      uri: huggingface://bartowski/L3-8B-Celeste-v1-GGUF/L3-8B-Celeste-v1-Q4_K_M.gguf
- !!merge <<: *llama3
-  name: "l3-8b-celeste-v1.2"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/630cf5d14ca0a22768bbe10c/Zv__LDTO-nHvpuxPcCgUU.webp
-  urls:
-    - https://huggingface.co/mudler/L3-8B-Celeste-V1.2-Q4_K_M-GGUF
-  description: |
-    Trained on LLaMA 3 8B Instruct at 8K context using Reddit Writing Prompts, Opus 15K Instruct an c2 logs cleaned.
-
-    This is a roleplay model any instruction following capabilities outside roleplay contexts are coincidental.
-  overrides:
-    parameters:
-      model: l3-8b-celeste-v1.2-q4_k_m.gguf
-  files:
-    - filename: l3-8b-celeste-v1.2-q4_k_m.gguf
-      sha256: 7752204c0e9f627ff5726eb69bb6114974cafbc934a993ad019abfba62002783
-      uri: huggingface://mudler/L3-8B-Celeste-V1.2-Q4_K_M-GGUF/l3-8b-celeste-v1.2-q4_k_m.gguf
- !!merge <<: *llama3
-  name: "llama-3-tulu-2-8b-i1"
-  icon: https://huggingface.co/datasets/allenai/blog-images/resolve/main/tulu-v2/Tulu%20V2%20banner.png
-  urls:
-    - https://huggingface.co/allenai/llama-3-tulu-2-8b
-    - https://huggingface.co/mradermacher/llama-3-tulu-2-8b-i1-GGUF
-  description: |
-    Tulu is a series of language models that are trained to act as helpful assistants. Llama 3 Tulu V2 8B is a fine-tuned version of Llama 3 that was trained on a mix of publicly available, synthetic and human datasets.
-  overrides:
-    parameters:
-      model: llama-3-tulu-2-8b.i1-Q4_K_M.gguf
-  files:
-    - filename: llama-3-tulu-2-8b.i1-Q4_K_M.gguf
-      sha256: f859c22bfa64f461e9ffd973dc7ad6a78bb98b1dda6f49abfa416a4022b7e333
-      uri: huggingface://mradermacher/llama-3-tulu-2-8b-i1-GGUF/llama-3-tulu-2-8b.i1-Q4_K_M.gguf
- !!merge <<: *llama3
-  name: "llama-3-tulu-2-dpo-70b-i1"
-  icon: https://huggingface.co/datasets/allenai/blog-images/resolve/main/tulu-v2/Tulu%20V2%20banner.png
-  urls:
-    - https://huggingface.co/allenai/llama-3-tulu-2-dpo-70b
-    - https://huggingface.co/mradermacher/llama-3-tulu-2-dpo-70b-i1-GGUF
-  description: |
-    Tulu is a series of language models that are trained to act as helpful assistants. Llama 3 Tulu V2 8B is a fine-tuned version of Llama 3 that was trained on a mix of publicly available, synthetic and human datasets.
-  overrides:
-    parameters:
-      model: llama-3-tulu-2-dpo-70b.i1-Q4_K_M.gguf
-  files:
-    - filename: llama-3-tulu-2-dpo-70b.i1-Q4_K_M.gguf
-      sha256: fc309bbdf1e2bdced954c4c8dc1f9a885c547017ee5e750bfde645af89e3d3a5
-      uri: huggingface://mradermacher/llama-3-tulu-2-dpo-70b-i1-GGUF/llama-3-tulu-2-dpo-70b.i1-Q4_K_M.gguf
- !!merge <<: *llama3
-  license: cc-by-nc-4.0
-  name: "suzume-llama-3-8b-multilingual-orpo-borda-top25"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/64b63f8ad57e02621dc93c8b/kWQSu02YfgYdUQqv4s5lq.png
-  urls:
-    - https://huggingface.co/lightblue/suzume-llama-3-8B-multilingual-orpo-borda-top25
-    - https://huggingface.co/RichardErkhov/lightblue_-_suzume-llama-3-8B-multilingual-orpo-borda-top25-gguf
-  description: |
-    This is Suzume ORPO, an ORPO trained fine-tune of the lightblue/suzume-llama-3-8B-multilingual model using our lightblue/mitsu dataset.
-
-    We have trained several versions of this model using ORPO and so recommend that you use the best performing model from our tests, lightblue/suzume-llama-3-8B-multilingual-orpo-borda-half.
-
-    Note that this model has a non-commerical license as we used the Command R and Command R+ models to generate our training data for this model (lightblue/mitsu).
-
-    We are currently working on a developing a commerically usable model, so stay tuned for that!
-  overrides:
-    parameters:
-      model: suzume-llama-3-8B-multilingual-orpo-borda-top25.Q4_K_M.gguf
-  files:
-    - filename: suzume-llama-3-8B-multilingual-orpo-borda-top25.Q4_K_M.gguf
-      sha256: ef75a02c5f38e14a8873c7989188dac6974851b4654279fe1921d2c8018cc388
-      uri: huggingface://RichardErkhov/lightblue_-_suzume-llama-3-8B-multilingual-orpo-borda-top25-gguf/suzume-llama-3-8B-multilingual-orpo-borda-top25.Q4_K_M.gguf
- !!merge <<: *llama3
-  name: "calme-2.4-llama3-70b"
-  icon: https://huggingface.co/MaziyarPanahi/calme-2.4-llama3-70b/resolve/main/llama-3-merges.webp
-  urls:
-    - https://huggingface.co/MaziyarPanahi/calme-2.4-llama3-70b
-    - https://huggingface.co/mradermacher/calme-2.4-llama3-70b-GGUF
-  description: |
-    This model is a fine-tune (DPO) of meta-llama/Meta-Llama-3-70B-Instruct model.
-  overrides:
-    parameters:
-      model: calme-2.4-llama3-70b.Q4_K_M.gguf
-  files:
-    - filename: calme-2.4-llama3-70b.Q4_K_M.gguf
-      sha256: 0b44ac8a88395dfc60f1b9d3cfffc0ffef74ec0a302e610ef91fc787187568f2
-      uri: huggingface://mradermacher/calme-2.4-llama3-70b-GGUF/calme-2.4-llama3-70b.Q4_K_M.gguf
 - &command-R
  ### START Command-r
  url: "github:mudler/LocalAI/gallery/command-r.yaml@master"
@@ -3570,38 +3260,6 @@
    - filename: Phi-3.1-mini-4k-instruct-Q4_K_M.gguf
      sha256: 39458b227a4be763b7eb39d306d240c3d45205e3f8b474ec7bdca7bba0158e69
      uri: huggingface://bartowski/Phi-3.1-mini-4k-instruct-GGUF/Phi-3.1-mini-4k-instruct-Q4_K_M.gguf
- !!merge <<: *phi-3
-  name: "phillama-3.8b-v0.1"
-  icon: https://cdn-uploads.huggingface.co/production/uploads/657eb5b256c9c67605a6e8b5/f96pPiJQb3puzbPYNknG2.png
-  urls:
-    - https://huggingface.co/RichardErkhov/raincandy-u_-_phillama-3.8b-v0.1-gguf
-  description: |
-    The description of the LLM model is:
-    Phillama is a model based on Phi-3-mini and trained on Llama-generated dataset raincandy-u/Dextromethorphan-10k to make it more "llama-like". Also, this model is converted into Llama format, so it will work with any Llama-2/3 workflow. The model aims to generate text with a specific "llama-like" style and is suited for text-generation tasks.
-  overrides:
-    parameters:
-      model: phillama-3.8b-v0.1.Q4_K_M.gguf
-  files:
-    - filename: phillama-3.8b-v0.1.Q4_K_M.gguf
-      sha256: da537d352b7aae54bbad0d2cff3e3a1b0e1dc1e1d25bec3aae1d05cf4faee7a2
-      uri: huggingface://RichardErkhov/raincandy-u_-_phillama-3.8b-v0.1-gguf/phillama-3.8b-v0.1.Q4_K_M.gguf
- !!merge <<: *llama3
-  name: "calme-2.3-phi3-4b"
-  icon: https://huggingface.co/MaziyarPanahi/calme-2.1-phi3-4b/resolve/main/phi-3-instruct.webp
-  urls:
-    - https://huggingface.co/MaziyarPanahi/calme-2.3-phi3-4b
-    - https://huggingface.co/MaziyarPanahi/calme-2.3-phi3-4b-GGUF
-  description: |
-    MaziyarPanahi/calme-2.1-phi3-4b
-
-    This model is a fine-tune (DPO) of microsoft/Phi-3-mini-4k-instruct model.
-  overrides:
-    parameters:
-      model: Phi-3-mini-4k-instruct-v0.3.Q4_K_M.gguf
-  files:
-    - filename: Phi-3-mini-4k-instruct-v0.3.Q4_K_M.gguf
-      sha256: 3a23e1052369c080afb925882bd814cbea5ec859894655a7434c3d49e43a6127
-      uri: huggingface://MaziyarPanahi/calme-2.3-phi3-4b-GGUF/Phi-3-mini-4k-instruct-v0.3.Q4_K_M.gguf
 - &hermes-2-pro-mistral
  ### START Hermes
  url: "github:mudler/LocalAI/gallery/hermes-2-pro-mistral.yaml@master"
--- a/gallery/mudler.yaml
+++ b/gallery/mudler.yaml
@@ -10,7 +10,8 @@ config_file: |-
    - <|end_of_text|>

  function:
-    return_name_in_function_response: true
+    grammar:
+      enable: true

  template:
    chat: |
--- a/gallery/tuluv2.yaml
+++ b/gallery/tuluv2.yaml
@@ -1,43 +0,0 @@
---
-name: "tuluv2"
-
-config_file: |
-  mmap: true
-  template:
-    chat_message: |
-      <|{{ .RoleName }}|>
-      {{ if .FunctionCall -}}
-      Function call:
-      {{ else if eq .RoleName "tool" -}}
-      Function response:
-      {{ end -}}
-      {{ if .Content -}}
-      {{.Content }}
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
-      {{ end -}}
-    function: |
-      <|{{ .RoleName }}|>
-      {{ if .FunctionCall -}}
-      Function call:
-      {{ else if eq .RoleName "tool" -}}
-      Function response:
-      {{ end -}}
-      {{ if .Content -}}
-      {{.Content }}
-      {{ end -}}
-      {{ if .FunctionCall -}}
-      {{toJson .FunctionCall}}
-      {{ end -}}
-    chat: |
-      {{.Input -}}
-      <|assistant|>
-    completion: |
-      {{.Input}}
-  context_size: 4096
-  f16: true
-  stopwords:
-  - '<|im_end|>'
-  - '<dummy32000>'
-  - '<|endoftext|>'
--- a/pkg/functions/bnf_rules.go
+++ b/pkg/functions/bnf_rules.go
@@ -1,47 +0,0 @@
-package functions
-
-import "regexp"
-
-var (
-	PRIMITIVE_RULES = map[string]string{
-		"boolean": `("true" | "false") space`,
-		"number":  `("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space`,
-		"integer": `("-"? ([0-9] | [1-9] [0-9]*)) space`,
-		"string": `"\"" (
-			[^"\\] |
-			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-		  )* "\"" space`,
-		// TODO: we shouldn't forbid \" and \\ or all unicode and have this branch here,
-		// however, if we don't have it, the grammar will be ambiguous and
-		// empirically results are way worse.
-		"freestring": `(
-			[^\x00] |
-			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
-		  )* space`,
-		"null": `"null" space`,
-	}
-
-	INVALID_RULE_CHARS_RE     = regexp.MustCompile(`[^a-zA-Z0-9-]+`)
-	GRAMMAR_LITERAL_ESCAPE_RE = regexp.MustCompile(`[\r\n"]`)
-	GRAMMAR_LITERAL_ESCAPES   = map[string]string{
-		"\r": `\r`,
-		"\n": `\n`,
-		`"`:  `\"`,
-	}
-)
-
-const (
-	SPACE_RULE = `" "?`
-
-	arrayNewLines = `arr  ::=
-  "[\n"  (
-		realvalue
-    (",\n"  realvalue)*
-  )? "]"`
-
-	array = `arr  ::=
-  "["  (
-		realvalue
-    (","  realvalue)*
-  )? "]"`
-)
--- a/pkg/functions/function_structure.go
+++ b/pkg/functions/function_structure.go
@@ -1,22 +0,0 @@
-package functions
-
-import "encoding/json"
-
-type Item struct {
-	Type       string                 `json:"type"`
-	Properties map[string]interface{} `json:"properties"`
-}
-
-type JSONFunctionStructure struct {
-	OneOf []Item                 `json:"oneOf,omitempty"`
-	AnyOf []Item                 `json:"anyOf,omitempty"`
-	Defs  map[string]interface{} `json:"$defs,omitempty"`
-}
-
-func (j JSONFunctionStructure) Grammar(options ...func(*GrammarOption)) string {
-	grammarOpts := &GrammarOption{}
-	grammarOpts.Apply(options...)
-
-	dat, _ := json.Marshal(j)
-	return NewJSONSchemaConverter(grammarOpts.PropOrder).GrammarFromBytes(dat, options...)
-}
--- a/pkg/functions/functions.go
+++ b/pkg/functions/functions.go
@@ -18,15 +18,6 @@ type Function struct {
 }
 type Functions []Function

-type FunctionName struct {
-	Const string `json:"const"`
-}
-
-type Argument struct {
-	Type       string                 `json:"type"`
-	Properties map[string]interface{} `json:"properties"`
-}
-
 type Tool struct {
 	Type     string   `json:"type"`
 	Function Function `json:"function,omitempty"`
@@ -95,8 +86,3 @@ func (f Functions) Select(name string) Functions {

 	return funcs
 }
-
-func jsonString(v interface{}) string {
-	b, _ := json.Marshal(v)
-	return string(b)
-}
--- a/pkg/functions/functions_suite_test.go
+++ b/pkg/functions/functions_suite_test.go
@@ -1,10 +1,8 @@
-package functions_test
+package functions

 import (
 	"testing"

-	. "github.com/mudler/LocalAI/pkg/functions"
-
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 )
@@ -13,13 +11,3 @@ func TestGrammar(t *testing.T) {
 	RegisterFailHandler(Fail)
 	RunSpecs(t, "Grammar test suite")
 }
-
-func createFunction(field1 string, field2 string, name string, properties map[string]interface{}) map[string]interface{} {
-	property := map[string]interface{}{}
-	property[field1] = FunctionName{Const: name}
-	property[field2] = Argument{
-		Type:       "object",
-		Properties: properties,
-	}
-	return property
-}
--- a/pkg/functions/grammar_json_schema.go
+++ b/pkg/functions/grammar_json_schema.go
@@ -5,12 +5,70 @@ package functions
 import (
 	"encoding/json"
 	"fmt"
+	"regexp"
 	"sort"
 	"strings"

 	"github.com/mudler/LocalAI/pkg/utils"
 )

+const (
+	JSONBNF = `root   ::= object
+value  ::= object | array | string | number | ("true" | "false" | "null") ws
+
+object ::=
+  "{" ws (
+            string ":" ws value
+    ("," ws string ":" ws value)*
+  )? "}" ws
+
+array  ::=
+  "[" ws (
+            value
+    ("," ws value)*
+  )? "]" ws
+
+string ::=
+  "\"" (
+    [^"\\] |
+    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
+  )* "\"" ws
+
+number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
+
+ws ::= ([ \t\n] ws)?`
+)
+
+var (
+	SPACE_RULE = `" "?`
+
+	PRIMITIVE_RULES = map[string]string{
+		"boolean": `("true" | "false") space`,
+		"number":  `("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? space`,
+		"integer": `("-"? ([0-9] | [1-9] [0-9]*)) space`,
+		"string": `"\"" (
+			[^"\\] |
+			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+		  )* "\"" space`,
+		// TODO: we shouldn't forbid \" and \\ or all unicode and have this branch here,
+		// however, if we don't have it, the grammar will be ambiguous and
+		// empirically results are way worse.
+		"freestring": `(
+			[^\x00] |
+			"\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F])
+		  )* space`,
+		"null": `"null" space`,
+	}
+
+	INVALID_RULE_CHARS_RE     = regexp.MustCompile(`[^a-zA-Z0-9-]+`)
+	GRAMMAR_LITERAL_ESCAPE_RE = regexp.MustCompile(`[\r\n"]`)
+	GRAMMAR_LITERAL_ESCAPES   = map[string]string{
+		"\r": `\r`,
+		"\n": `\n`,
+		`"`:  `\"`,
+	}
+)
+
 type JSONSchemaConverter struct {
 	propOrder map[string]int
 	rules     map[string]string
@@ -56,6 +114,18 @@ func (sc *JSONSchemaConverter) addRule(name, rule string) string {
 	return key
 }

+const arrayNewLines = `arr  ::=
+  "[\n"  (
+		realvalue
+    (",\n"  realvalue)*
+  )? "]"`
+
+const array = `arr  ::=
+  "["  (
+		realvalue
+    (","  realvalue)*
+  )? "]"`
+
 func (sc *JSONSchemaConverter) finalizeGrammar(options ...func(*GrammarOption)) string {

 	grammarOpts := &GrammarOption{}
@@ -273,3 +343,36 @@ func (sc *JSONSchemaConverter) GrammarFromBytes(b []byte, options ...func(*Gramm
 	_ = json.Unmarshal(b, &schema)
 	return sc.Grammar(schema, options...)
 }
+
+func jsonString(v interface{}) string {
+	b, _ := json.Marshal(v)
+	return string(b)
+}
+
+type FunctionName struct {
+	Const string `json:"const"`
+}
+
+type Argument struct {
+	Type       string                 `json:"type"`
+	Properties map[string]interface{} `json:"properties"`
+}
+
+type Item struct {
+	Type       string                 `json:"type"`
+	Properties map[string]interface{} `json:"properties"`
+}
+
+type JSONFunctionStructure struct {
+	OneOf []Item                 `json:"oneOf,omitempty"`
+	AnyOf []Item                 `json:"anyOf,omitempty"`
+	Defs  map[string]interface{} `json:"$defs,omitempty"`
+}
+
+func (j JSONFunctionStructure) Grammar(options ...func(*GrammarOption)) string {
+	grammarOpts := &GrammarOption{}
+	grammarOpts.Apply(options...)
+
+	dat, _ := json.Marshal(j)
+	return NewJSONSchemaConverter(grammarOpts.PropOrder).GrammarFromBytes(dat, options...)
+}
--- a/pkg/functions/grammar_json_schema_test.go
+++ b/pkg/functions/grammar_json_schema_test.go
@@ -9,6 +9,16 @@ import (
 	. "github.com/onsi/gomega"
 )

+func createFunction(field1 string, field2 string, name string, properties map[string]interface{}) map[string]interface{} {
+	property := map[string]interface{}{}
+	property[field1] = FunctionName{Const: name}
+	property[field2] = Argument{
+		Type:       "object",
+		Properties: properties,
+	}
+	return property
+}
+
 var testFunctions = []Item{
 	{
 		Type: "object",
--- a/pkg/functions/json_mode.go
+++ b/pkg/functions/json_mode.go
@@ -1,28 +0,0 @@
-package functions
-
-const (
-	JSONBNF = `root   ::= object
-value  ::= object | array | string | number | ("true" | "false" | "null") ws
-
-object ::=
-  "{" ws (
-            string ":" ws value
-    ("," ws string ":" ws value)*
-  )? "}" ws
-
-array  ::=
-  "[" ws (
-            value
-    ("," ws value)*
-  )? "]" ws
-
-string ::=
-  "\"" (
-    [^"\\] |
-    "\\" (["\\/bfnrt] | "u" [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]) # escapes
-  )* "\"" ws
-
-number ::= ("-"? ([0-9] | [1-9] [0-9]*)) ("." [0-9]+)? ([eE] [-+]? [0-9]+)? ws
-
-ws ::= ([ \t\n] ws)?`
-)
--- a/pkg/functions/parse.go
+++ b/pkg/functions/parse.go
@@ -25,8 +25,8 @@ type GrammarConfig struct {
 	// In this way if the LLM selects a free string, it won't be mixed necessarly with JSON objects
 	NoMixedFreeString bool `yaml:"no_mixed_free_string"`

-	// NoGrammar disables the grammar parsing and parses the responses directly from the LLM
-	NoGrammar bool `yaml:"disable"`
+	// EnableGrammar enables grammar-constrained generation; when false (the default) responses are parsed directly from the LLM
+	EnableGrammar bool `yaml:"enable"`

 	// Prefix is the suffix to append to the grammar when being generated
 	// This is useful when models prepend a tag before returning JSON
--- a/pkg/model/initializers.go
+++ b/pkg/model/initializers.go
@@ -212,7 +212,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 					grpcProcess = p
 					foundCUDA = true
 				} else {
-					log.Debug().Msgf("Nvidia GPU device found, no embedded CUDA variant found. You can ignore this message if you are using container with CUDA support")
+					log.Info().Msgf("GPU device found but no CUDA backend present")
 				}
 			}
 			if strings.Contains(gpu.String(), "amd") {
@@ -222,7 +222,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 					grpcProcess = p
 					foundAMDGPU = true
 				} else {
-					log.Debug().Msgf("AMD GPU device found, no embedded HIPBLAS variant found. You can ignore this message if you are using container with HIPBLAS support")
+					log.Info().Msgf("GPU device found but no HIPBLAS backend present")
 				}
 			}
 			if strings.Contains(gpu.String(), "intel") {
@@ -236,7 +236,7 @@ func selectGRPCProcess(backend, assetDir string, f16 bool) string {
 					grpcProcess = p
 					foundIntelGPU = true
 				} else {
-					log.Debug().Msgf("Intel GPU device found, no embedded SYCL variant found. You can ignore this message if you are using container with SYCL support")
+					log.Info().Msgf("GPU device found but no Intel backend present")
 				}
 			}
 		}
Author	SHA1	Message	Date
Ettore Di Giacinto	4ebf3c7ac4	Merge branch 'master' into disable_grammar_by_default	2024-07-19 09:28:00 +02:00
Ettore Di Giacinto	2a96232f99	feat(grammar): mark grammar disabled by default Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2024-07-19 09:07:29 +02:00