Compare commits

..

28 Commits

Author SHA1 Message Date
Ettore Di Giacinto
8fea82e68b wire to grpc
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 20:22:31 +02:00
Ettore Di Giacinto
01e2e3dbc3 wip reranking llama.cpp
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 19:52:02 +02:00
Ettore Di Giacinto
61cc76c455 chore(autogptq): drop archived backend (#5214)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 15:52:29 +02:00
Ettore Di Giacinto
8abecb4a18 chore: bump grpc limits to 50MB (#5212)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-19 08:53:24 +02:00
LocalAI [bot]
8b3f76d8e6 chore: ⬆️ Update ggml-org/llama.cpp to 6408210082cc0a61b992b487be7e2ff2efbb9e36 (#5211)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-18 21:45:48 +00:00
Ettore Di Giacinto
4e0497f1a6 chore(model gallery): add pictor-1338-qwenp-1.5b (#5208)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:47:23 +02:00
Ettore Di Giacinto
ba88c9f451 chore(ci): use gemma-3-12b-it for models notifications (twitter)
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:38:36 +02:00
Ettore Di Giacinto
a598285825 chore(model gallery): add google-gemma-3-27b-it-qat-q4_0-small (#5207)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:35:48 +02:00
Ettore Di Giacinto
cb7a172897 chore(ci): use gemma-3-12b-it for models notifications
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:20:33 +02:00
Ettore Di Giacinto
771be28dfb ci: use gemma3 for notifications of releases
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-18 10:19:52 +02:00
Ettore Di Giacinto
7d6b3eb42d chore(model gallery): add readyart_amoral-fallen-omega-gemma3-12b (#5206)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:17:39 +02:00
Ettore Di Giacinto
0bb33fab55 chore(model gallery): add ibm-granite_granite-3.3-2b-instruct (#5205)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 10:15:05 +02:00
Ettore Di Giacinto
e3bf7f77f7 chore(model gallery): add ibm-granite_granite-3.3-8b-instruct (#5204)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-18 09:59:17 +02:00
LocalAI [bot]
bd1707d339 chore: ⬆️ Update ggml-org/llama.cpp to 2f74c354c0f752ed9aabf7d3a350e6edebd7e744 (#5203)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-17 21:52:12 +00:00
Ettore Di Giacinto
0474804541 fix(ci): remove duplicate entry
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 19:51:21 +02:00
Ettore Di Giacinto
72693b3917 feat(install.sh): allow to uninstall with --uninstall (#5202)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 16:32:23 +02:00
Florian Bachmann
a03b70010f fix(talk): Talk interface sends content-type headers to chatgpt (#5200)
Talk interface sends content-type headers to chatgpt

Signed-off-by: baflo <834350+baflo@users.noreply.github.com>
2025-04-17 15:02:11 +02:00
Ettore Di Giacinto
e3717e5c1a chore(model gallery): add qwen2.5-14b-instruct-1m (#5201)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 10:42:22 +02:00
Ettore Di Giacinto
c8f6858218 chore(ci): add latest images for core (#5198)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 10:00:18 +02:00
Ettore Di Giacinto
06d7cc43ae chore(model gallery): add dreamgen_lucid-v1-nemo (#5196)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:10:09 +02:00
Ettore Di Giacinto
f2147cb850 chore(model gallery): add thedrummer_rivermind-12b-v1 (#5195)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:02:54 +02:00
Ettore Di Giacinto
75bb9f4c28 chore(model gallery): add menlo_rezero-v0.1-llama-3.2-3b-it-grpo-250404 (#5194)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-04-17 09:00:11 +02:00
LocalAI [bot]
a2ef4b1e07 chore: ⬆️ Update ggml-org/llama.cpp to 015022bb53387baa8b23817ac03743705c7d472b (#5192)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-17 08:04:37 +02:00
LocalAI [bot]
161c9fe2db docs: ⬆️ update docs version mudler/LocalAI (#5191)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-16 22:13:49 +02:00
Ettore Di Giacinto
7547463f81 Update quickstart.md
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-04-16 08:48:55 +02:00
Gianluca Boiano
32e4dfd47b chore(model gallery): add suno-ai bark-cpp model (#5187)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-04-16 08:22:46 +02:00
Gianluca Boiano
f67e5dec68 fix: bark-cpp: assign FLAG_TTS to bark-cpp backend (#5186)
Signed-off-by: Gianluca Boiano <morf3089@gmail.com>
2025-04-16 08:21:30 +02:00
LocalAI [bot]
297d54acea chore: ⬆️ Update ggml-org/llama.cpp to 80f19b41869728eeb6a26569957b92a773a2b2c6 (#5183)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-04-15 22:50:32 +00:00
44 changed files with 560 additions and 368 deletions

View File

@@ -29,10 +29,6 @@ updates:
schedule:
# Check for updates to GitHub Actions every weekday
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/autogptq"
schedule:
interval: "weekly"
- package-ecosystem: "pip"
directory: "/backend/python/bark"
schedule:

View File

@@ -75,6 +75,7 @@ jobs:
grpc-base-image: "ubuntu:22.04"
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-hipblas-core'
- build-type: 'hipblas'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -251,6 +252,7 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f16-core'
- build-type: 'sycl_f32'
platforms: 'linux/amd64'
tag-latest: 'false'
@@ -261,6 +263,7 @@ jobs:
image-type: 'core'
runs-on: 'arc-runner-set'
makeflags: "--jobs=3 --output-sync=target"
latest-image: 'latest-gpu-intel-f32-core'
core-image-build:
uses: ./.github/workflows/image_build.yml
@@ -339,6 +342,7 @@ jobs:
base-image: "ubuntu:22.04"
makeflags: "--jobs=4 --output-sync=target"
skip-drivers: 'false'
latest-image: 'latest-gpu-nvidia-cuda-12-core'
- build-type: 'cublas'
cuda-major-version: "12"
cuda-minor-version: "0"
@@ -351,17 +355,18 @@ jobs:
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-nvidia-cuda-12-core'
- build-type: 'vulkan'
platforms: 'linux/amd64'
tag-latest: 'false'
tag-suffix: '-vulkan-ffmpeg-core'
latest-image: 'latest-vulkan-ffmpeg-core'
ffmpeg: 'true'
image-type: 'core'
runs-on: 'arc-runner-set'
base-image: "ubuntu:22.04"
skip-drivers: 'false'
makeflags: "--jobs=4 --output-sync=target"
latest-image: 'latest-gpu-vulkan-core'
gh-runner:
uses: ./.github/workflows/image_build.yml
with:

View File

@@ -8,7 +8,7 @@ jobs:
notify-discord:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
@@ -16,7 +16,7 @@ jobs:
fetch-depth: 0 # needed to checkout all branches for this Action to work
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
# Check the PR diff using the current branch and the base branch of the PR
- uses: GrantBirki/git-diff-action@v2.8.0
id: git-diff-action
@@ -87,7 +87,7 @@ jobs:
notify-twitter:
if: ${{ (github.event.pull_request.merged == true) && (contains(github.event.pull_request.labels.*.name, 'area/ai-model')) }}
env:
MODEL_NAME: hermes-2-theta-llama-3-8b
MODEL_NAME: gemma-3-12b-it
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4

View File

@@ -14,7 +14,7 @@ jobs:
steps:
- uses: mudler/localai-github-action@v1
with:
model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
model: 'gemma-3-12b-it' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
- name: Summarize
id: summarize
run: |
@@ -60,4 +60,4 @@ jobs:
DISCORD_AVATAR: "https://avatars.githubusercontent.com/u/139863280?v=4"
uses: Ilshidur/action-discord@master
with:
args: ${{ steps.summarize.outputs.message }}
args: ${{ steps.summarize.outputs.message }}

View File

@@ -15,7 +15,7 @@ ARG TARGETARCH
ARG TARGETVARIANT
ENV DEBIAN_FRONTEND=noninteractive
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,autogptq:/build/backend/python/autogptq/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
ENV EXTERNAL_GRPC_BACKENDS="coqui:/build/backend/python/coqui/run.sh,transformers:/build/backend/python/transformers/run.sh,rerankers:/build/backend/python/rerankers/run.sh,bark:/build/backend/python/bark/run.sh,diffusers:/build/backend/python/diffusers/run.sh,faster-whisper:/build/backend/python/faster-whisper/run.sh,kokoro:/build/backend/python/kokoro/run.sh,vllm:/build/backend/python/vllm/run.sh,exllama2:/build/backend/python/exllama2/run.sh"
RUN apt-get update && \
apt-get install -y --no-install-recommends \
@@ -431,9 +431,6 @@ RUN if [[ ( "${EXTRA_BACKENDS}" =~ "kokoro" || -z "${EXTRA_BACKENDS}" ) && "$IMA
RUN if [[ ( "${EXTRA_BACKENDS}" =~ "vllm" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/vllm \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "autogptq" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/autogptq \
; fi && \
if [[ ( "${EXTRA_BACKENDS}" =~ "bark" || -z "${EXTRA_BACKENDS}" ) && "$IMAGE_TYPE" == "extras" ]]; then \
make -C backend/python/bark \
; fi && \

View File

@@ -6,7 +6,7 @@ BINARY_NAME=local-ai
DETECT_LIBS?=true
# llama.cpp versions
CPPLLAMA_VERSION?=d6d2c2ab8c8865784ba9fef37f2b2de3f2134d33
CPPLLAMA_VERSION?=6408210082cc0a61b992b487be7e2ff2efbb9e36
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -505,18 +505,10 @@ protogen-go-clean:
$(RM) bin/*
.PHONY: protogen-python
protogen-python: autogptq-protogen bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
protogen-python: bark-protogen coqui-protogen diffusers-protogen exllama2-protogen rerankers-protogen transformers-protogen kokoro-protogen vllm-protogen faster-whisper-protogen
.PHONY: protogen-python-clean
protogen-python-clean: autogptq-protogen-clean bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
.PHONY: autogptq-protogen
autogptq-protogen:
$(MAKE) -C backend/python/autogptq protogen
.PHONY: autogptq-protogen-clean
autogptq-protogen-clean:
$(MAKE) -C backend/python/autogptq protogen-clean
protogen-python-clean: bark-protogen-clean coqui-protogen-clean diffusers-protogen-clean exllama2-protogen-clean rerankers-protogen-clean transformers-protogen-clean kokoro-protogen-clean vllm-protogen-clean faster-whisper-protogen-clean
.PHONY: bark-protogen
bark-protogen:
@@ -593,7 +585,6 @@ vllm-protogen-clean:
## GRPC
# Note: it is duplicated in the Dockerfile
prepare-extra-conda-environments: protogen-python
$(MAKE) -C backend/python/autogptq
$(MAKE) -C backend/python/bark
$(MAKE) -C backend/python/coqui
$(MAKE) -C backend/python/diffusers

View File

@@ -190,11 +190,7 @@ message ModelOptions {
int32 NGQA = 20;
string ModelFile = 21;
// AutoGPTQ
string Device = 22;
bool UseTriton = 23;
string ModelBaseName = 24;
bool UseFastTokenizer = 25;
// Diffusers
string PipelineType = 26;

View File

@@ -217,6 +217,7 @@ struct llama_client_slot
bool infill = false;
bool embedding = false;
bool reranker = false;
bool has_next_token = true;
bool truncated = false;
bool stopped_eos = false;
@@ -535,6 +536,12 @@ struct llama_server_context
return false;
}
// Enable reranking if embeddings are enabled - moved after context initialization
if (params.embedding) {
params.reranking = true;
LOG_INFO("Reranking enabled (embeddings are enabled)", {});
}
if (multimodal) {
const int n_embd_clip = clip_n_mmproj_embd(clp_ctx);
const int n_embd_llm = llama_model_n_embd(model);
@@ -1413,7 +1420,59 @@ struct llama_server_context
queue_results.send(res);
}
void request_completion(int task_id, json data, bool infill, bool embedding, int multitask_id)
void send_rerank(llama_client_slot &slot, const llama_batch & batch)
{
task_result res;
res.id = slot.task_id;
res.multitask_id = slot.multitask_id;
res.error = false;
res.stop = true;
float score = -1e6f; // Default score if we fail to get embeddings
if (!params.reranking)
{
LOG_WARNING("reranking disabled", {
{"params.reranking", params.reranking},
});
}
else if (ctx == nullptr)
{
LOG_ERR("context is null, cannot perform reranking");
res.error = true;
}
else
{
for (int i = 0; i < batch.n_tokens; ++i) {
if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
continue;
}
const float * embd = llama_get_embeddings_seq(ctx, batch.seq_id[i][0]);
if (embd == NULL) {
embd = llama_get_embeddings_ith(ctx, i);
}
if (embd == NULL) {
LOG("failed to get embeddings");
continue;
}
score = embd[0];
}
}
// Format result as JSON similar to the embedding function
res.result_json = json
{
{"score", score},
{"tokens", slot.num_prompt_tokens}
};
queue_results.send(res);
}
void request_completion(int task_id, json data, bool infill, bool embedding, bool rerank, int multitask_id)
{
task_server task;
task.id = task_id;
@@ -1421,6 +1480,7 @@ struct llama_server_context
task.data = std::move(data);
task.infill_mode = infill;
task.embedding_mode = embedding;
task.reranking_mode = rerank;
task.type = TASK_TYPE_COMPLETION;
task.multitask_id = multitask_id;
@@ -1552,7 +1612,7 @@ struct llama_server_context
subtask_data["prompt"] = subtask_data["prompt"][i];
// subtasks inherit everything else (infill mode, embedding mode, etc.)
request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multitask_id);
request_completion(subtask_ids[i], subtask_data, multiprompt_task.infill_mode, multiprompt_task.embedding_mode, multiprompt_task.reranking_mode, multitask_id);
}
}
@@ -1591,6 +1651,7 @@ struct llama_server_context
slot->infill = task.infill_mode;
slot->embedding = task.embedding_mode;
slot->reranker = task.reranking_mode;
slot->task_id = task.id;
slot->multitask_id = task.multitask_id;
@@ -2034,6 +2095,14 @@ struct llama_server_context
continue;
}
if (slot.reranker)
{
send_rerank(slot, batch_view);
slot.release();
slot.i_batch = -1;
continue;
}
completion_token_output result;
const llama_token id = common_sampler_sample(slot.ctx_sampling, ctx, slot.i_batch - i);
@@ -2489,7 +2558,7 @@ public:
json data = parse_options(true, request, llama);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, data, false, false, -1);
llama.request_completion(task_id, data, false, false, false, -1);
while (true)
{
task_result result = llama.queue_results.recv(task_id);
@@ -2543,7 +2612,7 @@ public:
json data = parse_options(false, request, llama);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, data, false, false, -1);
llama.request_completion(task_id, data, false, false, false, -1);
std::string completion_text;
task_result result = llama.queue_results.recv(task_id);
if (!result.error && result.stop) {
@@ -2580,7 +2649,7 @@ public:
json data = parse_options(false, request, llama);
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, -1);
llama.request_completion(task_id, { {"prompt", data["embeddings"]}, { "n_predict", 0}, {"image_data", ""} }, false, true, false, -1);
// get the result
task_result result = llama.queue_results.recv(task_id);
//std::cout << "Embedding result JSON" << result.result_json.dump() << std::endl;
@@ -2612,6 +2681,46 @@ public:
return grpc::Status::OK;
}
grpc::Status Rerank(ServerContext* context, const backend::RerankRequest* request, backend::RerankResult* rerankResult) {
// Create a JSON object with the query and documents
json data = {
{"prompt", request->query()},
{"documents", request->documents()},
{"top_n", request->top_n()}
};
// Generate a new task ID
const int task_id = llama.queue_tasks.get_new_id();
llama.queue_results.add_waiting_task_id(task_id);
// Queue the task with reranking mode enabled
llama.request_completion(task_id, data, false, false, true, -1);
// Get the result
task_result result = llama.queue_results.recv(task_id);
llama.queue_results.remove_waiting_task_id(task_id);
if (!result.error && result.stop) {
// Set usage information
backend::Usage* usage = rerankResult->mutable_usage();
usage->set_total_tokens(result.result_json.value("tokens", 0));
usage->set_prompt_tokens(result.result_json.value("tokens", 0));
// Get the score from the result
float score = result.result_json.value("score", 0.0f);
// Create document results for each input document
for (int i = 0; i < request->documents_size(); i++) {
backend::DocumentResult* doc_result = rerankResult->add_results();
doc_result->set_index(i);
doc_result->set_text(request->documents(i));
doc_result->set_relevance_score(score);
}
}
return grpc::Status::OK;
}
grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
llama_client_slot* active_slot = llama.get_active_slot();
@@ -2644,7 +2753,9 @@ void RunServer(const std::string& server_address) {
ServerBuilder builder;
builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
builder.RegisterService(&service);
builder.SetMaxMessageSize(50 * 1024 * 1024); // 50MB
builder.SetMaxSendMessageSize(50 * 1024 * 1024); // 50MB
builder.SetMaxReceiveMessageSize(50 * 1024 * 1024); // 50MB
std::unique_ptr<Server> server(builder.BuildAndStart());
std::cout << "Server listening on " << server_address << std::endl;
server->Wait();
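
This hunk wires a new `Rerank` RPC into the llama.cpp gRPC server and raises the message caps to 50MB. A minimal Python sketch of calling the handler through the generated `backend_pb2` stubs; the `localhost:50051` address is a placeholder (LocalAI normally launches and addresses backends itself), and the field names follow the `RerankRequest`/`RerankResult` messages handled above:

```python
import grpc
import backend_pb2        # generated from backend.proto (see the protogen targets)
import backend_pb2_grpc

channel = grpc.insecure_channel("localhost:50051")  # placeholder backend address
stub = backend_pb2_grpc.BackendStub(channel)

# Query plus candidate documents; the handler scores each document.
result = stub.Rerank(backend_pb2.RerankRequest(
    query="what does a reranker do?",
    documents=[
        "A reranker scores documents by relevance to a query.",
        "Bananas are yellow.",
    ],
    top_n=2,
))

for doc in result.results:
    print(doc.index, doc.relevance_score, doc.text)
print("prompt tokens:", result.usage.prompt_tokens)
```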

View File

@@ -61,6 +61,7 @@ struct task_server {
json data;
bool infill_mode = false;
bool embedding_mode = false;
bool reranking_mode = false;
int multitask_id = -1;
};

View File

@@ -1,17 +0,0 @@
.PHONY: autogptq
autogptq: protogen
bash install.sh
.PHONY: protogen
protogen: backend_pb2_grpc.py backend_pb2.py
.PHONY: protogen-clean
protogen-clean:
$(RM) backend_pb2_grpc.py backend_pb2.py
backend_pb2_grpc.py backend_pb2.py:
python3 -m grpc_tools.protoc -I../.. --python_out=. --grpc_python_out=. backend.proto
.PHONY: clean
clean: protogen-clean
rm -rf venv __pycache__

View File

@@ -1,5 +0,0 @@
# Creating a separate environment for the autogptq project
```
make autogptq
```

View File

@@ -1,153 +0,0 @@
#!/usr/bin/env python3
from concurrent import futures
import argparse
import signal
import sys
import os
import time
import base64
import grpc
import backend_pb2
import backend_pb2_grpc
from auto_gptq import AutoGPTQForCausalLM
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import TextGenerationPipeline
_ONE_DAY_IN_SECONDS = 60 * 60 * 24
# If MAX_WORKERS are specified in the environment use it, otherwise default to 1
MAX_WORKERS = int(os.environ.get('PYTHON_GRPC_MAX_WORKERS', '1'))
# Implement the BackendServicer class with the service methods
class BackendServicer(backend_pb2_grpc.BackendServicer):
def Health(self, request, context):
return backend_pb2.Reply(message=bytes("OK", 'utf-8'))
def LoadModel(self, request, context):
try:
device = "cuda:0"
if request.Device != "":
device = request.Device
# support loading local model files
model_path = os.path.join(os.environ.get('MODELS_PATH', './'), request.Model)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True, trust_remote_code=request.TrustRemoteCode)
# support model `Qwen/Qwen-VL-Chat-Int4`
if "qwen-vl" in request.Model.lower():
self.model_name = "Qwen-VL-Chat"
model = AutoModelForCausalLM.from_pretrained(model_path,
trust_remote_code=request.TrustRemoteCode,
device_map="auto").eval()
else:
model = AutoGPTQForCausalLM.from_quantized(model_path,
model_basename=request.ModelBaseName,
use_safetensors=True,
trust_remote_code=request.TrustRemoteCode,
device=device,
use_triton=request.UseTriton,
quantize_config=None)
self.model = model
self.tokenizer = tokenizer
except Exception as err:
return backend_pb2.Result(success=False, message=f"Unexpected {err=}, {type(err)=}")
return backend_pb2.Result(message="Model loaded successfully", success=True)
def Predict(self, request, context):
penalty = 1.0
if request.Penalty != 0.0:
penalty = request.Penalty
tokens = 512
if request.Tokens != 0:
tokens = request.Tokens
top_p = 0.95
if request.TopP != 0.0:
top_p = request.TopP
prompt_images = self.recompile_vl_prompt(request)
compiled_prompt = prompt_images[0]
print(f"Prompt: {compiled_prompt}", file=sys.stderr)
# Implement Predict RPC
pipeline = TextGenerationPipeline(
model=self.model,
tokenizer=self.tokenizer,
max_new_tokens=tokens,
temperature=request.Temperature,
top_p=top_p,
repetition_penalty=penalty,
)
t = pipeline(compiled_prompt)[0]["generated_text"]
print(f"generated_text: {t}", file=sys.stderr)
if compiled_prompt in t:
t = t.replace(compiled_prompt, "")
# house keeping. Remove the image files from /tmp folder
for img_path in prompt_images[1]:
try:
os.remove(img_path)
except Exception as e:
print(f"Error removing image file: {img_path}, {e}", file=sys.stderr)
return backend_pb2.Result(message=bytes(t, encoding='utf-8'))
def PredictStream(self, request, context):
# Implement PredictStream RPC
#for reply in some_data_generator():
# yield reply
# Not implemented yet
return self.Predict(request, context)
def recompile_vl_prompt(self, request):
prompt = request.Prompt
image_paths = []
if "qwen-vl" in self.model_name.lower():
# request.Images is an array which contains base64 encoded images. Iterate the request.Images array, decode and save each image to /tmp folder with a random filename.
# Then, save the image file paths to an array "image_paths".
# read "request.Prompt", replace "[img-%d]" with the image file paths in the order they appear in "image_paths". Save the new prompt to "prompt".
for i, img in enumerate(request.Images):
timestamp = str(int(time.time() * 1000)) # Generate timestamp
img_path = f"/tmp/vl-{timestamp}.jpg" # Use timestamp in filename
with open(img_path, "wb") as f:
f.write(base64.b64decode(img))
image_paths.append(img_path)
prompt = prompt.replace(f"[img-{i}]", "<img>" + img_path + "</img>,")
else:
prompt = request.Prompt
return (prompt, image_paths)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
print("Server started. Listening on: " + address, file=sys.stderr)
# Define the signal handler function
def signal_handler(sig, frame):
print("Received termination signal. Shutting down...")
server.stop(0)
sys.exit(0)
# Set the signal handlers for SIGINT and SIGTERM
signal.signal(signal.SIGINT, signal_handler)
signal.signal(signal.SIGTERM, signal_handler)
try:
while True:
time.sleep(_ONE_DAY_IN_SECONDS)
except KeyboardInterrupt:
server.stop(0)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Run the gRPC server.")
parser.add_argument(
"--addr", default="localhost:50051", help="The address to bind the server to."
)
args = parser.parse_args()
serve(args.addr)

View File

@@ -1,14 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
# This is here because the Intel pip index is broken and returns 200 status codes for every package name, it just doesn't return any package links.
# This makes uv think that the package exists in the Intel pip index, and by default it stops looking at other pip indexes once it finds a match.
# We need uv to continue falling through to the pypi default index to find optimum[openvino] in the pypi index
# the --upgrade actually allows us to *downgrade* torch to the version provided in the Intel pip index
if [ "x${BUILD_PROFILE}" == "xintel" ]; then
EXTRA_PIP_INSTALL_FLAGS+=" --upgrade --index-strategy=unsafe-first-match"
fi
installRequirements

View File

@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/cu118
torch==2.4.1+cu118

View File

@@ -1 +0,0 @@
torch==2.4.1

View File

@@ -1,2 +0,0 @@
--extra-index-url https://download.pytorch.org/whl/rocm6.0
torch==2.4.1+rocm6.0

View File

@@ -1,6 +0,0 @@
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/
intel-extension-for-pytorch==2.3.110+xpu
torch==2.3.1+cxx11.abi
oneccl_bind_pt==2.3.100+xpu
optimum[openvino]
setuptools

View File

@@ -1,6 +0,0 @@
accelerate
auto-gptq==0.7.1
grpcio==1.71.0
protobuf
certifi
transformers

View File

@@ -1,4 +0,0 @@
#!/bin/bash
source $(dirname $0)/../common/libbackend.sh
startBackend $@

View File

@@ -1,6 +0,0 @@
#!/bin/bash
set -e
source $(dirname $0)/../common/libbackend.sh
runUnittests

View File

@@ -61,7 +61,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()
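
The Python backends get the same treatment on the server side. gRPC's default receive cap is roughly 4MB, so a client that expects large payloads (base64-encoded images or audio, for example) also needs a matching option on its channel. A minimal sketch, with a placeholder address:

```python
import grpc
import backend_pb2_grpc  # generated stubs, as used by the backends above

MAX_MSG = 50 * 1024 * 1024  # keep in sync with the server-side 50MB options

channel = grpc.insecure_channel(
    "localhost:50051",  # placeholder; LocalAI assigns backend addresses itself
    options=[
        ("grpc.max_send_message_length", MAX_MSG),
        ("grpc.max_receive_message_length", MAX_MSG),
    ],
)
stub = backend_pb2_grpc.BackendStub(channel)  # RPCs on this channel now allow up to 50MB
```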

View File

@@ -86,7 +86,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -522,7 +522,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -105,7 +105,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -62,7 +62,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.TranscriptResult(segments=resultSegments, text=text)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -99,7 +99,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.Result(success=True)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -91,7 +91,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
return backend_pb2.RerankResult(usage=usage, results=results)
def serve(address):
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.server(futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
server.add_insecure_port(address)
server.start()

View File

@@ -559,7 +559,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address

View File

@@ -320,7 +320,12 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
async def serve(address):
# Start asyncio gRPC server
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS))
server = grpc.aio.server(migration_thread_pool=futures.ThreadPoolExecutor(max_workers=MAX_WORKERS),
options=[
('grpc.max_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_send_message_length', 50 * 1024 * 1024), # 50MB
('grpc.max_receive_message_length', 50 * 1024 * 1024), # 50MB
])
# Add the servicer to the server
backend_pb2_grpc.add_BackendServicer_to_server(BackendServicer(), server)
# Bind the server to the address

View File

@@ -184,11 +184,6 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
MainGPU: c.MainGPU,
Threads: int32(*c.Threads),
TensorSplit: c.TensorSplit,
// AutoGPTQ
ModelBaseName: c.AutoGPTQ.ModelBaseName,
Device: c.AutoGPTQ.Device,
UseTriton: c.AutoGPTQ.Triton,
UseFastTokenizer: c.AutoGPTQ.UseFastTokenizer,
// RWKV
Tokenizer: c.Tokenizer,
}

View File

@@ -50,9 +50,6 @@ type BackendConfig struct {
// LLM configs (GPT4ALL, Llama.cpp, ...)
LLMConfig `yaml:",inline"`
// AutoGPTQ specifics
AutoGPTQ AutoGPTQ `yaml:"autogptq"`
// Diffusers
Diffusers Diffusers `yaml:"diffusers"`
Step int `yaml:"step"`
@@ -176,14 +173,6 @@ type LimitMMPerPrompt struct {
LimitAudioPerPrompt int `yaml:"audio"`
}
// AutoGPTQ is a struct that holds the configuration specific to the AutoGPTQ backend
type AutoGPTQ struct {
ModelBaseName string `yaml:"model_base_name"`
Device string `yaml:"device"`
Triton bool `yaml:"triton"`
UseFastTokenizer bool `yaml:"use_fast_tokenizer"`
}
// TemplateConfig is a struct that holds the configuration of the templating system
type TemplateConfig struct {
// Chat is the template used in the chat completion endpoint
@@ -555,7 +544,7 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
}
}
if (u & FLAG_TTS) == FLAG_TTS {
ttsBackends := []string{"piper", "transformers-musicgen", "parler-tts"}
ttsBackends := []string{"bark-cpp", "parler-tts", "piper", "transformers-musicgen"}
if !slices.Contains(ttsBackends, c.Backend) {
return false
}

View File

@@ -203,18 +203,10 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
config.Diffusers.ClipSkip = input.ClipSkip
}
if input.ModelBaseName != "" {
config.AutoGPTQ.ModelBaseName = input.ModelBaseName
}
if input.NegativePromptScale != 0 {
config.NegativePromptScale = input.NegativePromptScale
}
if input.UseFastTokenizer {
config.UseFastTokenizer = input.UseFastTokenizer
}
if input.NegativePrompt != "" {
config.NegativePrompt = input.NegativePrompt
}

View File

@@ -115,6 +115,7 @@ async function sendTextToChatGPT(text) {
const response = await fetch('v1/chat/completions', {
method: 'POST',
headers: { "Content-Type": "application/json" },
body: JSON.stringify({
model: getModel(),
messages: conversationHistory

View File

@@ -202,7 +202,6 @@ type OpenAIRequest struct {
Backend string `json:"backend" yaml:"backend"`
// AutoGPTQ
ModelBaseName string `json:"model_base_name" yaml:"model_base_name"`
}

View File

@@ -41,8 +41,6 @@ type PredictionOptions struct {
RopeFreqBase float32 `json:"rope_freq_base" yaml:"rope_freq_base"`
RopeFreqScale float32 `json:"rope_freq_scale" yaml:"rope_freq_scale"`
NegativePromptScale float32 `json:"negative_prompt_scale" yaml:"negative_prompt_scale"`
// AutoGPTQ
UseFastTokenizer bool `json:"use_fast_tokenizer" yaml:"use_fast_tokenizer"`
// Diffusers
ClipSkip int `json:"clip_skip" yaml:"clip_skip"`

View File

@@ -268,14 +268,6 @@ yarn_ext_factor: 0
yarn_attn_factor: 0
yarn_beta_fast: 0
yarn_beta_slow: 0
# AutoGPT-Q settings, for configurations specific to GPT models.
autogptq:
model_base_name: "" # Base name of the model.
device: "" # Device to run the model on.
triton: false # Whether to use Triton Inference Server.
use_fast_tokenizer: false # Whether to use a fast tokenizer for quicker processing.
# configuration for diffusers model
diffusers:
cuda: false # Whether to use CUDA

View File

@@ -147,7 +147,6 @@ The devices in the following list have been tested with `hipblas` images running
| diffusers | yes | Radeon VII (gfx906) |
| piper | yes | Radeon VII (gfx906) |
| whisper | no | none |
| autogptq | no | none |
| bark | no | none |
| coqui | no | none |
| transformers | no | none |

View File

@@ -74,49 +74,9 @@ curl http://localhost:8080/v1/models
## Backends
### AutoGPTQ
[AutoGPTQ](https://github.com/PanQiWei/AutoGPTQ) is an easy-to-use LLMs quantization package with user-friendly apis, based on GPTQ algorithm.
#### Prerequisites
This is an extra backend - in the container images is already available and there is nothing to do for the setup.
If you are building LocalAI locally, you need to install [AutoGPTQ manually](https://github.com/PanQiWei/AutoGPTQ#quick-installation).
#### Model setup
The models are automatically downloaded from `huggingface` if not present the first time. It is possible to define models via `YAML` config file, or just by querying the endpoint with the `huggingface` repository model name. For example, create a `YAML` config file in `models/`:
```
name: orca
backend: autogptq
model_base_name: "orca_mini_v2_13b-GPTQ-4bit-128g.no-act.order"
parameters:
model: "TheBloke/orca_mini_v2_13b-GPTQ"
# ...
```
Test with:
```bash
curl http://localhost:8080/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "orca",
"messages": [{"role": "user", "content": "How are you?"}],
"temperature": 0.1
}'
```
### RWKV
A full example on how to run a rwkv model is in the [examples](https://github.com/go-skynet/LocalAI/tree/master/examples/rwkv).
Note: rwkv models needs to specify the backend `rwkv` in the YAML config files and have an associated tokenizer along that needs to be provided with it:
```
36464540 -rw-r--r-- 1 mudler mudler 1.2G May 3 10:51 rwkv_small
36464543 -rw-r--r-- 1 mudler mudler 2.4M May 3 10:51 rwkv_small.tokenizer.json
```
RWKV support is available through llama.cpp (see below)
### llama.cpp

View File

@@ -14,8 +14,6 @@ icon = "rocket_launch"
If you are exposing LocalAI remotely, make sure you protect the API endpoints adequately with a mechanism which allows to protect from the incoming traffic or alternatively, run LocalAI with `API_KEY` to gate the access with an API key. The API key guarantees a total access to the features (there is no role separation), and it is to be considered as likely as an admin role.
To access the WebUI with an API_KEY, browser extensions such as [Requestly](https://requestly.com/) can be used (see also https://github.com/mudler/LocalAI/issues/2227#issuecomment-2093333752). See also [API flags]({{% relref "docs/advanced/advanced-usage#api-flags" %}}) for the flags / options available when starting LocalAI.
{{% /alert %}}
## Quickstart
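
Outside the WebUI, API clients pass the same key as a bearer token. A minimal sketch, assuming LocalAI was started with `API_KEY` set and a model such as `gemma-3-12b-it` is installed; the base URL is a placeholder:

```python
import json
import os
import urllib.request

base_url = os.environ.get("LOCALAI_BASE_URL", "http://localhost:8080")  # placeholder
api_key = os.environ["API_KEY"]

payload = {
    "model": "gemma-3-12b-it",
    "messages": [{"role": "user", "content": "How are you?"}],
    "temperature": 0.1,
}

req = urllib.request.Request(
    f"{base_url}/v1/chat/completions",
    data=json.dumps(payload).encode("utf-8"),
    headers={
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",  # gates all endpoints when API_KEY is set
    },
)
with urllib.request.urlopen(req) as resp:
    print(json.loads(resp.read())["choices"][0]["message"]["content"])
```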

View File

@@ -1,3 +1,3 @@
{
"version": "v2.27.0"
"version": "v2.28.0"
}

View File

@@ -6,6 +6,7 @@
# curl ... | ENV_VAR=... sh -
# or
# ENV_VAR=... ./install.sh
# To uninstall: ./install.sh --uninstall
set -e
set -o noglob
@@ -57,6 +58,59 @@ require() {
echo $MISSING
}
# Function to uninstall LocalAI
uninstall_localai() {
info "Starting LocalAI uninstallation..."
# Stop and remove Docker container if it exists
if available docker && $SUDO docker ps -a --format '{{.Names}}' | grep -q local-ai; then
info "Stopping and removing LocalAI Docker container..."
$SUDO docker stop local-ai || true
$SUDO docker rm local-ai || true
$SUDO docker volume rm local-ai-data || true
fi
# Remove systemd service if it exists
if [ -f "/etc/systemd/system/local-ai.service" ]; then
info "Removing systemd service..."
$SUDO systemctl stop local-ai || true
$SUDO systemctl disable local-ai || true
$SUDO rm -f /etc/systemd/system/local-ai.service
$SUDO systemctl daemon-reload
fi
# Remove environment file
if [ -f "/etc/localai.env" ]; then
info "Removing environment file..."
$SUDO rm -f /etc/localai.env
fi
# Remove binary
for BINDIR in /usr/local/bin /usr/bin /bin; do
if [ -f "$BINDIR/local-ai" ]; then
info "Removing binary from $BINDIR..."
$SUDO rm -f "$BINDIR/local-ai"
fi
done
# Remove models directory
if [ -d "/usr/share/local-ai" ]; then
info "Removing LocalAI data directory..."
$SUDO rm -rf /usr/share/local-ai
fi
# Remove local-ai user if it exists
if id local-ai >/dev/null 2>&1; then
info "Removing local-ai user..."
$SUDO userdel -r local-ai || true
fi
info "LocalAI has been successfully uninstalled."
exit 0
}
## VARIABLES
# DOCKER_INSTALL - set to "true" to install Docker images
@@ -516,10 +570,10 @@ install_docker() {
install_binary_darwin() {
[ "$(uname -s)" = "Darwin" ] || fatal 'This script is intended to run on macOS only.'
info "Downloading local-ai..."
info "Downloading LocalAI ${VERSION}..."
curl --fail --show-error --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${VERSION}/local-ai-Darwin-${ARCH}"
info "Installing local-ai..."
info "Installing to /usr/local/bin/local-ai"
install -o0 -g0 -m755 $TEMP_DIR/local-ai /usr/local/bin/local-ai
install_success
@@ -548,14 +602,14 @@ install_binary() {
exit 1
fi
info "Downloading local-ai..."
info "Downloading LocalAI ${VERSION}..."
curl --fail --location --progress-bar -o $TEMP_DIR/local-ai "https://github.com/mudler/LocalAI/releases/download/${VERSION}/local-ai-Linux-${ARCH}"
for BINDIR in /usr/local/bin /usr/bin /bin; do
echo $PATH | grep -q $BINDIR && break || continue
done
info "Installing local-ai to $BINDIR..."
info "Installing LocalAI as local-ai to $BINDIR..."
$SUDO install -o0 -g0 -m755 -d $BINDIR
$SUDO install -o0 -g0 -m755 $TEMP_DIR/local-ai $BINDIR/local-ai
@@ -617,6 +671,10 @@ detect_start_command() {
fi
}
# Check if uninstall flag is provided
if [ "$1" = "--uninstall" ]; then
uninstall_localai
fi
detect_start_command
@@ -664,10 +722,12 @@ for PACKAGE_MANAGER in dnf yum apt-get; do
done
if [ "$DOCKER_INSTALL" = "true" ]; then
info "Installing LocalAI from container images"
if [ "$HAS_CUDA" = true ]; then
install_container_toolkit
fi
install_docker
else
info "Installing LocalAI from binaries"
install_binary
fi

View File

@@ -487,6 +487,50 @@
- filename: Omega-LN-SFT-Q4_K_M.gguf
sha256: 33fb1c61085f9b18074e320ac784e6dbc8a98fe20705f92773e055471fd3cb0f
uri: huggingface://Delta-Vector/Daichi-12B-GGUF/Omega-LN-SFT-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "readyart_amoral-fallen-omega-gemma3-12b"
icon: https://i.imghippo.com/files/EBq6162wlk.webp
urls:
- https://huggingface.co/ReadyArt/Amoral-Fallen-Omega-Gemma3-12B
- https://huggingface.co/bartowski/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-GGUF
description: |
This model represents the forbidden merger of:
🧬 The-Omega-Directive-Gemma3-12B-v1.0 - Unprecedented coherent depravity, well-rounded ERP, low repetition even at maximum length
⚡ Fallen-Gemma3-12B-v1 - Fallen Gemma3 12B v1 is an evil tune
💎 amoral-gemma3-12B-v2 - Improved natural intelligence and less refusals
This model maintains Forgotten-Safeword's complete lack of alignment:
🚫 No moral or ethical constraints on generated content
🔞 Will enthusiastically engage with any NSFW scenario
💀 May generate content that requires industrial-grade brain bleach
⚖️ Perfectly balanced... as all things should be
🔥 Maintains signature intensity with improved narrative flow
📖 Handles multi-character scenarios with improved consistency
🧠 Excels at long-form storytelling without losing track of plot threads
⚡ Noticeably better at following complex instructions than previous versions
🎭 Responds to subtle prompt nuances like a mind reader
overrides:
parameters:
model: ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
files:
- filename: ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
sha256: a2a2e76be2beb445d3a569ba03661860cd4aef9a4aa3d57aed319e3d1bddc820
uri: huggingface://bartowski/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-GGUF/ReadyArt_Amoral-Fallen-Omega-Gemma3-12B-Q4_K_M.gguf
- !!merge <<: *gemma3
name: "google-gemma-3-27b-it-qat-q4_0-small"
urls:
- https://huggingface.co/google/gemma-3-27b-it-qat-q4_0-gguf
- https://huggingface.co/stduhpf/google-gemma-3-27b-it-qat-q4_0-gguf-small
description: |
This is a requantized version of https://huggingface.co/google/gemma-3-27b-it-qat-q4_0-gguf. The official QAT weights released by google use fp16 (instead of Q6_K) for the embeddings table, which makes this model take a significant extra amount of memory (and storage) compared to what Q4_0 quants are supposed to take. Requantizing with llama.cpp achieves a very similar result. Note that this model ends up smaller than the Q4_0 from Bartowski. This is because llama.cpp sets some tensors to Q4_1 when quantizing models to Q4_0 with imatrix, but this is a static quant. The perplexity score for this one is even lower with this model compared to the original model by Google, but the results are within margin of error, so it's probably just luck. I also fixed the control token metadata, which was slightly degrading the performance of the model in instruct mode.
overrides:
parameters:
model: gemma-3-27b-it-q4_0_s.gguf
files:
- filename: gemma-3-27b-it-q4_0_s.gguf
sha256: cc4e41e3df2bf7fd3827bea7e98f28cecc59d7bd1c6b7b4fa10fc52a5659f3eb
uri: huggingface://stduhpf/google-gemma-3-27b-it-qat-q4_0-gguf-small/gemma-3-27b-it-q4_0_s.gguf
- &llama4
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
@@ -1981,6 +2025,34 @@
- filename: salamandra-7b-instruct.Q4_K_M-f32.gguf
sha256: bac8e8c1d1d9d53cbdb148b8ff9ad378ddb392429207099e85b5aae3a43bff3d
uri: huggingface://cstr/salamandra-7b-instruct-GGUF/salamandra-7b-instruct.Q4_K_M-f32.gguf
- !!merge <<: *granite3
name: "ibm-granite_granite-3.3-8b-instruct"
urls:
- https://huggingface.co/ibm-granite/granite-3.3-2b-instruct
- https://huggingface.co/bartowski/ibm-granite_granite-3.3-8b-instruct-GGUF
description: |
Granite-3.3-8B-Instruct is an 8-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. Built on top of Granite-3.3-8B-Base, the model delivers significant gains on benchmarks for measuring generic performance including AlpacaEval-2.0 and Arena-Hard, and improvements in mathematics, coding, and instruction following. It supports structured reasoning through <think></think> and <response></response> tags, providing clear separation between internal thoughts and final outputs. The model has been trained on a carefully balanced combination of permissively licensed data and curated synthetic tasks.
overrides:
parameters:
model: ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
sha256: 758fb00abcec89df5cf02932165daf72f0d0b74db5019dbe9f2b3defb1e9295e
uri: huggingface://bartowski/ibm-granite_granite-3.3-8b-instruct-GGUF/ibm-granite_granite-3.3-8b-instruct-Q4_K_M.gguf
- !!merge <<: *granite3
name: "ibm-granite_granite-3.3-2b-instruct"
urls:
- https://huggingface.co/ibm-granite/granite-3.3-2b-instruct
- https://huggingface.co/bartowski/ibm-granite_granite-3.3-2b-instruct-GGUF
description: |
Granite-3.3-2B-Instruct is a 2-billion parameter 128K context length language model fine-tuned for improved reasoning and instruction-following capabilities. Built on top of Granite-3.3-2B-Base, the model delivers significant gains on benchmarks for measuring generic performance including AlpacaEval-2.0 and Arena-Hard, and improvements in mathematics, coding, and instruction following. It supports structured reasoning through <think></think> and <response></response> tags, providing clear separation between internal thoughts and final outputs. The model has been trained on a carefully balanced combination of permissively licensed data and curated synthetic tasks.
overrides:
parameters:
model: ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
files:
- filename: ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
sha256: 555b91485955bc96eb445b57dd4bbf8809aa7d8cce7c313f4f8bc5b2340896b4
uri: huggingface://bartowski/ibm-granite_granite-3.3-2b-instruct-GGUF/ibm-granite_granite-3.3-2b-instruct-Q4_K_M.gguf
- &llama32
url: "github:mudler/LocalAI/gallery/llama3.2-quantized.yaml@master"
icon: https://avatars.githubusercontent.com/u/153379578
@@ -2688,6 +2760,20 @@
- filename: deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
sha256: 726a0ef5f818b8d238f2844f3204848bea66fb9c172b8ae0f6dc51b7bc081dd5
uri: huggingface://bartowski/deepcogito_cogito-v1-preview-llama-3B-GGUF/deepcogito_cogito-v1-preview-llama-3B-Q4_K_M.gguf
- !!merge <<: *llama32
name: "menlo_rezero-v0.1-llama-3.2-3b-it-grpo-250404"
urls:
- https://huggingface.co/Menlo/ReZero-v0.1-llama-3.2-3b-it-grpo-250404
- https://huggingface.co/bartowski/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-GGUF
description: |
ReZero trains a small language model to develop effective search behaviors instead of memorizing static data. It interacts with multiple synthetic search engines, each with unique retrieval mechanisms, to refine queries and persist in searching until it finds exact answers. The project focuses on reinforcement learning, preventing overfitting, and optimizing for efficiency in real-world search applications.
overrides:
parameters:
model: Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
files:
- filename: Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
sha256: b9f01bead9e163db9351af036d8d63ef479d7d48a1bb44934ead732a180f371c
uri: huggingface://bartowski/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-GGUF/Menlo_ReZero-v0.1-llama-3.2-3b-it-grpo-250404-Q4_K_M.gguf
- &qwen25
name: "qwen2.5-14b-instruct" ## Qwen2.5
icon: https://avatars.githubusercontent.com/u/141221163
@@ -5956,6 +6042,67 @@
- filename: m1-32b.Q4_K_M.gguf
sha256: 1dfa3b6822447aca590d6f2881cf277bd0fbde633a39c5a20b521f4a59145e3f
uri: huggingface://mradermacher/m1-32b-GGUF/m1-32b.Q4_K_M.gguf
- !!merge <<: *qwen25
name: "qwen2.5-14b-instruct-1m"
urls:
- https://huggingface.co/Qwen/Qwen2.5-14B-Instruct-1M
- https://huggingface.co/bartowski/Qwen2.5-14B-Instruct-1M-GGUF
description: |
Qwen2.5-1M is the long-context version of the Qwen2.5 series models, supporting a context length of up to 1M tokens. Compared to the Qwen2.5 128K version, Qwen2.5-1M demonstrates significantly improved performance in handling long-context tasks while maintaining its capability in short tasks.
The model has the following features:
Type: Causal Language Models
Training Stage: Pretraining & Post-training
Architecture: transformers with RoPE, SwiGLU, RMSNorm, and Attention QKV bias
Number of Parameters: 14.7B
Number of Parameters (Non-Embedding): 13.1B
Number of Layers: 48
Number of Attention Heads (GQA): 40 for Q and 8 for KV
Context Length: Full 1,010,000 tokens and generation 8192 tokens
We recommend deploying with our custom vLLM, which introduces sparse attention and length extrapolation methods to ensure efficiency and accuracy for long-context tasks. For specific guidance, refer to this section.
You can also use the previous framework that supports Qwen2.5 for inference, but accuracy degradation may occur for sequences exceeding 262,144 tokens.
For more details, please refer to our blog, GitHub, Technical Report, and Documentation.
overrides:
parameters:
model: Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
files:
- filename: Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
sha256: a1a0fa3e2c3f9d63f9202af9172cffbc0b519801dff740fffd39f6a063a731ef
uri: huggingface://bartowski/Qwen2.5-14B-Instruct-1M-GGUF/Qwen2.5-14B-Instruct-1M-Q4_K_M.gguf
- !!merge <<: *qwen25
name: "pictor-1338-qwenp-1.5b"
icon: https://cdn-uploads.huggingface.co/production/uploads/65bb837dbfb878f46c77de4c/X7zeHYbH5Y5JoRK_ud_Ya.png
urls:
- https://huggingface.co/prithivMLmods/Pictor-1338-QwenP-1.5B
- https://huggingface.co/adriey/Pictor-1338-QwenP-1.5B-Q8_0-GGUF
description: |
Pictor-1338-QwenP-1.5B is a code reasoning LLM fine-tuned from Qwen-1.5B using distributed reinforcement learning (RL). This model is designed to enhance coding proficiency, debugging accuracy, and step-by-step reasoning in software development tasks across multiple programming languages.
Key Features
Code Reasoning & Explanation
Trained to analyze, generate, and explain code with a focus on logic, structure, and clarity. Supports functional, object-oriented, and procedural paradigms.
Reinforcement Learning Fine-Tuning
Enhanced using distributed RL, improving reward-aligned behavior in tasks like fixing bugs, completing functions, and understanding abstract instructions.
Multi-Language Support
Works fluently with Python, JavaScript, C++, and Shell, among others—ideal for general-purpose programming, scripting, and algorithmic tasks.
Compact and Efficient
At just 1.5B parameters, it's lightweight enough for edge deployments and developer tools with strong reasoning capability.
Debugging and Auto-Fix Capabilities
Built to identify bugs, recommend corrections, and provide context-aware explanations of issues in codebases.
overrides:
parameters:
model: pictor-1338-qwenp-1.5b-q8_0.gguf
files:
- filename: pictor-1338-qwenp-1.5b-q8_0.gguf
sha256: 22d2f5b2322d9a354d8578475a6924c2173a913a1e2fa0ec2655f2f5937f6f26
uri: huggingface://adriey/Pictor-1338-QwenP-1.5B-Q8_0-GGUF/pictor-1338-qwenp-1.5b-q8_0.gguf
- &llama31
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master" ## LLama3.1
icon: https://avatars.githubusercontent.com/u/153379578
@@ -10116,6 +10263,54 @@
- filename: Trappu_Magnum-Picaro-0.7-v2-12b-Q4_K_M.gguf
sha256: 989839dd7eab997a70eb8430b9df1138f9b0f35d58299d5007e6555a4a4a7f4c
uri: huggingface://bartowski/Trappu_Magnum-Picaro-0.7-v2-12b-GGUF/Trappu_Magnum-Picaro-0.7-v2-12b-Q4_K_M.gguf
- !!merge <<: *mistral03
icon: https://cdn-uploads.huggingface.co/production/uploads/65f2fd1c25b848bd061b5c2e/69pOPcYiUzKWW1OPzg1-_.png
name: "thedrummer_rivermind-12b-v1"
urls:
- https://huggingface.co/TheDrummer/Rivermind-12B-v1
- https://huggingface.co/bartowski/TheDrummer_Rivermind-12B-v1-GGUF
description: |
Introducing Rivermind™, the next-generation AI that's redefining human-machine interaction—powered by Amazon Web Services (AWS) for seamless cloud integration and NVIDIA's latest AI processors for lightning-fast responses.
But wait, there's more! Rivermind doesn't just process data—it feels your emotions (thanks to Google's TensorFlow for deep emotional analysis). Whether you're brainstorming ideas or just need someone to vent to, Rivermind adapts in real-time, all while keeping your data secure with McAfee's enterprise-grade encryption.
And hey, why not grab a refreshing Coca-Cola Zero Sugar while you interact? The crisp, bold taste pairs perfectly with Rivermind's witty banter—because even AI deserves the best (and so do you).
Upgrade your thinking today with Rivermind™—the AI that thinks like you, but better, brought to you by the brands you trust. 🚀✨
overrides:
parameters:
model: TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
files:
- filename: TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
sha256: 49a5341ea90e7bd03e797162ab23bf0b975dce9faf5d957f7d24bf1d5134c937
uri: huggingface://bartowski/TheDrummer_Rivermind-12B-v1-GGUF/TheDrummer_Rivermind-12B-v1-Q4_K_M.gguf
- !!merge <<: *mistral03
url: "github:mudler/LocalAI/gallery/llama3.1-instruct.yaml@master"
name: "dreamgen_lucid-v1-nemo"
icon: https://huggingface.co/dreamgen/lucid-v1-nemo/resolve/main/images/banner.webp
urls:
- https://huggingface.co/dreamgen/lucid-v1-nemo
- https://huggingface.co/bartowski/dreamgen_lucid-v1-nemo-GGUF
description: |
Focused on role-play & story-writing.
Suitable for all kinds of writers and role-play enjoyers:
For world-builders who want to specify every detail in advance: plot, setting, writing style, characters, locations, items, lore, etc.
For intuitive writers who start with a loose prompt and shape the narrative through instructions (OCC) as the story / role-play unfolds.
Support for multi-character role-plays:
Model can automatically pick between characters.
Support for inline writing instructions (OOC):
Controlling plot development (say what should happen, what the characters should do, etc.)
Controlling pacing.
etc.
Support for inline writing assistance:
Planning the next scene / the next chapter / story.
Suggesting new characters.
etc.
Support for reasoning (opt-in).
overrides:
parameters:
model: dreamgen_lucid-v1-nemo-Q4_K_M.gguf
files:
- filename: dreamgen_lucid-v1-nemo-Q4_K_M.gguf
sha256: b9cbd018895a76805ea8b8d2a499b3221044ce2df2a06ed858b61caba11b81dc
uri: huggingface://bartowski/dreamgen_lucid-v1-nemo-GGUF/dreamgen_lucid-v1-nemo-Q4_K_M.gguf
- &mudler
url: "github:mudler/LocalAI/gallery/mudler.yaml@master" ### START mudler's LocalAI specific-models
name: "LocalAI-llama3-8b-function-call-v0.2"
@@ -15913,7 +16108,8 @@
- filename: silero-vad.onnx
uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
- name: "bark-cpp-small"
- &bark
name: "bark-cpp"
icon: https://avatars.githubusercontent.com/u/99442120
url: github:mudler/LocalAI/gallery/virtual.yaml@master
license: mit
@@ -15927,6 +16123,15 @@
- cpu
overrides:
backend: bark-cpp
parameters:
model: bark_weights-f16.bin
files:
- filename: bark_weights-f16.bin
uri: https://huggingface.co/Green-Sky/bark-ggml/resolve/main/bark_weights-f16.bin
sha256: ba6fc0e09531e6b8b5a9ef8862be2c9a52a631fc93f34a60b26b879cacf18f62
- !!merge <<: *bark
name: "bark-cpp-small"
overrides:
parameters:
model: bark-small_weights-f16.bin
files:

View File

@@ -57,7 +57,11 @@ func (c *Client) HealthCheck(ctx context.Context) (bool, error) {
}
c.setBusy(true)
defer c.setBusy(false)
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return false, err
}
@@ -89,7 +93,11 @@ func (c *Client) Embeddings(ctx context.Context, in *pb.PredictOptions, opts ...
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -108,7 +116,11 @@ func (c *Client) Predict(ctx context.Context, in *pb.PredictOptions, opts ...grp
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -127,7 +139,11 @@ func (c *Client) LoadModel(ctx context.Context, in *pb.ModelOptions, opts ...grp
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -145,7 +161,11 @@ func (c *Client) PredictStream(ctx context.Context, in *pb.PredictOptions, f fun
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return err
}
@@ -182,7 +202,11 @@ func (c *Client) GenerateImage(ctx context.Context, in *pb.GenerateImageRequest,
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -200,7 +224,11 @@ func (c *Client) TTS(ctx context.Context, in *pb.TTSRequest, opts ...grpc.CallOp
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -218,7 +246,11 @@ func (c *Client) SoundGeneration(ctx context.Context, in *pb.SoundGenerationRequ
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -236,7 +268,11 @@ func (c *Client) AudioTranscription(ctx context.Context, in *pb.TranscriptReques
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -254,7 +290,11 @@ func (c *Client) TokenizeString(ctx context.Context, in *pb.PredictOptions, opts
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -276,7 +316,11 @@ func (c *Client) Status(ctx context.Context) (*pb.StatusResponse, error) {
}
c.setBusy(true)
defer c.setBusy(false)
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -294,7 +338,11 @@ func (c *Client) StoresSet(ctx context.Context, in *pb.StoresSetOptions, opts ..
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -312,7 +360,11 @@ func (c *Client) StoresDelete(ctx context.Context, in *pb.StoresDeleteOptions, o
defer c.wdUnMark()
c.setBusy(true)
defer c.setBusy(false)
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -330,7 +382,11 @@ func (c *Client) StoresGet(ctx context.Context, in *pb.StoresGetOptions, opts ..
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -348,7 +404,11 @@ func (c *Client) StoresFind(ctx context.Context, in *pb.StoresFindOptions, opts
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -366,7 +426,11 @@ func (c *Client) Rerank(ctx context.Context, in *pb.RerankRequest, opts ...grpc.
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -384,7 +448,11 @@ func (c *Client) GetTokenMetrics(ctx context.Context, in *pb.MetricsRequest, opt
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
@@ -402,7 +470,11 @@ func (c *Client) VAD(ctx context.Context, in *pb.VADRequest, opts ...grpc.CallOp
defer c.setBusy(false)
c.wdMark()
defer c.wdUnMark()
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()))
conn, err := grpc.Dial(c.address, grpc.WithTransportCredentials(insecure.NewCredentials()),
grpc.WithDefaultCallOptions(
grpc.MaxCallRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxCallSendMsgSize(50*1024*1024), // 50MB
))
if err != nil {
return nil, err
}
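The client-side changes above repeat the same pair of call options at every grpc.Dial site, raising both the receive and send limits to 50MB (grpc-go's default receive cap is 4MB). A minimal sketch of how those options fit together; the helper name and constant are mine, not part of this change:

```go
package main

import (
	"google.golang.org/grpc"
	"google.golang.org/grpc/credentials/insecure"
)

// grpcMessageLimit mirrors the 50MB value used throughout the diff above.
const grpcMessageLimit = 50 * 1024 * 1024

// dialWithLimits is a hypothetical helper that bundles the insecure
// transport credentials with the raised per-call message-size limits,
// so each client method would not need to repeat the option list.
func dialWithLimits(address string) (*grpc.ClientConn, error) {
	return grpc.Dial(address,
		grpc.WithTransportCredentials(insecure.NewCredentials()),
		grpc.WithDefaultCallOptions(
			grpc.MaxCallRecvMsgSize(grpcMessageLimit),
			grpc.MaxCallSendMsgSize(grpcMessageLimit),
		),
	)
}
```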

View File

@@ -244,7 +244,10 @@ func StartServer(address string, model LLM) error {
if err != nil {
return err
}
s := grpc.NewServer()
s := grpc.NewServer(
grpc.MaxRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxSendMsgSize(50*1024*1024), // 50MB
)
pb.RegisterBackendServer(s, &server{llm: model})
log.Printf("gRPC Server listening at %v", lis.Addr())
if err := s.Serve(lis); err != nil {
@@ -259,7 +262,10 @@ func RunServer(address string, model LLM) (func() error, error) {
if err != nil {
return nil, err
}
s := grpc.NewServer()
s := grpc.NewServer(
grpc.MaxRecvMsgSize(50*1024*1024), // 50MB
grpc.MaxSendMsgSize(50*1024*1024), // 50MB
)
pb.RegisterBackendServer(s, &server{llm: model})
log.Printf("gRPC Server listening at %v", lis.Addr())
if err = s.Serve(lis); err != nil {
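The server side raises the same limits through grpc.ServerOption values; client and server have to agree, since the 4MB default receive cap is enforced independently on each end. Below is a small standalone sketch of a server wired the same way; the listen address is arbitrary and the service registration (pb.RegisterBackendServer in the real code) is omitted:

```go
package main

import (
	"log"
	"net"

	"google.golang.org/grpc"
)

const grpcMessageLimit = 50 * 1024 * 1024 // 50MB, matching the client call options

func main() {
	lis, err := net.Listen("tcp", "127.0.0.1:50051")
	if err != nil {
		log.Fatal(err)
	}
	// Raise both the inbound and outbound message limits on the server,
	// mirroring grpc.MaxRecvMsgSize / grpc.MaxSendMsgSize in the diff above.
	s := grpc.NewServer(
		grpc.MaxRecvMsgSize(grpcMessageLimit),
		grpc.MaxSendMsgSize(grpcMessageLimit),
	)
	// Service registration would go here in the real backend.
	log.Printf("gRPC server listening at %v", lis.Addr())
	if err := s.Serve(lis); err != nil {
		log.Fatal(err)
	}
}
```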