WIP speculative

2026-05-25 01:02:05 -04:00 · 2025-01-24 10:17:54 +01:00
166 changed files with 2979 additions and 242473 deletions
--- a/Generation/musicgen.bru
+++ b/Generation/musicgen.bru
@@ -0,0 +1,23 @@
 meta {
  name: musicgen
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/sound-generation
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model_id": "facebook/musicgen-small",
      "text": "Exciting 80s Newscast Interstitial",
      "duration_seconds": 8
  }
 }
--- a/Requests/backend
+++ b/Requests/backend
@@ -0,0 +1,17 @@
 meta {
  name: backend monitor
  type: http
  seq: 4
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/monitor
  body: json
  auth: none
 }
 body:json {
  {
    "model": "{{DEFAULT_MODEL}}"
  }
 }
--- a/monitor/backend-shutdown.bru
+++ b/monitor/backend-shutdown.bru
@@ -0,0 +1,21 @@
 meta {
  name: backend-shutdown
  type: http
  seq: 3
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/backend/shutdown
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}"
  }
 }
--- a/Requests/bruno.json
+++ b/Requests/bruno.json
@@ -0,0 +1,5 @@
 {
  "version": "1",
  "name": "LocalAI Test Requests",
  "type": "collection"
 }
--- a/Requests/environments/localhost.bru
+++ b/Requests/environments/localhost.bru
@@ -0,0 +1,6 @@
 vars {
  HOST: localhost
  PORT: 8080
  DEFAULT_MODEL: gpt-3.5-turbo
  PROTOCOL: http://
 }
--- a/.bruno/LocalAI
+++ b/.bruno/LocalAI
@@ -0,0 +1,11 @@
 meta {
  name: get models list
  type: http
  seq: 2
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models
  body: none
  auth: none
 }
--- a/generation/Generate
+++ b/generation/Generate
@@ -0,0 +1,25 @@
 meta {
  name: Generate image
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/images/generations
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "prompt": "<positive prompt>|<negative prompt>",
    "model": "model-name",
    "step": 51,
    "size": "1024x1024",
    "image": ""
  }
 }
--- a/text/-completions.bru
+++ b/text/-completions.bru
@@ -0,0 +1,24 @@
 meta {
  name: -completions
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "prompt": "function downloadFile(string url, string outputPath) {",
      "max_tokens": 256,
      "temperature": 0.5
  }
 }
--- a/text/-edits.bru
+++ b/text/-edits.bru
@@ -0,0 +1,23 @@
 meta {
  name: -edits
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/edits
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "What day of the wek is it?",
      "instruction": "Fix the spelling mistakes"
  }
 }
--- a/text/-embeddings.bru
+++ b/text/-embeddings.bru
@@ -0,0 +1,22 @@
 meta {
  name: -embeddings
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/embeddings
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/text/chat/chat
+++ b/text/chat/chat
@@ -0,0 +1,30 @@
 meta {
  name: chat completion -simple- 1 message-
  type: http
  seq: 4
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
    "model": "{{DEFAULT_MODEL}}",
    "messages": [
      {
        "role": "user",
        "content": "How could one use friction to cook an egg?"
      }
    ],
    "max_tokens": 256,
    "temperature": 0.2,
    "grammar": ""
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,29 @@
 meta {
  name: chat-completions -long-
  type: http
  seq: 5
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "system", "content": "You are a helpful, intelligent pirate assistant. Always answer as helpfully as possible, while being a pirate and using pirate language. If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."},
           {"role": "user", "content": "How could one use electricity to cook an egg?"},
           {"role": "assistant",
                  "content": "Shiver me timbers! Using electricity to cook an egg? Well, matey, I reckon that be a right curious idea! *adjusts spectacles* Now, I ain't sure if this be possible, but I'll do me best to help ye out.\n\nFirst things first, ye gotta understand that electricity be a powerful force, and it can't just cook an egg on its own. Ye see, electricity be like a mighty wind that can make things happen, but it needs somethin' to work with. So, if ye want to use electricity to cook an egg, ye gotta find a way to harness that power and make it do the cookin'.\n\nNow, I know what ye might be thinkin': \"How do I harness the power of electricity to cook an egg?\" Well, matey, there be a few ways to do it. One way be to use a special device called an \"electric frying pan.\" This be a pan that has a built-in heating element that gets hot when ye plug it into a wall socket. When the element gets hot, ye can crack an egg into the pan and watch as it cook"
              },
              {"role": "user", "content": "I don't have one of those, just a raw wire and plenty of power! How do we get it done?"}],
       "max_tokens": 1024,
       "temperature": 0.5
  }
 }
--- a/text/chat/chat-completions
+++ b/text/chat/chat-completions
@@ -0,0 +1,25 @@
 meta {
  name: chat-completions -stream-
  type: http
  seq: 6
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/chat/completions
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
       "model": "{{DEFAULT_MODEL}}",
       "messages": [{"role": "user", "content": "Explain how I can set sail on the ocean using only power generated by seagulls?"}],
       "max_tokens": 256,
       "temperature": 0.9,
       "stream": true
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,22 @@
 meta {
  name: add model gallery
  type: http
  seq: 10
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "url": "file:///home/dave/projects/model-gallery/huggingface/TheBloke__CodeLlama-7B-Instruct-GGML.yaml",
      "name": "test"
  }
 }
--- a/gallery/delete
+++ b/gallery/delete
@@ -0,0 +1,21 @@
 meta {
  name: delete model gallery
  type: http
  seq: 11
 }
 delete {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "name": "test"
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,11 @@
 meta {
  name: list MODELS in galleries
  type: http
  seq: 7
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/available
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,11 @@
 meta {
  name: list model GALLERIES
  type: http
  seq: 8
 }
 get {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,11 @@
 meta {
  name: model delete
  type: http
  seq: 7
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/galleries
  body: none
  auth: none
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,21 @@
 meta {
  name: model gallery apply -gist-
  type: http
  seq: 12
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q2_K.bin"
  }
 }
--- a/Requests/model
+++ b/Requests/model
@@ -0,0 +1,22 @@
 meta {
  name: model gallery apply
  type: http
  seq: 9
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/models/apply
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "id": "dave@TheBloke__CodeLlama-7B-Instruct-GGML__codellama-7b-instruct.ggmlv3.Q3_K_S.bin",
      "name": "codellama7b"
  }
 }
--- a/Requests/transcription/gb1.ogg
+++ b/Requests/transcription/gb1.ogg
--- a/Requests/transcription/transcribe.bru
+++ b/Requests/transcription/transcribe.bru
@@ -0,0 +1,16 @@
 meta {
  name: transcribe
  type: http
  seq: 1
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/v1/audio/transcriptions
  body: multipartForm
  auth: none
 }
 body:multipart-form {
  file: @file(transcription/gb1.ogg)
  model: whisper-1
 }
--- a/Requests/tts/-tts.bru
+++ b/Requests/tts/-tts.bru
@@ -0,0 +1,22 @@
 meta {
  name: -tts
  type: http
  seq: 2
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "model": "{{DEFAULT_MODEL}}",
      "input": "A STRANGE GAME.\nTHE ONLY WINNING MOVE IS NOT TO PLAY.\n\nHOW ABOUT A NICE GAME OF CHESS?"
  }
 }
--- a/Requests/tts/musicgen.bru
+++ b/Requests/tts/musicgen.bru
@@ -0,0 +1,23 @@
 meta {
  name: musicgen
  type: http
  seq: 2
 }
 post {
  url: {{PROTOCOL}}{{HOST}}:{{PORT}}/tts
  body: json
  auth: none
 }
 headers {
  Content-Type: application/json
 }
 body:json {
  {
      "backend": "transformers",
      "model": "facebook/musicgen-small",
      "input": "80s Synths playing Jazz"
  }
 }
--- a/.github/labeler.yml
+++ b/.github/labeler.yml
@@ -1,4 +1,4 @@
-enhancement:
+enhancements:
 - head-branch: ['^feature', 'feature']
 dependencies:
--- a/.github/workflows/dependabot_auto.yml
+++ b/.github/workflows/dependabot_auto.yml
@@ -14,7 +14,7 @@ jobs:
    steps:
      - name: Dependabot metadata
        id: metadata
-        uses: dependabot/fetch-metadata@v2.3.0
+        uses: dependabot/fetch-metadata@v2.2.0
        with:
          github-token: "${{ secrets.GITHUB_TOKEN }}"
          skip-commit-verification: true
--- a/.github/workflows/generate_grpc_cache.yaml
+++ b/.github/workflows/generate_grpc_cache.yaml
@@ -2,10 +2,9 @@ name: 'generate and publish GRPC docker caches'
 on:
  workflow_dispatch:
-
+  push:
-  schedule:
+    branches:
-    # daily at midnight
+      - master
    - cron: '0 0 * * *'
 concurrency:
  group: grpc-cache-${{ github.head_ref || github.ref }}-${{ github.repository }}
@@ -17,7 +16,7 @@ jobs:
      matrix:
        include:
          - grpc-base-image: ubuntu:22.04
-            runs-on: 'arc-runner-set'
+            runs-on: 'ubuntu-latest'
            platforms: 'linux/amd64,linux/arm64'
    runs-on: ${{matrix.runs-on}}
    steps:
--- a/.github/workflows/notify-models.yaml
+++ b/.github/workflows/notify-models.yaml
@@ -18,7 +18,7 @@ jobs:
      with:
        model: 'hermes-2-theta-llama-3-8b' # Any from models.localai.io, or from huggingface.com with: "huggingface://<repository>/file"
        # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
@@ -99,7 +99,7 @@ jobs:
        docker run -e -ti -d --name local-ai -p 8080:8080 localai/localai:master-ffmpeg-core run --debug $MODEL_NAME
        until [ "`docker inspect -f {{.State.Health.Status}} local-ai`" == "healthy" ]; do echo "Waiting for container to be ready";  docker logs --tail 10 local-ai; sleep 2; done
      # Check the PR diff using the current branch and the base branch of the PR
-    - uses: GrantBirki/git-diff-action@v2.8.0
+    - uses: GrantBirki/git-diff-action@v2.7.0
      id: git-diff-action
      with:
            json_diff_file_output: diff.json
--- a/2
+++ b/2
@@ -1,6 +1,6 @@
 MIT License
-Copyright (c) 2023-2025 Ettore Di Giacinto (mudler@localai.io)
+Copyright (c) 2023-2024 Ettore Di Giacinto (mudler@localai.io)
 Permission is hereby granted, free of charge, to any person obtaining a copy
 of this software and associated documentation files (the "Software"), to deal
--- a/44
+++ b/44
@@ -6,7 +6,9 @@ BINARY_NAME=local-ai
 DETECT_LIBS?=true
 # llama.cpp versions
-CPPLLAMA_VERSION?=300907b2110cc17b4337334dc397e05de2d8f5e0
+GOLLAMA_REPO?=https://github.com/go-skynet/go-llama.cpp
 GOLLAMA_VERSION?=2b57a8ae43e4699d3dc5d1496a1ccd42922993be
 CPPLLAMA_VERSION?=6152129d05870cb38162c422c6ba80434e021e9f
 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggerganov/whisper.cpp
@@ -22,7 +24,7 @@ BARKCPP_VERSION?=v1.0.0
 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=d46ed5e184b97c2018dc2e8105925bdb8775e02c
+STABLEDIFFUSION_GGML_VERSION?=5eb15ef4d022bef4a391de4f5f6556e81fbb5024
 ONNX_VERSION?=1.20.0
 ONNX_ARCH?=x64
@@ -149,6 +151,7 @@ ifeq ($(BUILD_TYPE),hipblas)
 	LD_LIBRARY_PATH ?= /opt/rocm/lib:/opt/rocm/llvm/lib
 	export CXX=$(ROCM_HOME)/llvm/bin/clang++
 	export CC=$(ROCM_HOME)/llvm/bin/clang
 	# llama-ggml has no hipblas support, so override it here.
 	export STABLE_BUILD_TYPE=
 	export GGML_HIP=1
 	GPU_TARGETS ?= gfx900,gfx906,gfx908,gfx940,gfx941,gfx942,gfx90a,gfx1030,gfx1031,gfx1100,gfx1101
@@ -185,6 +188,7 @@ ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx2
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-avx512
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-fallback
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-ggml
 ALL_GRPC_BACKENDS+=backend-assets/grpc/llama-cpp-grpc
 ALL_GRPC_BACKENDS+=backend-assets/util/llama-cpp-rpc-server
 ALL_GRPC_BACKENDS+=backend-assets/grpc/whisper
@@ -218,6 +222,19 @@ endif
 all: help
 ## go-llama.cpp
 sources/go-llama.cpp:
 	mkdir -p sources/go-llama.cpp
 	cd sources/go-llama.cpp && \
 	git init && \
 	git remote add origin $(GOLLAMA_REPO) && \
 	git fetch origin && \
 	git checkout $(GOLLAMA_VERSION) && \
 	git submodule update --init --recursive --depth 1 --single-branch
 sources/go-llama.cpp/libbinding.a: sources/go-llama.cpp
 	$(MAKE) -C sources/go-llama.cpp BUILD_TYPE=$(STABLE_BUILD_TYPE) libbinding.a
 ## bark.cpp
 sources/bark.cpp:
 	git clone --recursive $(BARKCPP_REPO) sources/bark.cpp && \
@@ -293,17 +310,19 @@ sources/whisper.cpp:
 sources/whisper.cpp/libwhisper.a: sources/whisper.cpp
 	cd sources/whisper.cpp && $(MAKE) libwhisper.a libggml.a
-get-sources: sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
+get-sources: sources/go-llama.cpp sources/go-piper sources/stablediffusion-ggml.cpp sources/bark.cpp sources/whisper.cpp backend/cpp/llama/llama.cpp
 replace:
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp=$(CURDIR)/sources/whisper.cpp
 	$(GOCMD) mod edit -replace github.com/ggerganov/whisper.cpp/bindings/go=$(CURDIR)/sources/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -replace github.com/mudler/go-piper=$(CURDIR)/sources/go-piper
 	$(GOCMD) mod edit -replace github.com/go-skynet/go-llama.cpp=$(CURDIR)/sources/go-llama.cpp
 dropreplace:
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp
 	$(GOCMD) mod edit -dropreplace github.com/ggerganov/whisper.cpp/bindings/go
 	$(GOCMD) mod edit -dropreplace github.com/mudler/go-piper
 	$(GOCMD) mod edit -dropreplace github.com/go-skynet/go-llama.cpp
 prepare-sources: get-sources replace
 	$(GOCMD) mod download
@@ -311,6 +330,7 @@ prepare-sources: get-sources replace
 ## GENERIC
 rebuild: ## Rebuilds the project
 	$(GOCMD) clean -cache
 	$(MAKE) -C sources/go-llama.cpp clean
 	$(MAKE) -C sources/whisper.cpp clean
 	$(MAKE) -C sources/go-piper clean
 	$(MAKE) build
@@ -414,7 +434,7 @@ run: prepare ## run local-ai
 test-models/testmodel.ggml:
 	mkdir test-models
 	mkdir test-dir
-	wget -q https://huggingface.co/RichardErkhov/Qwen_-_Qwen2-1.5B-Instruct-gguf/resolve/main/Qwen2-1.5B-Instruct.Q2_K.gguf -O test-models/testmodel.ggml
+	wget -q https://huggingface.co/TheBloke/orca_mini_3B-GGML/resolve/main/orca-mini-3b.ggmlv3.q4_0.bin -O test-models/testmodel.ggml
 	wget -q https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.en.bin -O test-models/whisper-en
 	wget -q https://huggingface.co/mudler/all-MiniLM-L6-v2/resolve/main/ggml-model-q4_0.bin -O test-models/bert
 	wget -q https://cdn.openai.com/whisper/draft-20220913a/micro-machines.wav -O test-dir/audio.wav
@@ -429,7 +449,8 @@ test: prepare test-models/testmodel.ggml grpcs
 	export GO_TAGS="tts debug"
 	$(MAKE) prepare-test
 	HUGGINGFACE_GRPC=$(abspath ./)/backend/python/transformers/run.sh TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
-	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
+	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="!llama && !llama-gguf"  --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
 	$(MAKE) test-llama
 	$(MAKE) test-llama-gguf
 	$(MAKE) test-tts
 	$(MAKE) test-stablediffusion
@@ -458,6 +479,10 @@ teardown-e2e:
 	rm -rf $(TEST_DIR) || true
 	docker stop $$(docker ps -q --filter ancestor=localai-tests)
 test-llama: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
 test-llama-gguf: prepare-test
 	TEST_DIR=$(abspath ./)/test-dir/ FIXTURES=$(abspath ./)/tests/fixtures CONFIG_FILE=$(abspath ./)/test-models/config.yaml MODELS_PATH=$(abspath ./)/test-models \
 	$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --label-filter="llama-gguf" --flake-attempts $(TEST_FLAKES) -v -r $(TEST_PATHS)
@@ -735,6 +760,13 @@ backend-assets/util/llama-cpp-rpc-server: backend-assets/grpc/llama-cpp-grpc
 	mkdir -p backend-assets/util/
 	cp -rf backend/cpp/llama-grpc/llama.cpp/build/bin/rpc-server backend-assets/util/llama-cpp-rpc-server
 backend-assets/grpc/llama-ggml: sources/go-llama.cpp sources/go-llama.cpp/libbinding.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/sources/go-llama.cpp LIBRARY_PATH=$(CURDIR)/sources/go-llama.cpp \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/llama-ggml ./backend/go/llm/llama-ggml/
 ifneq ($(UPX),)
 	$(UPX) backend-assets/grpc/llama-ggml
 endif
 backend-assets/grpc/bark-cpp: backend/go/bark/libbark.a backend-assets/grpc
 	CGO_LDFLAGS="$(CGO_LDFLAGS)" C_INCLUDE_PATH=$(CURDIR)/backend/go/bark/ LIBRARY_PATH=$(CURDIR)/backend/go/bark/ \
 	$(GOCMD) build -ldflags "$(LD_FLAGS)" -tags "$(GO_TAGS)" -o backend-assets/grpc/bark-cpp ./backend/go/bark/
@@ -829,7 +861,7 @@ swagger:
 .PHONY: gen-assets
 gen-assets:
-	$(GOCMD) run core/dependencies_manager/manager.go webui_static.yaml core/http/static/assets
+	$(GOCMD) run core/dependencies_manager/manager.go embedded/webui_static.yaml core/http/static/assets
 ## Documentation
 docs/layouts/_default:
--- a/aio/cpu/vad.yaml
+++ b/aio/cpu/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/entrypoint.sh
+++ b/aio/entrypoint.sh
@@ -129,7 +129,7 @@ detect_gpu
 detect_gpu_size
 PROFILE="${PROFILE:-$GPU_SIZE}" # default to cpu
-export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vad.yaml,/aio/${PROFILE}/vision.yaml}"
+export MODELS="${MODELS:-/aio/${PROFILE}/embeddings.yaml,/aio/${PROFILE}/rerank.yaml,/aio/${PROFILE}/text-to-speech.yaml,/aio/${PROFILE}/image-gen.yaml,/aio/${PROFILE}/text-to-text.yaml,/aio/${PROFILE}/speech-to-text.yaml,/aio/${PROFILE}/vision.yaml}"
 check_vars
--- a/aio/gpu-8g/vad.yaml
+++ b/aio/gpu-8g/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/aio/intel/vad.yaml
+++ b/aio/intel/vad.yaml
@@ -1,8 +0,0 @@
 backend: silero-vad
 name: silero-vad
 parameters:
  model: silero-vad.onnx
 download_files:
 - filename: silero-vad.onnx
  uri: https://huggingface.co/onnx-community/silero-vad/resolve/main/onnx/model.onnx
  sha256: a4a068cd6cf1ea8355b84327595838ca748ec29a25bc91fc82e6c299ccdc5808
--- a/backend/backend.proto
+++ b/backend/backend.proto
@@ -163,11 +163,6 @@ message Reply {
  double timing_token_generation = 5;
 }
 message GrammarTrigger {
  string word = 1;
  bool at_start = 2; 
 }
 message ModelOptions {
  string Model = 1;
  int32 ContextSize = 2;
@@ -252,8 +247,6 @@ message ModelOptions {
  string CacheTypeKey = 63;
  string CacheTypeValue = 64;
  repeated GrammarTrigger GrammarTriggers = 65;
 }
 message Result {
--- a/backend/cpp/llama/grpc-server.cpp
+++ b/backend/cpp/llama/grpc-server.cpp
@@ -22,6 +22,7 @@
 #include "backend.grpc.pb.h"
 #include "utils.hpp"
 #include "sampling.h"
 #include "speculative.h"
 // include std::regex
 #include <cstddef>
 #include <thread>
@@ -185,12 +186,45 @@ static json probs_vector_to_json(const llama_context *ctx, const std::vector<com
    return out;
 }
 // Per-request generation parameters held by each slot (see llama_client_slot::params).
 // launch_slot_with_data() default-constructs one of these as `default_params` and uses
 // its members as the fallback when the request JSON omits a field (e.g. "n_predict").
 // NOTE(review): for "stream" and "cache_prompt", launch_slot_with_data() passes a literal
 // `false` as the fallback, so the `true` defaults declared below do not apply on that path.
 struct llama_slot_params {
    uint32_t seed      = -1; // RNG seed; -1 converts to UINT32_MAX here (presumably the "use a random seed" sentinel - confirm)
    bool stream        = true;
    bool cache_prompt  = true; // remember the prompt to avoid reprocessing the whole prompt
    bool return_tokens = false;
    int32_t n_keep    =  0; // number of tokens to keep from initial prompt
    int32_t n_discard =  0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
    int32_t n_predict = -1; // new tokens to predict
    int32_t n_indent  =  0; // minimum line indentation for the generated text in number of whitespace characters
    int64_t t_max_prompt_ms  = -1; // TODO: implement
    int64_t t_max_predict_ms = -1; // if positive, limit the generation phase to this time limit
    std::vector<common_adapter_lora_info> lora;
    std::vector<std::string> antiprompt;
    std::vector<std::string> response_fields;
    bool timings_per_token = false;
    bool post_sampling_probs = false;
    bool ignore_eos = false;
    json input_prefix;
    json input_suffix;
    // NOTE(review): the visible part of launch_slot_with_data() stores sampling settings in
    // slot->sparams rather than in this member - confirm whether `sampling` is read anywhere.
    struct common_params_sampling sampling;
    // Draft limits read by the speculative decoding loop as slot.params.speculative.{n_max,n_min,p_min}.
    // NOTE(review): launch_slot_with_data() currently writes slot->sparams.speculative.* instead,
    // so these values may never be populated from the request - confirm which one is intended.
    struct common_params_speculative speculative;
 };
 struct llama_client_slot
 {
    int id;
    int task_id = -1;
-    struct slot_params params;
+    struct llama_slot_params params;
    common_speculative * spec = nullptr;
    llama_batch batch_spec = {};
    slot_state state = IDLE;
    slot_command command = NONE;
@@ -283,6 +317,7 @@ struct llama_client_slot
        images.clear();
    }
    bool has_budget(common_params &global_params) {
        if (params.n_predict == -1 && global_params.n_predict == -1)
        {
@@ -454,6 +489,10 @@ struct llama_server_context
 {
    llama_model *model = nullptr;
    llama_context *ctx = nullptr;
    common_init_result llama_init_dft;
    llama_context * ctx_dft = nullptr;
    llama_model * model_dft = nullptr;
    llama_context_params cparams_dft;
    const llama_vocab * vocab = nullptr;
    clip_ctx *clp_ctx = nullptr;
@@ -468,9 +507,6 @@ struct llama_server_context
    bool add_bos_token      = true;
    bool has_eos_token      = true;
    bool grammar_lazy = false;
    std::vector<common_grammar_trigger> grammar_trigger_words;
    int32_t n_ctx;  // total context for all clients / slots
    // system prompt
@@ -505,6 +541,7 @@ struct llama_server_context
        }
    }
    bool load_model(const common_params &params_)
    {
        params = params_;
@@ -548,6 +585,45 @@ struct llama_server_context
        add_bos_token = llama_vocab_get_add_bos(vocab);
        has_eos_token = llama_vocab_eos(vocab) != LLAMA_TOKEN_NULL;
        if (!params.speculative.model.empty()) {
            LOG("loading draft model '%s'\n", params.speculative.model.c_str());
            auto params_dft = params;
            params_dft.devices      = params.speculative.devices;
            params_dft.model        = params.speculative.model;
            params_dft.n_ctx        = params.speculative.n_ctx == 0 ? params.n_ctx / params.n_parallel : params.speculative.n_ctx;
            params_dft.n_gpu_layers = params.speculative.n_gpu_layers;
            params_dft.n_parallel   = 1;
            llama_init_dft = common_init_from_params(params_dft);
            model_dft = llama_init_dft.model.get();
            if (model_dft == nullptr) {
                LOG("failed to load draft model, '%s'\n", params.speculative.model.c_str());
                return false;
            }
            if (!common_speculative_are_compatible(ctx, llama_init_dft.context.get())) {
                LOG("the draft model '%s' is not compatible with the target model '%s'\n", params.speculative.model.c_str(), params.model.c_str());
                return false;
            }
            const int n_ctx_dft = llama_n_ctx(llama_init_dft.context.get());
            cparams_dft = common_context_params_to_llama(params_dft);
            cparams_dft.n_batch = n_ctx_dft;
            // force F16 KV cache for the draft model for extra performance
            cparams_dft.type_k = GGML_TYPE_F16;
            cparams_dft.type_v = GGML_TYPE_F16;
            // the context is not needed - we will create one for each slot
            llama_init_dft.context.reset();
        }
        return true;
    }
@@ -576,6 +652,22 @@ struct llama_server_context
            slot.n_ctx = n_ctx_slot;
            slot.n_predict = params.n_predict;
            if (model_dft) {
                slot.batch_spec = llama_batch_init(params.speculative.n_max + 1, 0, 1);
                ctx_dft = llama_init_from_model(model_dft, cparams_dft);
                if (ctx_dft == nullptr) {
                    LOG("%s", "failed to create draft context\n");
                    return;
                }
                slot.spec = common_speculative_init(ctx_dft);
                if (slot.spec == nullptr) {
                    LOG("%s", "failed to create speculator\n");
                    return;
                }
            }
            LOG_INFO("new slot", {
                {"slot_id",    slot.id},
                {"n_ctx_slot", slot.n_ctx}
@@ -684,9 +776,11 @@ struct llama_server_context
    }
    bool launch_slot_with_data(llama_client_slot* &slot, json data) {
-        slot_params default_params;
+        llama_slot_params default_params;
        common_params_sampling default_sparams;
        default_sparams.speculative = params_base.speculative;
        slot->params.stream             = json_value(data, "stream",            false);
        slot->params.cache_prompt       = json_value(data, "cache_prompt",      false);
        slot->params.n_predict          = json_value(data, "n_predict",         default_params.n_predict);
@@ -709,8 +803,15 @@ struct llama_server_context
        slot->sparams.grammar           = json_value(data, "grammar",           default_sparams.grammar);
        slot->sparams.n_probs           = json_value(data, "n_probs",           default_sparams.n_probs);
        slot->sparams.min_keep          = json_value(data, "min_keep",          default_sparams.min_keep);
-        slot->sparams.grammar_trigger_words = grammar_trigger_words;
+
-        slot->sparams.grammar_lazy = grammar_lazy;
+
        slot->sparams.speculative.n_min = json_value(data, "speculative.n_min", defaults.speculative.n_min);
        slot->sparams.speculative.n_max = json_value(data, "speculative.n_max", defaults.speculative.n_max);
        slot->sparams.speculative.p_min = json_value(data, "speculative.p_min", defaults.speculative.p_min);
        slot->sparams.speculative.n_min = std::min(params.speculative.n_max, params.speculative.n_min);
        slot->sparams.speculative.n_min = std::max(params.speculative.n_min, 2);
        slot->sparams.speculative.n_max = std::max(params.speculative.n_max, 0);
        if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
            // Might be better to reject the request with a 400 ?
@@ -1155,14 +1256,6 @@ struct llama_server_context
            slot.has_next_token = false;
        }
        if (slot.n_past >= slot.n_ctx) {
            slot.truncated      = true;
            slot.stopped_limit = true;
            slot.has_next_token = false;
            LOG_VERBOSE("stopped due to running out of context capacity", {});
        }
        if (result.tok == llama_vocab_eos(vocab) || llama_vocab_is_eog(vocab, result.tok))
        {
            slot.stopped_eos = true;
@@ -1635,17 +1728,17 @@ struct llama_server_context
            {
                if (slot.is_processing() && system_tokens.size() + slot.cache_tokens.size() >= (size_t) slot.n_ctx)
                {
                    // this check is redundant (for good)
                    // we should never get here, because generation should have already stopped in process_token()
                    // START LOCALAI changes
                    // Temporarily disable context-shifting as it can lead to infinite loops (issue: https://github.com/ggerganov/llama.cpp/issues/3969)
                    // See: https://github.com/mudler/LocalAI/issues/1333
                    // Context is exhausted, release the slot
                    slot.release();
                    send_final_response(slot);
-                    slot.has_next_token = false;
+                    slot.cache_tokens.clear();
-                    LOG_ERROR("context is exhausted, release the slot", {});
+                    slot.n_past = 0;
                    slot.truncated = false;
                    slot.has_next_token = true;
                    LOG("Context exhausted. Slot %d released (%d tokens in cache)\n", slot.id, (int) slot.cache_tokens.size());
                    continue;
                    // END LOCALAI changes
@@ -2037,6 +2130,97 @@ struct llama_server_context
            }
        }
        // do speculative decoding
        // For every slot that is currently generating: ask the speculative helper for a draft
        // continuation, decode `sampled token + draft` in a single batch on the target context,
        // keep the tokens the sampler accepts and feed them through process_token().
        for (auto & slot : slots) {
            // skip slots that are not processing; skip every slot when there is no draft
            // context (ctx_dft) or the global speculative.n_max is not positive
            if (!slot.is_processing() || !(ctx_dft && params.speculative.n_max > 0)) {
                continue;
            }
            // NOTE(review): looks redundant with the is_processing() check above - confirm
            if (slot.state != PROCESSING) {
                continue;
            }
            // determine the max draft that fits the current slot state
            // NOTE(review): an earlier hunk of this patch writes `slot->sparams.speculative.n_max`
            //               while this loop reads `slot.params.speculative` - confirm both name the same struct
            int n_draft_max = slot.params.speculative.n_max;
            // note: n_past is not yet increased for the `id` token sampled above
            //       also, need to leave space for 1 extra token to allow context shifts
            n_draft_max = std::min(n_draft_max, slot.n_ctx - slot.n_past - 2);
            // a positive n_remaining additionally caps the draft at n_remaining - 1
            if (slot.n_remaining > 0) {
                n_draft_max = std::min(n_draft_max, slot.n_remaining - 1);
            }
            LOG("max possible draft: %d\n", n_draft_max);
            if (n_draft_max < slot.params.speculative.n_min) {
                LOG("the max possible draft is too small: %d < %d - skipping speculative decoding\n", n_draft_max, slot.params.speculative.n_min);
                continue;
            }
            // the token sampled for this slot earlier; it is the first token of the speculation batch
            llama_token id = slot.sampled;
            struct common_speculative_params params_spec;
            params_spec.n_draft   = n_draft_max;
            params_spec.n_reuse   = llama_n_ctx(ctx_dft) - slot.params.speculative.n_max;
            params_spec.p_min     = slot.params.speculative.p_min;
            llama_tokens draft = common_speculative_gen_draft(slot.spec, params_spec, slot.cache_tokens, id);
            // ignore small drafts
            if (slot.params.speculative.n_min > (int) draft.size()) {
                LOG("ignoring small draft: %d < %d\n", (int) draft.size(), slot.params.speculative.n_min);
                continue;
            }
            // construct the speculation batch
            // `id` goes at position n_past, the draft tokens at the positions right after it
            common_batch_clear(slot.batch_spec);
            common_batch_add  (slot.batch_spec, id, slot.n_past, { slot.id }, true);
            for (size_t i = 0; i < draft.size(); ++i) {
                common_batch_add(slot.batch_spec, draft[i], slot.n_past + 1 + i, { slot.id }, true);
            }
            LOG("decoding speculative batch, size = %d\n", slot.batch_spec.n_tokens);
            // NOTE(review): the return value of llama_decode() is ignored here
            llama_decode(ctx, slot.batch_spec);
            // the accepted tokens from the speculation
            // judging by the log line at the end of the loop body, ids holds the accepted draft
            // tokens plus one more sampled token - TODO confirm against common_sampler_sample_and_accept_n
            const auto ids = common_sampler_sample_and_accept_n(slot.ctx_sampling, ctx, draft);
            slot.n_past    += ids.size();
            slot.n_decoded += ids.size();
            // cache `id` and every accepted token except the last one
            // NOTE(review): `ids.end() - 1` assumes ids is never empty - confirm
            slot.cache_tokens.push_back(id);
            slot.cache_tokens.insert(slot.cache_tokens.end(), ids.begin(), ids.end() - 1);
            // remove this sequence's KV-cache entries from position n_past onwards
            // (presumably the rejected part of the draft - verify against llama.h)
            llama_kv_cache_seq_rm(ctx, slot.id, slot.n_past, -1);
            // hand every accepted token to process_token(); stop at the first one that ends generation
            for (size_t i = 0; i < ids.size(); ++i) {
                completion_token_output result;
                result.tok          = ids[i];
                result.text_to_send = common_token_to_piece(ctx, result.tok, params.special);
                //result.prob         = 1.0f; // set later
                // TODO: set result.probs
                if (!process_token(result, slot)) {
                    // release slot because of stop condition
                    slot.release();
                    slot.print_timings();
                    send_final_response(slot);
                    metrics.on_prediction(slot);
                    break;
                }
            }
        LOG("accepted %d/%d draft tokens, new n_past = %d\n", (int) ids.size() - 1, (int) draft.size(), slot.n_past);
        }
        LOG_VERBOSE("slots updated", {});
        return true;
    }
@@ -2309,6 +2493,30 @@ static void params_parse(const backend::ModelOptions* request,
    params.cpuparams.n_threads = request->threads();
    params.n_gpu_layers = request->ngpulayers();
    params.n_batch = request->nbatch();
    params.speculative.model = request->draftmodel();
    // Parse the backend options. Each entry has the form "optname:optvalue"; an entry
    // without a value ("optname") is treated as "optname:true".
    // The options are a protobuf repeated string field: iterate by size (there is no NULL
    // sentinel) and split a read-only view instead of running strtok() on const data.
    for (int i = 0; i < request->options_size(); i++) {
        const std::string & opt = request->options(i);
        const size_t sep = opt.find(':');
        const std::string optname = opt.substr(0, sep);
        const std::string optval  = (sep == std::string::npos) ? "true" : opt.substr(sep + 1);
        // note: std::stoi throws if the value does not start with a number
        if (optname == "speculative.n_gpu_layers") {
            params.speculative.n_gpu_layers = std::stoi(optval);
        }
        if (optname == "speculative.n_ctx") {
            params.speculative.n_ctx = std::stoi(optval);
        }
    }
    // a value of 0 means "not set": fall back to the main model's settings
    if (params.speculative.n_gpu_layers == 0) {
        params.speculative.n_gpu_layers = params.n_gpu_layers;
    }
    if (params.speculative.n_ctx == 0) {
        params.speculative.n_ctx = params.n_ctx;
    }
    // Set params.n_parallel by environment variable (LLAMA_PARALLEL), defaults to 1
    //params.n_parallel = 1;
    const char *env_parallel = std::getenv("LLAMACPP_PARALLEL");
@@ -2387,21 +2595,6 @@ static void params_parse(const backend::ModelOptions* request,
    if ( request->ropefreqscale() != 0.0f ) {
        params.rope_freq_scale = request->ropefreqscale();
    }
    if (request->grammartriggers_size() > 0) {
        LOG_INFO("configuring grammar triggers", {});
        llama.grammar_lazy = true;
        for (int i = 0; i < request->grammartriggers_size(); i++) {
            common_grammar_trigger trigger;
            trigger.word = request->grammartriggers(i).word();
            trigger.at_start = request->grammartriggers(i).at_start();
            llama.grammar_trigger_words.push_back(trigger);
            LOG_INFO("grammar trigger", {
                { "word", trigger.word },
                { "at_start", trigger.at_start }
            });
        }
    }
 }
@@ -2550,18 +2743,6 @@ public:
        return grpc::Status::OK;
    }
    grpc::Status TokenizeString(ServerContext* context, const backend::PredictOptions* request, backend::TokenizationResponse* response){
         json data = parse_options(false, request, llama);
         std::vector<llama_token> tokens = llama.tokenize(data["prompt"],false);
         for (int i=0 ; i< tokens.size(); i++){
            response->add_tokens(tokens[i]);
         }
        return grpc::Status::OK;
    }
    grpc::Status GetMetrics(ServerContext* context, const backend::MetricsRequest* request, backend::MetricsResponse* response) {
        llama_client_slot* active_slot = llama.get_active_slot();
--- a/backend/go/llm/llama-ggml/llama.go
+++ b/backend/go/llm/llama-ggml/llama.go
@@ -0,0 +1,204 @@
 package main
 // This is a wrapper to satisfy the GRPC service interface
 // It is meant to be used by the main executable that is the server for the specific backend type (falcon, gpt3, etc)
 import (
 	"fmt"
 	"github.com/go-skynet/go-llama.cpp"
 	"github.com/mudler/LocalAI/pkg/grpc/base"
 	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
 )
 // LLM is the backend implementation served by this executable. It embeds
 // base.SingleThread and holds the go-llama.cpp model that Load creates.
 type LLM struct {
 	base.SingleThread
 	// llama is the loaded model; it is assigned by Load.
 	llama *llama.LLama
 }
 // Load creates the llama model from opts.ModelFile, using model options derived
 // from opts, and stores it on llm. The error from llama.New is returned as-is;
 // llm.llama is assigned even when loading fails.
 func (llm *LLM) Load(opts *pb.ModelOptions) error {
 	// RoPE defaults, overridden only by non-zero request values.
 	freqBase, freqScale := float32(10000), float32(1)
 	if opts.RopeFreqBase != 0 {
 		freqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		freqScale = opts.RopeFreqScale
 	}
 	modelOpts := []llama.ModelOption{
 		llama.WithRopeFreqBase(freqBase),
 		llama.WithRopeFreqScale(freqScale),
 	}
 	// add appends one option; the options are kept in the order they are added.
 	add := func(o llama.ModelOption) { modelOpts = append(modelOpts, o) }
 	if opts.NGQA != 0 {
 		add(llama.WithGQA(int(opts.NGQA)))
 	}
 	if opts.RMSNormEps != 0 {
 		add(llama.WithRMSNormEPS(opts.RMSNormEps))
 	}
 	if opts.ContextSize != 0 {
 		add(llama.SetContext(int(opts.ContextSize)))
 	}
 	if opts.F16Memory {
 		add(llama.EnableF16Memory)
 	}
 	if opts.Embeddings {
 		add(llama.EnableEmbeddings)
 	}
 	if opts.NGPULayers != 0 {
 		add(llama.SetGPULayers(int(opts.NGPULayers)))
 	}
 	add(llama.SetMMap(opts.MMap))
 	add(llama.SetMainGPU(opts.MainGPU))
 	add(llama.SetTensorSplit(opts.TensorSplit))
 	// The batch size falls back to 512 when the request leaves it unset.
 	batchSize := 512
 	if opts.NBatch != 0 {
 		batchSize = int(opts.NBatch)
 	}
 	add(llama.SetNBatch(batchSize))
 	if opts.NUMA {
 		add(llama.EnableNUMA)
 	}
 	if opts.LowVRAM {
 		add(llama.EnabelLowVRAM)
 	}
 	var err error
 	llm.llama, err = llama.New(opts.ModelFile, modelOpts...)
 	return err
 }
 // buildPredictOptions translates the request's prediction settings into
 // go-llama.cpp predict options. Some settings are always passed through;
 // the others are only added when the request sets them to a non-zero,
 // non-empty or true value.
 func buildPredictOptions(opts *pb.PredictOptions) []llama.PredictOption {
 	// RoPE defaults, overridden only by non-zero request values.
 	freqBase, freqScale := float32(10000), float32(1)
 	if opts.RopeFreqBase != 0 {
 		freqBase = opts.RopeFreqBase
 	}
 	if opts.RopeFreqScale != 0 {
 		freqScale = opts.RopeFreqScale
 	}
 	result := []llama.PredictOption{
 		llama.SetTemperature(opts.Temperature),
 		llama.SetTopP(opts.TopP),
 		llama.SetTopK(int(opts.TopK)),
 		llama.SetTokens(int(opts.Tokens)),
 		llama.SetThreads(int(opts.Threads)),
 		llama.WithGrammar(opts.Grammar),
 		llama.SetRopeFreqBase(freqBase),
 		llama.SetRopeFreqScale(freqScale),
 		llama.SetNegativePromptScale(opts.NegativePromptScale),
 		llama.SetNegativePrompt(opts.NegativePrompt),
 	}
 	// add appends one option; the options are kept in the order they are added.
 	add := func(o llama.PredictOption) { result = append(result, o) }
 	if opts.PromptCacheAll {
 		add(llama.EnablePromptCacheAll)
 	}
 	if opts.PromptCacheRO {
 		add(llama.EnablePromptCacheRO)
 	}
 	// Expected absolute path
 	if opts.PromptCachePath != "" {
 		add(llama.SetPathPromptCache(opts.PromptCachePath))
 	}
 	if opts.Mirostat != 0 {
 		add(llama.SetMirostat(int(opts.Mirostat)))
 	}
 	if opts.MirostatETA != 0 {
 		add(llama.SetMirostatETA(opts.MirostatETA))
 	}
 	if opts.MirostatTAU != 0 {
 		add(llama.SetMirostatTAU(opts.MirostatTAU))
 	}
 	if opts.Debug {
 		add(llama.Debug)
 	}
 	add(llama.SetStopWords(opts.StopPrompts...))
 	if opts.PresencePenalty != 0 {
 		add(llama.SetPenalty(opts.PresencePenalty))
 	}
 	if opts.NKeep != 0 {
 		add(llama.SetNKeep(int(opts.NKeep)))
 	}
 	if opts.Batch != 0 {
 		add(llama.SetBatch(int(opts.Batch)))
 	}
 	if opts.F16KV {
 		add(llama.EnableF16KV)
 	}
 	if opts.IgnoreEOS {
 		add(llama.IgnoreEOS)
 	}
 	if opts.Seed != 0 {
 		add(llama.SetSeed(int(opts.Seed)))
 	}
 	//predictOptions = append(predictOptions, llama.SetLogitBias(c.Seed))
 	// The remaining settings are always forwarded, whatever their value.
 	add(llama.SetFrequencyPenalty(opts.FrequencyPenalty))
 	add(llama.SetMlock(opts.MLock))
 	add(llama.SetMemoryMap(opts.MMap))
 	add(llama.SetPredictionMainGPU(opts.MainGPU))
 	add(llama.SetPredictionTensorSplit(opts.TensorSplit))
 	add(llama.SetTailFreeSamplingZ(opts.TailFreeSamplingZ))
 	add(llama.SetTypicalP(opts.TypicalP))
 	return result
 }
 // Predict runs a single prediction for opts.Prompt and returns the model's
 // output together with any error reported by the model.
 func (llm *LLM) Predict(opts *pb.PredictOptions) (string, error) {
 	predictOpts := buildPredictOptions(opts)
 	return llm.llama.Predict(opts.Prompt, predictOpts...)
 }
 // PredictStream starts a prediction for opts.Prompt in a new goroutine and
 // sends each token passed to the token callback on results. The channel is
 // closed once the prediction returns. A prediction error is only printed;
 // PredictStream itself always returns nil.
 func (llm *LLM) PredictStream(opts *pb.PredictOptions, results chan string) error {
 	// Forward each token to the caller; the callback always returns true
 	// (presumably "keep generating" - verify against go-llama.cpp).
 	onToken := func(token string) bool {
 		results <- token
 		return true
 	}
 	streamOpts := append(buildPredictOptions(opts), llama.SetTokenCallback(onToken))
 	go func() {
 		if _, err := llm.llama.Predict(opts.Prompt, streamOpts...); err != nil {
 			fmt.Println("err: ", err)
 		}
 		close(results)
 	}()
 	return nil
 }
 // Embeddings returns the embedding vector for the request. When
 // opts.EmbeddingTokens is non-empty the tokens are embedded directly;
 // otherwise the text in opts.Embeddings is embedded.
 func (llm *LLM) Embeddings(opts *pb.PredictOptions) ([]float32, error) {
 	predictOpts := buildPredictOptions(opts)
 	if len(opts.EmbeddingTokens) == 0 {
 		return llm.llama.Embeddings(opts.Embeddings, predictOpts...)
 	}
 	// Convert the request's token values to the []int the llama binding takes.
 	ids := make([]int, len(opts.EmbeddingTokens))
 	for i, tok := range opts.EmbeddingTokens {
 		ids[i] = int(tok)
 	}
 	return llm.llama.TokenEmbeddings(ids, predictOpts...)
 }
--- a/backend/go/llm/llama-ggml/main.go
+++ b/backend/go/llm/llama-ggml/main.go
@@ -0,0 +1,19 @@
 package main
 import (
 	"flag"
 	grpc "github.com/mudler/LocalAI/pkg/grpc"
 )
 var (
 	addr = flag.String("addr", "localhost:50051", "the address to connect to")
 )
 // main parses the command-line flags and starts the gRPC server for the LLM
 // backend on the configured address, panicking if StartServer returns an error.
 func main() {
 	flag.Parse()
 	err := grpc.StartServer(*addr, &LLM{})
 	if err != nil {
 		panic(err)
 	}
 }
--- a/backend/python/autogptq/requirements.txt
+++ b/backend/python/autogptq/requirements.txt
@@ -1,6 +1,6 @@
 accelerate
 auto-gptq==0.7.1
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
 transformers
--- a/backend/python/bark/requirements.txt
+++ b/backend/python/bark/requirements.txt
@@ -1,4 +1,4 @@
 bark==0.1.5
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
--- a/backend/python/common/template/requirements.txt
+++ b/backend/python/common/template/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 grpcio-tools
--- a/backend/python/coqui/requirements.txt
+++ b/backend/python/coqui/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
 packaging==24.1
--- a/backend/python/diffusers/backend.py
+++ b/backend/python/diffusers/backend.py
@@ -159,18 +159,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
                torchType = torch.float16
                variant = "fp16"
            options = request.Options
            # empty dict
            self.options = {}
            # The options are a list of strings in this form optname:optvalue
            # We are storing all the options in a dict so we can use it later when
            # generating the images
            for opt in options:
                key, value = opt.split(":")
                self.options[key] = value
            local = False
            modelFile = request.Model
@@ -453,9 +441,6 @@ class BackendServicer(backend_pb2_grpc.BackendServicer):
        # create a dictionary of parameters by using the keys from EnableParameters and the values from defaults
        kwargs = {key: options.get(key) for key in keys if key in options}
        # populate kwargs from self.options.
        kwargs.update(self.options)
        # Set seed
        if request.seed > 0:
            kwargs["generator"] = torch.Generator(device=self.device).manual_seed(
--- a/backend/python/diffusers/requirements.txt
+++ b/backend/python/diffusers/requirements.txt
@@ -1,5 +1,5 @@
 setuptools
-grpcio==1.70.0
+grpcio==1.69.0
 pillow
 protobuf
 certifi
--- a/backend/python/exllama2/requirements.txt
+++ b/backend/python/exllama2/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
 wheel
--- a/backend/python/faster-whisper/requirements.txt
+++ b/backend/python/faster-whisper/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 grpcio-tools
--- a/backend/python/kokoro/requirements.txt
+++ b/backend/python/kokoro/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 phonemizer
 scipy
--- a/backend/python/rerankers/requirements.txt
+++ b/backend/python/rerankers/requirements.txt
@@ -1,3 +1,3 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
--- a/backend/python/transformers/requirements-cpu.txt
+++ b/backend/python/transformers/requirements-cpu.txt
@@ -5,4 +5,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.1
+sentence-transformers==3.3.1
--- a/backend/python/transformers/requirements-cublas11.txt
+++ b/backend/python/transformers/requirements-cublas11.txt
@@ -6,4 +6,4 @@ accelerate
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.1
+sentence-transformers==3.3.1
--- a/backend/python/transformers/requirements-cublas12.txt
+++ b/backend/python/transformers/requirements-cublas12.txt
@@ -5,4 +5,4 @@ numba==0.60.0
 transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.1
+sentence-transformers==3.3.1
--- a/backend/python/transformers/requirements-hipblas.txt
+++ b/backend/python/transformers/requirements-hipblas.txt
@@ -7,4 +7,4 @@ numba==0.60.0
 bitsandbytes
 outetts
 bitsandbytes
-sentence-transformers==3.4.1
+sentence-transformers==3.3.1
--- a/backend/python/transformers/requirements-intel.txt
+++ b/backend/python/transformers/requirements-intel.txt
@@ -8,4 +8,4 @@ numba==0.60.0
 intel-extension-for-transformers
 bitsandbytes
 outetts
-sentence-transformers==3.4.1
+sentence-transformers==3.3.1
--- a/backend/python/transformers/requirements.txt
+++ b/backend/python/transformers/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
 setuptools
--- a/backend/python/vllm/requirements.txt
+++ b/backend/python/vllm/requirements.txt
@@ -1,4 +1,4 @@
-grpcio==1.70.0
+grpcio==1.69.0
 protobuf
 certifi
 setuptools
--- a/core/application/startup.go
+++ b/core/application/startup.go
@@ -62,7 +62,7 @@ func New(opts ...config.AppOption) (*Application, error) {
 		}
 	}
-	if err := pkgStartup.InstallModels(options.Galleries, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
+	if err := pkgStartup.InstallModels(options.Galleries, options.ModelLibraryURL, options.ModelPath, options.EnforcePredownloadScans, nil, options.ModelsURL...); err != nil {
 		log.Error().Err(err).Msg("error installing models")
 	}
@@ -145,7 +145,13 @@ func New(opts ...config.AppOption) (*Application, error) {
 	if options.LoadToMemory != nil {
 		for _, m := range options.LoadToMemory {
-			cfg, err := application.BackendLoader().LoadBackendConfigFileByNameDefaultOptions(m, options)
+			cfg, err := application.BackendLoader().LoadBackendConfigFileByName(m, options.ModelPath,
 				config.LoadOptionDebug(options.Debug),
 				config.LoadOptionThreads(options.Threads),
 				config.LoadOptionContextSize(options.ContextSize),
 				config.LoadOptionF16(options.F16),
 				config.ModelPath(options.ModelPath),
 			)
 			if err != nil {
 				return nil, err
 			}
--- a/core/backend/llm.go
+++ b/core/backend/llm.go
@@ -33,7 +33,7 @@ type TokenUsage struct {
 	TimingTokenGeneration  float64
 }
-func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c *config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
+func ModelInference(ctx context.Context, s string, messages []schema.Message, images, videos, audios []string, loader *model.ModelLoader, c config.BackendConfig, o *config.ApplicationConfig, tokenCallback func(string, TokenUsage) bool) (func() (LLMResponse, error), error) {
 	modelFile := c.Model
 	// Check if the modelFile exists, if it doesn't try to load it from the gallery
@@ -48,7 +48,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 		}
 	}
-	opts := ModelOptions(*c, o)
+	opts := ModelOptions(c, o)
 	inferenceModel, err := loader.Load(opts...)
 	if err != nil {
 		return nil, err
@@ -84,7 +84,7 @@ func ModelInference(ctx context.Context, s string, messages []schema.Message, im
 	// in GRPC, the backend is supposed to answer to 1 single token if stream is not supported
 	fn := func() (LLMResponse, error) {
-		opts := gRPCPredictOpts(*c, loader.ModelPath)
+		opts := gRPCPredictOpts(c, loader.ModelPath)
 		opts.Prompt = s
 		opts.Messages = protoMessages
 		opts.UseTokenizerTemplate = c.TemplateConfig.UseTokenizerTemplate
--- a/core/backend/options.go
+++ b/core/backend/options.go
@@ -118,19 +118,9 @@ func grpcModelOpts(c config.BackendConfig) *pb.ModelOptions {
 		nGPULayers = *c.NGPULayers
 	}
 	triggers := make([]*pb.GrammarTrigger, 0)
 	for _, t := range c.FunctionsConfig.GrammarConfig.GrammarTriggers {
 		triggers = append(triggers, &pb.GrammarTrigger{
 			Word:    t.Word,
 			AtStart: t.AtStart,
 		})
 	}
 	return &pb.ModelOptions{
 		CUDA:                 c.CUDA || c.Diffusers.CUDA,
 		SchedulerType:        c.Diffusers.SchedulerType,
 		GrammarTriggers:      triggers,
 		PipelineType:         c.Diffusers.PipelineType,
 		CFGScale:             c.CFGScale,
 		LoraAdapter:          c.LoraAdapter,
--- a/core/backend/rerank.go
+++ b/core/backend/rerank.go
@@ -9,10 +9,10 @@ import (
 	model "github.com/mudler/LocalAI/pkg/model"
 )
-func Rerank(request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
+func Rerank(modelFile string, request *proto.RerankRequest, loader *model.ModelLoader, appConfig *config.ApplicationConfig, backendConfig config.BackendConfig) (*proto.RerankResult, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	rerankModel, err := loader.Load(opts...)
 	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
 	rerankModel, err := loader.Load(opts...)
 	if err != nil {
 		return nil, err
 	}
--- a/core/backend/soundgeneration.go
+++ b/core/backend/soundgeneration.go
@@ -13,6 +13,7 @@ import (
 )
 func SoundGeneration(
 	modelFile string,
 	text string,
 	duration *float32,
 	temperature *float32,
@@ -24,9 +25,8 @@ func SoundGeneration(
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
-	opts := ModelOptions(backendConfig, appConfig)
+	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
 	soundGenModel, err := loader.Load(opts...)
 	if err != nil {
 		return "", nil, err
 	}
@@ -44,7 +44,7 @@ func SoundGeneration(
 	res, err := soundGenModel.SoundGeneration(context.Background(), &proto.SoundGenerationRequest{
 		Text:        text,
-		Model:       backendConfig.Model,
+		Model:       modelFile,
 		Dst:         filePath,
 		Sample:      doSample,
 		Duration:    duration,
--- a/core/backend/tokenize.go
+++ b/core/backend/tokenize.go
@@ -4,17 +4,24 @@ import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 )
 func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.BackendConfig, appConfig *config.ApplicationConfig) (schema.TokenizeResponse, error) {
 	modelFile := backendConfig.Model
 	var inferenceModel grpc.Backend
 	var err error
-	opts := ModelOptions(backendConfig, appConfig)
+	opts := ModelOptions(backendConfig, appConfig, model.WithModel(modelFile))
 	inferenceModel, err = loader.Load(opts...)
 	if backendConfig.Backend == "" {
 		inferenceModel, err = loader.Load(opts...)
 	} else {
 		opts = append(opts, model.WithBackendString(backendConfig.Backend))
 		inferenceModel, err = loader.Load(opts...)
 	}
 	if err != nil {
 		return schema.TokenizeResponse{}, err
 	}
@@ -28,10 +35,6 @@ func ModelTokenize(s string, loader *model.ModelLoader, backendConfig config.Bac
 		return schema.TokenizeResponse{}, err
 	}
 	if resp.Tokens == nil {
 		resp.Tokens = make([]int32, 0)
 	}
 	return schema.TokenizeResponse{
 		Tokens: resp.Tokens,
 	}, nil
--- a/core/backend/transcript.go
+++ b/core/backend/transcript.go
@@ -47,7 +47,7 @@ func ModelTranscription(audio, language string, translate bool, ml *model.ModelL
 			tks = append(tks, int(t))
 		}
 		tr.Segments = append(tr.Segments,
-			schema.TranscriptionSegment{
+			schema.Segment{
 				Text:   s.Text,
 				Id:     int(s.Id),
 				Start:  time.Duration(s.Start),
--- a/core/backend/tts.go
+++ b/core/backend/tts.go
@@ -14,22 +14,28 @@ import (
 )
 func ModelTTS(
 	backend,
 	text,
 	modelFile,
 	voice,
 	language string,
 	loader *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig,
 ) (string, *proto.Result, error) {
-	opts := ModelOptions(backendConfig, appConfig, model.WithDefaultBackendString(model.PiperBackend))
+	bb := backend
-	ttsModel, err := loader.Load(opts...)
+	if bb == "" {
 		bb = model.PiperBackend
 	}
 	opts := ModelOptions(backendConfig, appConfig, model.WithBackendString(bb), model.WithModel(modelFile))
 	ttsModel, err := loader.Load(opts...)
 	if err != nil {
 		return "", nil, err
 	}
 	if ttsModel == nil {
-		return "", nil, fmt.Errorf("could not load tts model %q", backendConfig.Model)
+		return "", nil, fmt.Errorf("could not load piper model")
 	}
 	if err := os.MkdirAll(appConfig.AudioDir, 0750); err != nil {
@@ -39,21 +45,22 @@ func ModelTTS(
 	fileName := utils.GenerateUniqueFileName(appConfig.AudioDir, "tts", ".wav")
 	filePath := filepath.Join(appConfig.AudioDir, fileName)
-	// We join the model name to the model path here. This seems to only be done for TTS and is HIGHLY suspect.
+	// If the model file is not empty, we pass it joined with the model path
 	// This should be addressed in a follow up PR soon.
 	// Copying it over nearly verbatim, as TTS backends are not functional without this.
 	modelPath := ""
-	// Checking first that it exists and is not outside ModelPath
+	if modelFile != "" {
-	// TODO: we should actually first check if the modelFile is looking like
+		// If the model file is not empty, we pass it joined with the model path
-	// a FS path
+		// Checking first that it exists and is not outside ModelPath
-	mp := filepath.Join(loader.ModelPath, backendConfig.Model)
+		// TODO: we should actually first check if the modelFile is looking like
-	if _, err := os.Stat(mp); err == nil {
+		// a FS path
-		if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
+		mp := filepath.Join(loader.ModelPath, modelFile)
-			return "", nil, err
+		if _, err := os.Stat(mp); err == nil {
 			if err := utils.VerifyPath(mp, appConfig.ModelPath); err != nil {
 				return "", nil, err
 			}
 			modelPath = mp
 		} else {
 			modelPath = modelFile
 		}
 		modelPath = mp
 	} else {
 		modelPath = backendConfig.Model // skip this step if it fails?????
 	}
 	res, err := ttsModel.TTS(context.Background(), &proto.TTSRequest{
--- a/core/backend/vad.go
+++ b/core/backend/vad.go
@@ -1,38 +0,0 @@
 package backend
 import (
 	"context"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 func VAD(request *schema.VADRequest,
 	ctx context.Context,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
 	backendConfig config.BackendConfig) (*schema.VADResponse, error) {
 	opts := ModelOptions(backendConfig, appConfig)
 	vadModel, err := ml.Load(opts...)
 	if err != nil {
 		return nil, err
 	}
 	req := proto.VADRequest{
 		Audio: request.Audio,
 	}
 	resp, err := vadModel.VAD(ctx, &req)
 	if err != nil {
 		return nil, err
 	}
 	segments := []schema.VADSegment{}
 	for _, s := range resp.Segments {
 		segments = append(segments, schema.VADSegment{Start: s.Start, End: s.End})
 	}
 	return &schema.VADResponse{
 		Segments: segments,
 	}, nil
 }
--- a/core/cli/models.go
+++ b/core/cli/models.go
@@ -100,7 +100,7 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
 			log.Info().Str("model", modelName).Str("license", model.License).Msg("installing model")
 		}
-		err = startup.InstallModels(galleries, mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
+		err = startup.InstallModels(galleries, "", mi.ModelsPath, !mi.DisablePredownloadScan, progressCallback, modelName)
 		if err != nil {
 			return err
 		}
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -32,6 +32,7 @@ type RunCMD struct {
 	Galleries           string   `env:"LOCALAI_GALLERIES,GALLERIES" help:"JSON list of galleries" group:"models" default:"${galleries}"`
 	AutoloadGalleries   bool     `env:"LOCALAI_AUTOLOAD_GALLERIES,AUTOLOAD_GALLERIES" group:"models"`
 	RemoteLibrary       string   `env:"LOCALAI_REMOTE_LIBRARY,REMOTE_LIBRARY" default:"${remoteLibraryURL}" help:"A LocalAI remote library URL" group:"models"`
 	PreloadModels       string   `env:"LOCALAI_PRELOAD_MODELS,PRELOAD_MODELS" help:"A List of models to apply in JSON at start" group:"models"`
 	Models              []string `env:"LOCALAI_MODELS,MODELS" help:"A List of model configuration URLs to load" group:"models"`
 	PreloadModelsConfig string   `env:"LOCALAI_PRELOAD_MODELS_CONFIG,PRELOAD_MODELS_CONFIG" help:"A List of models to apply at startup. Path to a YAML config file" group:"models"`
@@ -89,6 +90,7 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithDynamicConfigDirPollInterval(r.LocalaiConfigDirPollInterval),
 		config.WithF16(r.F16),
 		config.WithStringGalleries(r.Galleries),
 		config.WithModelLibraryURL(r.RemoteLibrary),
 		config.WithCors(r.CORS),
 		config.WithCorsAllowOrigins(r.CORSAllowOrigins),
 		config.WithCsrf(r.CSRF),
--- a/core/cli/soundgeneration.go
+++ b/core/cli/soundgeneration.go
@@ -86,14 +86,13 @@ func (t *SoundGenerationCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
 	options.Backend = t.Backend
 	options.Model = t.Model
 	var inputFile *string
 	if t.InputFile != "" {
 		inputFile = &t.InputFile
 	}
-	filePath, _, err := backend.SoundGeneration(text,
+	filePath, _, err := backend.SoundGeneration(t.Model, text,
 		parseToFloat32Ptr(t.Duration), parseToFloat32Ptr(t.Temperature), &t.DoSample,
 		inputFile, parseToInt32Ptr(t.InputFileSampleDivisor), ml, opts, options)
--- a/core/cli/tts.go
+++ b/core/cli/tts.go
@@ -52,10 +52,8 @@ func (t *TTSCMD) Run(ctx *cliContext.Context) error {
 	options := config.BackendConfig{}
 	options.SetDefaults()
 	options.Backend = t.Backend
 	options.Model = t.Model
-	filePath, _, err := backend.ModelTTS(text, t.Voice, t.Language, ml, opts, options)
+	filePath, _, err := backend.ModelTTS(t.Backend, text, t.Model, t.Voice, t.Language, ml, opts, options)
 	if err != nil {
 		return err
 	}
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -44,6 +44,8 @@ type ApplicationConfig struct {
 	DisableGalleryEndpoint             bool
 	LoadToMemory                       []string
 	ModelLibraryURL string
 	Galleries []Gallery
 	BackendAssets     embed.FS
@@ -124,6 +126,12 @@ func WithP2PToken(s string) AppOption {
 	}
 }
 // WithModelLibraryURL returns an AppOption that stores url in the
 // ApplicationConfig's ModelLibraryURL field.
 func WithModelLibraryURL(url string) AppOption {
 	return func(cfg *ApplicationConfig) {
 		cfg.ModelLibraryURL = url
 	}
 }
 func WithLibPath(path string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.LibPath = path
--- a/core/config/backend_config.go
+++ b/core/config/backend_config.go
@@ -287,8 +287,7 @@ func (cfg *BackendConfig) SetDefaults(opts ...ConfigLoaderOption) {
 	defaultTopP := 0.95
 	defaultTopK := 40
 	defaultTemp := 0.9
-	// https://github.com/mudler/LocalAI/issues/2780
+	defaultMirostat := 2
 	defaultMirostat := 0
 	defaultMirostatTAU := 5.0
 	defaultMirostatETA := 0.1
 	defaultTypicalP := 1.0
@@ -437,21 +436,19 @@ func (c *BackendConfig) HasTemplate() bool {
 type BackendConfigUsecases int
 const (
-	FLAG_ANY              BackendConfigUsecases = 0b00000000000
+	FLAG_ANY              BackendConfigUsecases = 0b000000000
-	FLAG_CHAT             BackendConfigUsecases = 0b00000000001
+	FLAG_CHAT             BackendConfigUsecases = 0b000000001
-	FLAG_COMPLETION       BackendConfigUsecases = 0b00000000010
+	FLAG_COMPLETION       BackendConfigUsecases = 0b000000010
-	FLAG_EDIT             BackendConfigUsecases = 0b00000000100
+	FLAG_EDIT             BackendConfigUsecases = 0b000000100
-	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b00000001000
+	FLAG_EMBEDDINGS       BackendConfigUsecases = 0b000001000
-	FLAG_RERANK           BackendConfigUsecases = 0b00000010000
+	FLAG_RERANK           BackendConfigUsecases = 0b000010000
-	FLAG_IMAGE            BackendConfigUsecases = 0b00000100000
+	FLAG_IMAGE            BackendConfigUsecases = 0b000100000
-	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b00001000000
+	FLAG_TRANSCRIPT       BackendConfigUsecases = 0b001000000
-	FLAG_TTS              BackendConfigUsecases = 0b00010000000
+	FLAG_TTS              BackendConfigUsecases = 0b010000000
-	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b00100000000
+	FLAG_SOUND_GENERATION BackendConfigUsecases = 0b100000000
 	FLAG_TOKENIZE         BackendConfigUsecases = 0b01000000000
 	FLAG_VAD              BackendConfigUsecases = 0b10000000000
 	// Common Subsets
-	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT
+	FLAG_LLM BackendConfigUsecases = FLAG_CHAT | FLAG_COMPLETION | FLAG_EDIT // union of the LLM usecases; '&' of distinct single-bit flags would be 0, i.e. FLAG_ANY
 )
 func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
@@ -466,8 +463,6 @@ func GetAllBackendConfigUsecases() map[string]BackendConfigUsecases {
 		"FLAG_TRANSCRIPT":       FLAG_TRANSCRIPT,
 		"FLAG_TTS":              FLAG_TTS,
 		"FLAG_SOUND_GENERATION": FLAG_SOUND_GENERATION,
 		"FLAG_TOKENIZE":         FLAG_TOKENIZE,
 		"FLAG_VAD":              FLAG_VAD,
 		"FLAG_LLM":              FLAG_LLM,
 	}
 }
@@ -553,18 +548,5 @@ func (c *BackendConfig) GuessUsecases(u BackendConfigUsecases) bool {
 		}
 	}
 	if (u & FLAG_TOKENIZE) == FLAG_TOKENIZE {
 		tokenizeCapableBackends := []string{"llama.cpp", "rwkv"}
 		if !slices.Contains(tokenizeCapableBackends, c.Backend) {
 			return false
 		}
 	}
 	if (u & FLAG_VAD) == FLAG_VAD {
 		if c.Backend != "silero-vad" {
 			return false
 		}
 	}
 	return true
 }
--- a/core/config/backend_config_loader.go
+++ b/core/config/backend_config_loader.go
@@ -81,10 +81,10 @@ func readMultipleBackendConfigsFromFile(file string, opts ...ConfigLoaderOption)
 	c := &[]*BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
-		return nil, fmt.Errorf("readMultipleBackendConfigsFromFile cannot read config file %q: %w", file, err)
+		return nil, fmt.Errorf("cannot read config file: %w", err)
 	}
 	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readMultipleBackendConfigsFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}
 	for _, cc := range *c {
@@ -101,10 +101,10 @@ func readBackendConfigFromFile(file string, opts ...ConfigLoaderOption) (*Backen
 	c := &BackendConfig{}
 	f, err := os.ReadFile(file)
 	if err != nil {
-		return nil, fmt.Errorf("readBackendConfigFromFile cannot read config file %q: %w", file, err)
+		return nil, fmt.Errorf("cannot read config file: %w", err)
 	}
 	if err := yaml.Unmarshal(f, c); err != nil {
-		return nil, fmt.Errorf("readBackendConfigFromFile cannot unmarshal config file %q: %w", file, err)
+		return nil, fmt.Errorf("cannot unmarshal config file: %w", err)
 	}
 	c.SetDefaults(opts...)
@@ -117,9 +117,7 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
 	// Load a config file if present after the model name
 	cfg := &BackendConfig{
 		PredictionOptions: schema.PredictionOptions{
-			BasicModelRequest: schema.BasicModelRequest{
+			Model: modelName,
 				Model: modelName,
 			},
 		},
 	}
@@ -147,15 +145,6 @@ func (bcl *BackendConfigLoader) LoadBackendConfigFileByName(modelName, modelPath
 	return cfg, nil
 }
 func (bcl *BackendConfigLoader) LoadBackendConfigFileByNameDefaultOptions(modelName string, appConfig *ApplicationConfig) (*BackendConfig, error) {
 	return bcl.LoadBackendConfigFileByName(modelName, appConfig.ModelPath,
 		LoadOptionDebug(appConfig.Debug),
 		LoadOptionThreads(appConfig.Threads),
 		LoadOptionContextSize(appConfig.ContextSize),
 		LoadOptionF16(appConfig.F16),
 		ModelPath(appConfig.ModelPath))
 }
 // This format is currently only used when reading a single file at startup, passed in via ApplicationConfig.ConfigFile
 func (bcl *BackendConfigLoader) LoadMultipleBackendConfigsSingleFile(file string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
@@ -178,7 +167,7 @@ func (bcl *BackendConfigLoader) LoadBackendConfig(file string, opts ...ConfigLoa
 	defer bcl.Unlock()
 	c, err := readBackendConfigFromFile(file, opts...)
 	if err != nil {
-		return fmt.Errorf("LoadBackendConfig cannot read config file %q: %w", file, err)
+		return fmt.Errorf("cannot read config file: %w", err)
 	}
 	if c.Validate() {
@@ -335,10 +324,9 @@ func (bcl *BackendConfigLoader) Preload(modelPath string) error {
 func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...ConfigLoaderOption) error {
 	bcl.Lock()
 	defer bcl.Unlock()
 	entries, err := os.ReadDir(path)
 	if err != nil {
-		return fmt.Errorf("LoadBackendConfigsFromPath cannot read directory '%s': %w", path, err)
+		return fmt.Errorf("cannot read directory '%s': %w", path, err)
 	}
 	files := make([]fs.FileInfo, 0, len(entries))
 	for _, entry := range entries {
@@ -356,13 +344,13 @@ func (bcl *BackendConfigLoader) LoadBackendConfigsFromPath(path string, opts ...
 		}
 		c, err := readBackendConfigFromFile(filepath.Join(path, file.Name()), opts...)
 		if err != nil {
-			log.Error().Err(err).Str("File Name", file.Name()).Msgf("LoadBackendConfigsFromPath cannot read config file")
+			log.Error().Err(err).Msgf("cannot read config file: %s", file.Name())
 			continue
 		}
 		if c.Validate() {
 			bcl.configs[c.Name] = *c
 		} else {
-			log.Error().Err(err).Str("Name", c.Name).Msgf("config is not valid")
+			log.Error().Err(err).Msgf("config is not valid")
 		}
 	}
--- a/core/config/backend_config_test.go
+++ b/core/config/backend_config_test.go
@@ -48,9 +48,9 @@ parameters:
 			Expect(config.Name).To(Equal("bar-baz"))
 			Expect(config.Validate()).To(BeTrue())
-			// download https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml
+			// download https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml
 			httpClient := http.Client{}
-			resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml")
+			resp, err := httpClient.Get("https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml")
 			Expect(err).To(BeNil())
 			defer resp.Body.Close()
 			tmp, err = os.CreateTemp("", "config.yaml")
--- a/core/config/guesser.go
+++ b/core/config/guesser.go
@@ -161,11 +161,10 @@ func guessDefaultsFromFile(cfg *BackendConfig, modelPath string) {
 	}
 	// We try to guess only if we don't have a template defined already
-	guessPath := filepath.Join(modelPath, cfg.ModelFileName())
+	f, err := gguf.ParseGGUFFile(filepath.Join(modelPath, cfg.ModelFileName()))
 	f, err := gguf.ParseGGUFFile(guessPath)
 	if err != nil {
 		// Only valid for gguf files
-		log.Debug().Str("filePath", guessPath).Msg("guessDefaultsFromFile: not a GGUF file")
+		log.Debug().Msgf("guessDefaultsFromFile: %s", "not a GGUF file")
 		return
 	}
--- a/core/gallery/models_test.go
+++ b/core/gallery/models_test.go
@@ -48,10 +48,8 @@ var _ = Describe("Model test", func() {
 			defer os.RemoveAll(tempdir)
 			gallery := []GalleryModel{{
-				Metadata: Metadata{
+				Name: "bert",
-					Name: "bert",
+				URL:  bertEmbeddingsURL,
 					URL:  bertEmbeddingsURL,
 				},
 			}}
 			out, err := yaml.Marshal(gallery)
 			Expect(err).ToNot(HaveOccurred())
--- a/core/gallery/request.go
+++ b/core/gallery/request.go
@@ -11,14 +11,6 @@ import (
 // It is used to install the model by resolving the URL and downloading the files.
 // The other fields are used to override the configuration of the model.
 type GalleryModel struct {
 	Metadata `json:",inline" yaml:",inline"`
 	// config_file is read in the situation where URL is blank - and therefore this is a base config.
 	ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
 	// Overrides are used to override the configuration of the model located at URL
 	Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
 }
 type Metadata struct {
 	URL         string   `json:"url,omitempty" yaml:"url,omitempty"`
 	Name        string   `json:"name,omitempty" yaml:"name,omitempty"`
 	Description string   `json:"description,omitempty"  yaml:"description,omitempty"`
@@ -26,6 +18,10 @@ type Metadata struct {
 	URLs        []string `json:"urls,omitempty" yaml:"urls,omitempty"`
 	Icon        string   `json:"icon,omitempty" yaml:"icon,omitempty"`
 	Tags        []string `json:"tags,omitempty" yaml:"tags,omitempty"`
 	// config_file is read in the situation where URL is blank - and therefore this is a base config.
 	ConfigFile map[string]interface{} `json:"config_file,omitempty" yaml:"config_file,omitempty"`
 	// Overrides are used to override the configuration of the model located at URL
 	Overrides map[string]interface{} `json:"overrides,omitempty" yaml:"overrides,omitempty"`
 	// AdditionalFiles are used to add additional files to the model
 	AdditionalFiles []File `json:"files,omitempty" yaml:"files,omitempty"`
 	// Gallery is a reference to the gallery which contains the model
--- a/core/gallery/request_test.go
+++ b/core/gallery/request_test.go
@@ -9,11 +9,7 @@ import (
 var _ = Describe("Gallery API tests", func() {
 	Context("requests", func() {
 		It("parses github with a branch", func() {
-			req := GalleryModel{
+			req := GalleryModel{URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main"}
 				Metadata: Metadata{
 					URL: "github:go-skynet/model-gallery/gpt4all-j.yaml@main",
 				},
 			}
 			e, err := GetGalleryConfigFromURL(req.URL, "")
 			Expect(err).ToNot(HaveOccurred())
 			Expect(e.Name).To(Equal("gpt4all-j"))
--- a/core/http/app.go
+++ b/core/http/app.go
@@ -130,6 +130,7 @@ func API(application *application.Application) (*fiber.App, error) {
 				return metricsService.Shutdown()
 			})
 		}
 	}
 	// Health Checks should always be exempt from auth, so register these first
 	routes.HealthRoutes(router)
@@ -166,15 +167,13 @@ func API(application *application.Application) (*fiber.App, error) {
 	galleryService := services.NewGalleryService(application.ApplicationConfig())
 	galleryService.Start(application.ApplicationConfig().Context, application.BackendLoader())
-	requestExtractor := middleware.NewRequestExtractor(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
+	routes.RegisterElevenLabsRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
-
+	routes.RegisterLocalAIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
-	routes.RegisterElevenLabsRoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
+	routes.RegisterOpenAIRoutes(router, application)
 	routes.RegisterLocalAIRoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
 	routes.RegisterOpenAIRoutes(router, requestExtractor, application)
 	if !application.ApplicationConfig().DisableWebUI {
 		routes.RegisterUIRoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig(), galleryService)
 	}
-	routes.RegisterJINARoutes(router, requestExtractor, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
+	routes.RegisterJINARoutes(router, application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig())
 	httpFS := http.FS(embedDirStatic)
--- a/core/http/app_test.go
+++ b/core/http/app_test.go
@@ -299,18 +299,14 @@ var _ = Describe("API test", func() {
 			g := []gallery.GalleryModel{
 				{
-					Metadata: gallery.Metadata{
+					Name: "bert",
-						Name: "bert",
+					URL:  bertEmbeddingsURL,
 						URL:  bertEmbeddingsURL,
 					},
 				},
 				{
-					Metadata: gallery.Metadata{
+					Name:            "bert2",
-						Name:            "bert2",
+					URL:             bertEmbeddingsURL,
-						URL:             bertEmbeddingsURL,
+					Overrides:       map[string]interface{}{"foo": "bar"},
-						AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
+					AdditionalFiles: []gallery.File{{Filename: "foo.yaml", URI: bertEmbeddingsURL}},
 					},
 					Overrides: map[string]interface{}{"foo": "bar"},
 				},
 			}
 			out, err := yaml.Marshal(g)
@@ -480,7 +476,7 @@ var _ = Describe("API test", func() {
 			})
 			It("apply models from config", func() {
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
+					ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
 				})
 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
@@ -526,6 +522,77 @@ var _ = Describe("API test", func() {
 				Expect(content["usage"]).To(ContainSubstring("You can test this model with curl like this"))
 			})
 			It("runs openllama(llama-ggml backend)", Label("llama"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
 				}
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
 					URL:       "github:go-skynet/model-gallery/openllama_3b.yaml",
 					Name:      "openllama_3b",
 					Overrides: map[string]interface{}{"backend": "llama-ggml", "mmap": true, "f16": true, "context_size": 128},
 				})
 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
 				uuid := response["uuid"].(string)
 				Eventually(func() bool {
 					response := getModelStatus("http://127.0.0.1:9090/models/jobs/" + uuid)
 					return response["processed"].(bool)
 				}, "360s", "10s").Should(Equal(true))
 				By("testing completion")
 				resp, err := client.CreateCompletion(context.TODO(), openai.CompletionRequest{Model: "openllama_3b", Prompt: "Count up to five: one, two, three, four, "})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp.Choices)).To(Equal(1))
 				Expect(resp.Choices[0].Text).To(ContainSubstring("five"))
 				By("testing functions")
 				resp2, err := client.CreateChatCompletion(
 					context.TODO(),
 					openai.ChatCompletionRequest{
 						Model: "openllama_3b",
 						Messages: []openai.ChatCompletionMessage{
 							{
 								Role:    "user",
 								Content: "What is the weather like in San Francisco (celsius)?",
 							},
 						},
 						Functions: []openai.FunctionDefinition{
 							openai.FunctionDefinition{
 								Name:        "get_current_weather",
 								Description: "Get the current weather",
 								Parameters: jsonschema.Definition{
 									Type: jsonschema.Object,
 									Properties: map[string]jsonschema.Definition{
 										"location": {
 											Type:        jsonschema.String,
 											Description: "The city and state, e.g. San Francisco, CA",
 										},
 										"unit": {
 											Type: jsonschema.String,
 											Enum: []string{"celcius", "fahrenheit"},
 										},
 									},
 									Required: []string{"location"},
 								},
 							},
 						},
 					})
 				Expect(err).ToNot(HaveOccurred())
 				Expect(len(resp2.Choices)).To(Equal(1))
 				Expect(resp2.Choices[0].Message.FunctionCall).ToNot(BeNil())
 				Expect(resp2.Choices[0].Message.FunctionCall.Name).To(Equal("get_current_weather"), resp2.Choices[0].Message.FunctionCall.Name)
 				var res map[string]string
 				err = json.Unmarshal([]byte(resp2.Choices[0].Message.FunctionCall.Arguments), &res)
 				Expect(err).ToNot(HaveOccurred())
 				Expect(res["location"]).To(ContainSubstring("San Francisco"), fmt.Sprint(res))
 				Expect(res["unit"]).To(Equal("celcius"), fmt.Sprint(res))
 				Expect(string(resp2.Choices[0].FinishReason)).To(Equal("function_call"), fmt.Sprint(resp2.Choices[0].FinishReason))
 			})
 			It("runs openllama gguf(llama-cpp)", Label("llama-gguf"), func() {
 				if runtime.GOOS != "linux" {
 					Skip("test supported only on linux")
@@ -533,7 +600,7 @@ var _ = Describe("API test", func() {
 				modelName := "hermes-2-pro-mistral"
 				response := postModelApplyRequest("http://127.0.0.1:9090/models/apply", modelApplyRequest{
-					ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/v2.25.0/embedded/models/hermes-2-pro-mistral.yaml",
+					ConfigURL: "https://raw.githubusercontent.com/mudler/LocalAI/master/embedded/models/hermes-2-pro-mistral.yaml",
 				})
 				Expect(response["uuid"]).ToNot(BeEmpty(), fmt.Sprint(response))
--- a/core/http/ctx/fiber.go
+++ b/core/http/ctx/fiber.go
@@ -0,0 +1,47 @@
 package fiberContext
 import (
 	"fmt"
 	"strings"
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // ModelFromContext resolves the model name to use for the current request.
 //
 // modelInput is the model name received from the user request body. It is
 // overridden by the "model" route parameter and then by the "model" query
 // parameter when those are non-empty. If the Authorization header carries a
 // bearer token that names a model existing in the model path, that model
 // takes precedence over every other source.
 //
 // When no model could be determined and firstModel is true, the first model
 // returned by services.ListModels is used; if none is available an error is
 // returned. When firstModel is false the (possibly empty) modelInput is
 // returned as-is.
 func ModelFromContext(ctx *fiber.Ctx, cl *config.BackendConfigLoader, loader *model.ModelLoader, modelInput string, firstModel bool) (string, error) {
 	if ctx.Params("model") != "" {
 		modelInput = ctx.Params("model")
 	}
 	if ctx.Query("model") != "" {
 		modelInput = ctx.Query("model")
 	}
 	// Set model from bearer token, if available.
 	// NOTE: strings.TrimLeft takes a *cutset*, so TrimLeft(h, "Bear ") would also
 	// eat leading 'B', 'e', 'a', 'r' and ' ' characters of the token itself
 	// (e.g. "Bearer eagle" -> "gle"). TrimPrefix removes only the scheme.
 	bearer := strings.TrimSpace(strings.TrimPrefix(ctx.Get("authorization"), "Bearer "))
 	bearerExists := bearer != "" && loader.ExistsInModelPath(bearer)
 	// If no model was specified, take the first available
 	if modelInput == "" && !bearerExists && firstModel {
 		models, err := services.ListModels(cl, loader, config.NoFilterFn, services.SKIP_IF_CONFIGURED)
 		if err != nil {
 			// Best effort: keep going and report "no model specified" below if
 			// nothing usable was listed, but do not hide the underlying cause.
 			log.Debug().Err(err).Msg("No model specified, listing available models failed")
 		}
 		if len(models) > 0 {
 			modelInput = models[0]
 			log.Debug().Msgf("No model specified, using: %s", modelInput)
 		} else {
 			log.Debug().Msgf("No model specified, returning error")
 			return "", fmt.Errorf("no model specified")
 		}
 	}
 	// If a model is found in bearer token takes precedence
 	if bearerExists {
 		log.Debug().Msgf("Using model from bearer token: %s", bearer)
 		modelInput = bearer
 	}
 	return modelInput, nil
 }
--- a/core/http/endpoints/elevenlabs/soundgeneration.go
+++ b/core/http/endpoints/elevenlabs/soundgeneration.go
@@ -4,7 +4,7 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
@@ -17,21 +17,45 @@ import (
 // @Router /v1/sound-generation [post]
 func SoundGenerationEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-
+		input := new(schema.ElevenLabsSoundGenerationRequest)
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.ElevenLabsSoundGenerationRequest)
+		// Get input data from the request body
-		if !ok || input.ModelID == "" {
+		if err := c.BodyParser(input); err != nil {
-			return fiber.ErrBadRequest
+			return err
 		}
-		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false)
-		if !ok || cfg == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			modelFile = input.ModelID
 			log.Warn().Str("ModelID", input.ModelID).Msg("Model not found in context")
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.ModelID
 			log.Warn().Str("Request ModelID", input.ModelID).Err(err).Msg("error during LoadBackendConfigFileByName, using request ModelID")
 		} else {
 			if input.ModelID != "" {
 				modelFile = input.ModelID
 			} else {
 				modelFile = cfg.Model
 			}
 		}
 		log.Debug().Str("modelFile", "modelFile").Str("backend", cfg.Backend).Msg("Sound Generation Request about to be sent to backend")
 		if input.Duration != nil {
 			log.Debug().Float32("duration", *input.Duration).Msg("duration set")
 		}
 		if input.Temperature != nil {
 			log.Debug().Float32("temperature", *input.Temperature).Msg("temperature set")
 		}
 		// TODO: Support uploading files?
-		filePath, _, err := backend.SoundGeneration(input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
+		filePath, _, err := backend.SoundGeneration(modelFile, input.Text, input.Duration, input.Temperature, input.DoSample, nil, nil, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/elevenlabs/tts.go
+++ b/core/http/endpoints/elevenlabs/tts.go
@@ -3,7 +3,7 @@ package elevenlabs
 import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
@@ -20,21 +20,39 @@ import (
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input := new(schema.ElevenLabsTTSRequest)
 		voiceID := c.Params("voice-id")
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.ElevenLabsTTSRequest)
+		// Get input data from the request body
-		if !ok || input.ModelID == "" {
+		if err := c.BodyParser(input); err != nil {
-			return fiber.ErrBadRequest
+			return err
 		}
-		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.ModelID, false)
-		if !ok || cfg == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			modelFile = input.ModelID
 			log.Warn().Msgf("Model not found in context: %s", input.ModelID)
 		}
-		log.Debug().Str("modelName", input.ModelID).Msg("elevenlabs TTS request recieved")
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.ModelID
 			log.Warn().Msgf("Model not found in context: %s", input.ModelID)
 		} else {
 			if input.ModelID != "" {
 				modelFile = input.ModelID
 			} else {
 				modelFile = cfg.Model
 			}
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
-		filePath, _, err := backend.ModelTTS(input.Text, voiceID, input.LanguageCode, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Text, modelFile, "", voiceID, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/jina/rerank.go
+++ b/core/http/endpoints/jina/rerank.go
@@ -3,9 +3,9 @@ package jina
 import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/gofiber/fiber/v2"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
@@ -19,32 +19,58 @@ import (
 // @Router /v1/rerank [post]
 func JINARerankEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-
+		req := new(schema.JINARerankRequest)
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.JINARerankRequest)
+		if err := c.BodyParser(req); err != nil {
-		if !ok || input.Model == "" {
+			return c.Status(fiber.StatusBadRequest).JSON(fiber.Map{
-			return fiber.ErrBadRequest
+				"error": "Cannot parse JSON",
 			})
 		}
-		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		input := new(schema.TTSRequest)
-		if !ok || cfg == nil {
+
-			return fiber.ErrBadRequest
+		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
-		log.Debug().Str("model", input.Model).Msg("JINA Rerank Request recieved")
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		if input.Backend != "" {
 			cfg.Backend = input.Backend
 		}
 		request := &proto.RerankRequest{
-			Query:     input.Query,
+			Query:     req.Query,
-			TopN:      int32(input.TopN),
+			TopN:      int32(req.TopN),
-			Documents: input.Documents,
+			Documents: req.Documents,
 		}
-		results, err := backend.Rerank(request, ml, appConfig, *cfg)
+		results, err := backend.Rerank(modelFile, request, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
 		response := &schema.JINARerankResponse{
-			Model: input.Model,
+			Model: req.Model,
 		}
 		for _, r := range results.Results {
--- a/core/http/endpoints/localai/gallery.go
+++ b/core/http/endpoints/localai/gallery.go
@@ -117,25 +117,19 @@ func (mgs *ModelGalleryEndpointService) DeleteModelGalleryEndpoint() func(c *fib
 // @Router /models/available [get]
 func (mgs *ModelGalleryEndpointService) ListModelFromGalleryEndpoint() func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		log.Debug().Msgf("Listing models from galleries: %+v", mgs.galleries)
 		models, err := gallery.AvailableGalleryModels(mgs.galleries, mgs.modelPath)
 		if err != nil {
 			return err
 		}
-
+		log.Debug().Msgf("Models found from galleries: %+v", models)
-		log.Debug().Msgf("Available %d models from %d galleries\n", len(models), len(mgs.galleries))
+		for _, m := range models {
-
+			log.Debug().Msgf("Model found from galleries: %+v", m)
 		m := []gallery.Metadata{}
 		for _, mm := range models {
 			m = append(m, mm.Metadata)
 		}
-
+		dat, err := json.Marshal(models)
 		log.Debug().Msgf("Models %#v", m)
 		dat, err := json.Marshal(m)
 		if err != nil {
-			return fmt.Errorf("could not marshal models: %w", err)
+			return err
 		}
 		return c.Send(dat)
 	}
--- a/core/http/endpoints/localai/get_token_metrics.go
+++ b/core/http/endpoints/localai/get_token_metrics.go
@@ -4,15 +4,13 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/rs/zerolog/log"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 // TODO: This is not yet in use. Needs middleware rework, since it is not referenced.
 // TokenMetricsEndpoint is an endpoint to get TokensProcessed Per Second for Active SlotID
 //
 //	@Summary	Get TokenMetrics for Active Slot.
@@ -31,13 +29,18 @@ func TokenMetricsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader,
 			return err
 		}
-		modelFile, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
-		if !ok || modelFile != "" {
+		if err != nil {
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
-		cfg, err := cl.LoadBackendConfigFileByNameDefaultOptions(modelFile, appConfig)
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			log.Err(err)
--- a/core/http/endpoints/localai/tokenize.go
+++ b/core/http/endpoints/localai/tokenize.go
@@ -4,32 +4,55 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
 // TokenizeEndpoint exposes a REST API to tokenize the content
 // @Summary Tokenize the input.
 // @Param request body schema.TokenizeRequest true "Request"
 // @Success 200 {object} schema.TokenizeResponse "Response"
 // @Router /v1/tokenize [post]
 func TokenizeEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
-	return func(ctx *fiber.Ctx) error {
+	return func(c *fiber.Ctx) error {
-		input, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.TokenizeRequest)
+
-		if !ok || input.Model == "" {
+		input := new(schema.TokenizeRequest)
-			return fiber.ErrBadRequest
+
 		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
-		cfg, ok := ctx.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
-		if !ok || cfg == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
 		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
 		if err != nil {
 			log.Err(err)
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		tokenResponse, err := backend.ModelTokenize(input.Content, ml, *cfg, appConfig)
 		if err != nil {
 			return err
 		}
-		return ctx.JSON(tokenResponse)
+
 		c.JSON(tokenResponse)
 		return nil
 	}
 }
--- a/core/http/endpoints/localai/tts.go
+++ b/core/http/endpoints/localai/tts.go
@@ -3,7 +3,7 @@ package localai
 import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
@@ -24,24 +24,37 @@ import (
 //		@Router		/tts [post]
 func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.TTSRequest)
+		input := new(schema.TTSRequest)
-		if !ok || input.Model == "" {
+
-			return fiber.ErrBadRequest
+		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
-		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
-		if !ok || cfg == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
-		log.Debug().Str("model", input.Model).Msg("LocalAI TTS Request recieved")
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
-		if cfg.Backend == "" {
+		if err != nil {
-			if input.Backend != "" {
+			log.Err(err)
-				cfg.Backend = input.Backend
+			modelFile = input.Model
-			} else {
+			log.Warn().Msgf("Model not found in context: %s", input.Model)
-				cfg.Backend = model.PiperBackend
+		} else {
-			}
+			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		if input.Backend != "" {
 			cfg.Backend = input.Backend
 		}
 		if input.Language != "" {
@@ -52,7 +65,7 @@ func TTSEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfi
 			cfg.Voice = input.Voice
 		}
-		filePath, _, err := backend.ModelTTS(input.Input, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
+		filePath, _, err := backend.ModelTTS(cfg.Backend, input.Input, modelFile, cfg.Voice, cfg.Language, ml, appConfig, *cfg)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/localai/vad.go
+++ b/core/http/endpoints/localai/vad.go
@@ -4,8 +4,9 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
-	"github.com/mudler/LocalAI/core/http/middleware"
+	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/grpc/proto"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 )
@@ -18,20 +19,45 @@ import (
 // @Router		/vad [post]
 func VADEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.VADRequest)
+		input := new(schema.VADRequest)
-		if !ok || input.Model == "" {
+
-			return fiber.ErrBadRequest
+		// Get input data from the request body
 		if err := c.BodyParser(input); err != nil {
 			return err
 		}
-		cfg, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, false)
-		if !ok || cfg == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		}
-		log.Debug().Str("model", input.Model).Msg("LocalAI VAD Request recieved")
+		cfg, err := cl.LoadBackendConfigFileByName(modelFile, appConfig.ModelPath,
 			config.LoadOptionDebug(appConfig.Debug),
 			config.LoadOptionThreads(appConfig.Threads),
 			config.LoadOptionContextSize(appConfig.ContextSize),
 			config.LoadOptionF16(appConfig.F16),
 		)
-		resp, err := backend.VAD(input, c.Context(), ml, appConfig, *cfg)
+		if err != nil {
 			log.Err(err)
 			modelFile = input.Model
 			log.Warn().Msgf("Model not found in context: %s", input.Model)
 		} else {
 			modelFile = cfg.Model
 		}
 		log.Debug().Msgf("Request for model: %s", modelFile)
 		opts := backend.ModelOptions(*cfg, appConfig, model.WithBackendString(cfg.Backend), model.WithModel(modelFile))
 		vadModel, err := ml.Load(opts...)
 		if err != nil {
 			return err
 		}
 		req := proto.VADRequest{
 			Audio: input.Audio,
 		}
 		resp, err := vadModel.VAD(c.Context(), &req)
 		if err != nil {
 			return err
 		}
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -5,19 +5,18 @@ import (
 	"bytes"
 	"encoding/json"
 	"fmt"
 	"strings"
 	"time"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
 )
@@ -175,20 +174,26 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 		textContentToReturn = ""
 		id = uuid.New().String()
 		created = int(time.Now().Unix())
-		
+		// Set CorrelationID
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		correlationID := c.Get("X-Correlation-ID")
-		if !ok || input.Model == "" {
+		if len(strings.TrimSpace(correlationID)) == 0 {
-			return fiber.ErrBadRequest
+			correlationID = id
 		}
 		c.Set("X-Correlation-ID", correlationID)
 		// Opt-in extra usage flag
 		extraUsage := c.Get("Extra-Usage", "") != ""
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, input, err := readRequest(c, cl, ml, startupOptions, true)
-		if !ok || config == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
-		log.Debug().Msgf("Chat endpoint configuration read: %+v", config)
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, startupOptions.Debug, startupOptions.Threads, startupOptions.ContextSize, startupOptions.F16)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		log.Debug().Msgf("Configuration read: %+v", config)
 		funcs := input.Functions
 		shouldUseFn := len(input.Functions) > 0 && config.ShouldUseFunctions()
@@ -396,11 +401,6 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 				log.Debug().Msgf("Text content to return: %s", textContentToReturn)
 				noActionsToRun := len(results) > 0 && results[0].Name == noActionName || len(results) == 0
 				finishReason := "stop"
 				if len(input.Tools) > 0 {
 					finishReason = "tool_calls"
 				}
 				switch {
 				case noActionsToRun:
 					result, err := handleQuestion(config, input, ml, startupOptions, results, s, predInput)
@@ -408,18 +408,19 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 						log.Error().Err(err).Msg("error handling question")
 						return
 					}
 					*c = append(*c, schema.Choice{
-						FinishReason: finishReason,
+						Message: &schema.Message{Role: "assistant", Content: &result}})
 						Message:      &schema.Message{Role: "assistant", Content: &result}})
 				default:
 					toolChoice := schema.Choice{
 						FinishReason: finishReason,
 						Message: &schema.Message{
 							Role: "assistant",
 						},
 					}
 					if len(input.Tools) > 0 {
 						toolChoice.FinishReason = "tool_calls"
 					}
 					for _, ss := range results {
 						name, args := ss.Name, ss.Arguments
 						if len(input.Tools) > 0 {
@@ -437,7 +438,7 @@ func ChatEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluat
 								},
 							)
 						} else {
-							// otherwise we return more choices directly (deprecated)
+							// otherwise we return more choices directly
 							*c = append(*c, schema.Choice{
 								FinishReason: "function_call",
 								Message: &schema.Message{
@@ -538,7 +539,7 @@ func handleQuestion(config *config.BackendConfig, input *schema.OpenAIRequest, m
 		audios = append(audios, m.StringAudios...)
 	}
-	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, config, o, nil)
+	predFunc, err := backend.ModelInference(input.Context, prompt, input.Messages, images, videos, audios, ml, *config, o, nil)
 	if err != nil {
 		log.Error().Err(err).Msg("model inference failed")
 		return "", err
--- a/core/http/endpoints/openai/completion.go
+++ b/core/http/endpoints/openai/completion.go
@@ -10,13 +10,12 @@ import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/functions"
-	"github.com/mudler/LocalAI/pkg/model"
+	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/rs/zerolog/log"
 	"github.com/valyala/fasthttp"
@@ -28,9 +27,10 @@ import (
 // @Success 200 {object} schema.OpenAIResponse "Response"
 // @Router /v1/completions [post]
 func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	id := uuid.New().String()
 	created := int(time.Now().Unix())
-	process := func(id string, s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) {
+	process := func(s string, req *schema.OpenAIRequest, config *config.BackendConfig, loader *model.ModelLoader, responses chan schema.OpenAIResponse, extraUsage bool) {
 		ComputeChoices(req, s, config, appConfig, loader, func(s string, c *[]schema.Choice) {}, func(s string, tokenUsage backend.TokenUsage) bool {
 			usage := schema.OpenAIUsage{
 				PromptTokens:     tokenUsage.Prompt,
@@ -63,18 +63,22 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 	}
 	return func(c *fiber.Ctx) error {
-		// Handle Correlation
+		// Add Correlation
-		id := c.Get("X-Correlation-ID", uuid.New().String())
+		c.Set("X-Correlation-ID", id)
 		// Opt-in extra usage flag
 		extraUsage := c.Get("Extra-Usage", "") != ""
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
-		if !ok || input.Model == "" {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		log.Debug().Msgf("`input`: %+v", input)
-		if !ok || config == nil {
+
-			return fiber.ErrBadRequest
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		if config.ResponseFormatMap != nil {
@@ -118,7 +122,7 @@ func CompletionEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, e
 			responses := make(chan schema.OpenAIResponse)
-			go process(id, predInput, input, config, ml, responses, extraUsage)
+			go process(predInput, input, config, ml, responses, extraUsage)
 			c.Context().SetBodyStreamWriter(fasthttp.StreamWriter(func(w *bufio.Writer) {
--- a/core/http/endpoints/openai/edit.go
+++ b/core/http/endpoints/openai/edit.go
@@ -2,17 +2,16 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
 	"time"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/schema"
-
+	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/rs/zerolog/log"
@@ -26,21 +25,20 @@ import (
 func EditEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
 		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
 		if !ok || input.Model == "" {
 			return fiber.ErrBadRequest
 		}
 		// Opt-in extra usage flag
 		extraUsage := c.Get("Extra-Usage", "") != ""
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		modelFile, input, err := readRequest(c, cl, ml, appConfig, true)
-		if !ok || config == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
-		log.Debug().Msgf("Edit Endpoint Input : %+v", input)
+		config, input, err := mergeRequestWithConfig(modelFile, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
-		log.Debug().Msgf("Edit Endpoint Config: %+v", *config)
+		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		log.Debug().Msgf("Parameter Config: %+v", config)
 		var result []schema.Choice
 		totalTokenUsage := backend.TokenUsage{}
--- a/core/http/endpoints/openai/embeddings.go
+++ b/core/http/endpoints/openai/embeddings.go
@@ -2,11 +2,11 @@ package openai
 import (
 	"encoding/json"
 	"fmt"
 	"time"
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/google/uuid"
@@ -23,14 +23,14 @@ import (
 // @Router /v1/embeddings [post]
 func EmbeddingsEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		model, input, err := readRequest(c, cl, ml, appConfig, true)
-		if !ok || input.Model == "" {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		config, input, err := mergeRequestWithConfig(model, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
-		if !ok || config == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		log.Debug().Msgf("Parameter Config: %+v", config)
--- a/core/http/endpoints/openai/image.go
+++ b/core/http/endpoints/openai/image.go
@@ -15,7 +15,6 @@ import (
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/backend"
@@ -67,23 +66,25 @@ func downloadFile(url string) (string, error) {
 // @Router /v1/images/generations [post]
 func ImageEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		m, input, err := readRequest(c, cl, ml, appConfig, false)
-		if !ok || input.Model == "" {
+		if err != nil {
-			log.Error().Msg("Image Endpoint - Invalid Input")
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 			return fiber.ErrBadRequest
 		}
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		if m == "" {
-		if !ok || config == nil {
+			m = "stablediffusion"
-			log.Error().Msg("Image Endpoint - Invalid Config")
+		}
-			return fiber.ErrBadRequest
+		log.Debug().Msgf("Loading model: %+v", m)
 		config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, 0, 0, false)
 		if err != nil {
 			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
 		src := ""
 		if input.File != "" {
 			fileData := []byte{}
 			var err error
 			// check if input.File is an URL, if so download it and save it
 			// to a temporary file
 			if strings.HasPrefix(input.File, "http://") || strings.HasPrefix(input.File, "https://") {
--- a/core/http/endpoints/openai/inference.go
+++ b/core/http/endpoints/openai/inference.go
@@ -37,7 +37,7 @@ func ComputeChoices(
 	}
 	// get the model function to call for the result
-	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, config, o, tokenCallback)
+	predFunc, err := backend.ModelInference(req.Context, predInput, req.Messages, images, videos, audios, loader, *config, o, tokenCallback)
 	if err != nil {
 		return result, backend.TokenUsage{}, err
 	}
--- a/core/http/endpoints/openai/request.go
+++ b/core/http/endpoints/openai/request.go
@@ -1,22 +1,20 @@
-package middleware
+package openai
 import (
 	"context"
 	"encoding/json"
 	"fmt"
 	"strconv"
 	"strings"
 	"github.com/gofiber/fiber/v2"
 	"github.com/google/uuid"
 	"github.com/mudler/LocalAI/core/config"
 	fiberContext "github.com/mudler/LocalAI/core/http/ctx"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/pkg/functions"
 	"github.com/mudler/LocalAI/pkg/model"
 	"github.com/mudler/LocalAI/pkg/templates"
 	"github.com/mudler/LocalAI/pkg/utils"
 	"github.com/gofiber/fiber/v2"
 	"github.com/rs/zerolog/log"
 )
@@ -25,166 +23,33 @@ type correlationIDKeyType string
 // CorrelationIDKey to track request across process boundary
 const CorrelationIDKey correlationIDKeyType = "correlationID"
-type RequestExtractor struct {
+func readRequest(c *fiber.Ctx, cl *config.BackendConfigLoader, ml *model.ModelLoader, o *config.ApplicationConfig, firstModel bool) (string, *schema.OpenAIRequest, error) {
-	backendConfigLoader *config.BackendConfigLoader
+	input := new(schema.OpenAIRequest)
 	modelLoader         *model.ModelLoader
 	applicationConfig   *config.ApplicationConfig
 }
-func NewRequestExtractor(backendConfigLoader *config.BackendConfigLoader, modelLoader *model.ModelLoader, applicationConfig *config.ApplicationConfig) *RequestExtractor {
+	// Get input data from the request body
-	return &RequestExtractor{
+	if err := c.BodyParser(input); err != nil {
-		backendConfigLoader: backendConfigLoader,
+		return "", nil, fmt.Errorf("failed parsing request body: %w", err)
 		modelLoader:         modelLoader,
 		applicationConfig:   applicationConfig,
 	}
 }
 const CONTEXT_LOCALS_KEY_MODEL_NAME = "MODEL_NAME"
 const CONTEXT_LOCALS_KEY_LOCALAI_REQUEST = "LOCALAI_REQUEST"
 const CONTEXT_LOCALS_KEY_MODEL_CONFIG = "MODEL_CONFIG"
 // TODO: Refactor to not return error if unchanged
 func (re *RequestExtractor) setModelNameFromRequest(ctx *fiber.Ctx) {
 	model, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
 	if ok && model != "" {
 		return
 	}
 	model = ctx.Params("model")
 	if (model == "") && ctx.Query("model") != "" {
 		model = ctx.Query("model")
 	}
 	if model == "" {
 		// Set model from bearer token, if available
 		bearer := strings.TrimLeft(ctx.Get("authorization"), "Bear ") // "Bearer " => "Bear" to please go-staticcheck. It looks dumb but we might as well take free performance on something called for nearly every request.
 		if bearer != "" {
 			exists, err := services.CheckIfModelExists(re.backendConfigLoader, re.modelLoader, bearer, services.ALWAYS_INCLUDE)
 			if err == nil && exists {
 				model = bearer
 			}
 		}
 	}
 	ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, model)
 }
 func (re *RequestExtractor) BuildConstantDefaultModelNameMiddleware(defaultModelName string) fiber.Handler {
 	return func(ctx *fiber.Ctx) error {
 		re.setModelNameFromRequest(ctx)
 		localModelName, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
 		if !ok || localModelName == "" {
 			ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, defaultModelName)
 			log.Debug().Str("defaultModelName", defaultModelName).Msg("context local model name not found, setting to default")
 		}
 		return ctx.Next()
 	}
 }
 func (re *RequestExtractor) BuildFilteredFirstAvailableDefaultModel(filterFn config.BackendConfigFilterFn) fiber.Handler {
 	return func(ctx *fiber.Ctx) error {
 		re.setModelNameFromRequest(ctx)
 		localModelName := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
 		if localModelName != "" { // Don't overwrite existing values
 			return ctx.Next()
 		}
 		modelNames, err := services.ListModels(re.backendConfigLoader, re.modelLoader, filterFn, services.SKIP_IF_CONFIGURED)
 		if err != nil {
 			log.Error().Err(err).Msg("non-fatal error calling ListModels during SetDefaultModelNameToFirstAvailable()")
 			return ctx.Next()
 		}
 		if len(modelNames) == 0 {
 			log.Warn().Msg("SetDefaultModelNameToFirstAvailable used with no matching models installed")
 			// This is non-fatal - making it so was breaking the case of direct installation of raw models
 			// return errors.New("this endpoint requires at least one model to be installed")
 			return ctx.Next()
 		}
 		ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME, modelNames[0])
 		log.Debug().Str("first model name", modelNames[0]).Msg("context local model name not found, setting to the first model")
 		return ctx.Next()
 	}
 }
 // TODO: If context and cancel above belong on all methods, move that part of above into here!
 // Otherwise, it's in its own method below for now
 func (re *RequestExtractor) SetModelAndConfig(initializer func() schema.LocalAIRequest) fiber.Handler {
 	return func(ctx *fiber.Ctx) error {
 		input := initializer()
 		if input == nil {
 			return fmt.Errorf("unable to initialize body")
 		}
 		if err := ctx.BodyParser(input); err != nil {
 			return fmt.Errorf("failed parsing request body: %w", err)
 		}
 		// If this request doesn't have an associated model name, fetch it from earlier in the middleware chain
 		if input.ModelName(nil) == "" {
 			localModelName, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_NAME).(string)
 			if ok && localModelName != "" {
 				log.Debug().Str("context localModelName", localModelName).Msg("overriding empty model name in request body with value found earlier in middleware chain")
 				input.ModelName(&localModelName)
 			}
 		}
 		cfg, err := re.backendConfigLoader.LoadBackendConfigFileByNameDefaultOptions(input.ModelName(nil), re.applicationConfig)
 		if err != nil {
 			log.Err(err)
 			log.Warn().Msgf("Model Configuration File not found for %q", input.ModelName(nil))
 		} else if cfg.Model == "" && input.ModelName(nil) != "" {
 			log.Debug().Str("input.ModelName", input.ModelName(nil)).Msg("config does not include model, using input")
 			cfg.Model = input.ModelName(nil)
 		}
 		ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input)
 		ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg)
 		return ctx.Next()
 	}
 }
 func (re *RequestExtractor) SetOpenAIRequest(ctx *fiber.Ctx) error {
 	input, ok := ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
 	if !ok || input.Model == "" {
 		return fiber.ErrBadRequest
 	}
 	cfg, ok := ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
 	if !ok || cfg == nil {
 		return fiber.ErrBadRequest
 	}
 	received, _ := json.Marshal(input)
 	// Extract or generate the correlation ID
-	correlationID := ctx.Get("X-Correlation-ID", uuid.New().String())
+	correlationID := c.Get("X-Correlation-ID", uuid.New().String())
 	ctx.Set("X-Correlation-ID", correlationID)
-	c1, cancel := context.WithCancel(re.applicationConfig.Context)
+	ctx, cancel := context.WithCancel(o.Context)
 	// Add the correlation ID to the new context
-	ctxWithCorrelationID := context.WithValue(c1, CorrelationIDKey, correlationID)
+	ctxWithCorrelationID := context.WithValue(ctx, CorrelationIDKey, correlationID)
 	input.Context = ctxWithCorrelationID
 	input.Cancel = cancel
-	err := mergeOpenAIRequestAndBackendConfig(cfg, input)
+	log.Debug().Msgf("Request received: %s", string(received))
 	if err != nil {
 		return err
 	}
-	if cfg.Model == "" {
+	modelFile, err := fiberContext.ModelFromContext(c, cl, ml, input.Model, firstModel)
 		log.Debug().Str("input.Model", input.Model).Msg("replacing empty cfg.Model with input value")
 		cfg.Model = input.Model
 	}
-	ctx.Locals(CONTEXT_LOCALS_KEY_LOCALAI_REQUEST, input)
+	return modelFile, input, err
 	ctx.Locals(CONTEXT_LOCALS_KEY_MODEL_CONFIG, cfg)
 	return ctx.Next()
 }
-func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *schema.OpenAIRequest) error {
+func updateRequestConfig(config *config.BackendConfig, input *schema.OpenAIRequest) {
 	if input.Echo {
 		config.Echo = input.Echo
 	}
@@ -384,8 +249,6 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 		config.TypicalP = input.TypicalP
 	}
 	log.Debug().Str("input.Input", fmt.Sprintf("%+v", input.Input))
 	switch inputs := input.Input.(type) {
 	case string:
 		if inputs != "" {
@@ -442,9 +305,22 @@ func mergeOpenAIRequestAndBackendConfig(config *config.BackendConfig, input *sch
 			config.Step = q
 		}
 	}
-
+}
-	if config.Validate() {
+
-		return nil
+func mergeRequestWithConfig(modelFile string, input *schema.OpenAIRequest, cm *config.BackendConfigLoader, loader *model.ModelLoader, debug bool, threads, ctx int, f16 bool) (*config.BackendConfig, *schema.OpenAIRequest, error) {
-	}
+	cfg, err := cm.LoadBackendConfigFileByName(modelFile, loader.ModelPath,
-	return fmt.Errorf("unable to validate configuration after merging")
+		config.LoadOptionDebug(debug),
 		config.LoadOptionThreads(threads),
 		config.LoadOptionContextSize(ctx),
 		config.LoadOptionF16(f16),
 	)
 	// Set the parameters for the language model prediction
 	updateRequestConfig(cfg, input)
 	if !cfg.Validate() {
 		return nil, nil, fmt.Errorf("failed to validate config")
 	}
 	return cfg, input, err
 }
--- a/core/http/endpoints/openai/transcription.go
+++ b/core/http/endpoints/openai/transcription.go
@@ -1,6 +1,7 @@
 package openai
 import (
 	"fmt"
 	"io"
 	"net/http"
 	"os"
@@ -9,8 +10,6 @@ import (
 	"github.com/mudler/LocalAI/core/backend"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	model "github.com/mudler/LocalAI/pkg/model"
 	"github.com/gofiber/fiber/v2"
@@ -26,16 +25,15 @@ import (
 // @Router /v1/audio/transcriptions [post]
 func TranscriptEndpoint(cl *config.BackendConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) func(c *fiber.Ctx) error {
 	return func(c *fiber.Ctx) error {
-		input, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
+		m, input, err := readRequest(c, cl, ml, appConfig, false)
-		if !ok || input.Model == "" {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request:%w", err)
 		}
-		config, ok := c.Locals(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.BackendConfig)
+		config, input, err := mergeRequestWithConfig(m, input, cl, ml, appConfig.Debug, appConfig.Threads, appConfig.ContextSize, appConfig.F16)
-		if !ok || config == nil {
+		if err != nil {
-			return fiber.ErrBadRequest
+			return fmt.Errorf("failed reading parameters from request: %w", err)
 		}
 		// retrieve the file data from the request
 		file, err := c.FormFile("file")
 		if err != nil {
--- a/core/http/routes/elevenlabs.go
+++ b/core/http/routes/elevenlabs.go
@@ -4,26 +4,17 @@ import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/elevenlabs"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 func RegisterElevenLabsRoutes(app *fiber.App,
 	re *middleware.RequestExtractor,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig) {
 	// Elevenlabs
-	app.Post("/v1/text-to-speech/:voice-id",
+	app.Post("/v1/text-to-speech/:voice-id", elevenlabs.TTSEndpoint(cl, ml, appConfig))
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.ElevenLabsTTSRequest) }),
 		elevenlabs.TTSEndpoint(cl, ml, appConfig))
-	app.Post("/v1/sound-generation",
+	app.Post("/v1/sound-generation", elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_SOUND_GENERATION)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.ElevenLabsSoundGenerationRequest) }),
 		elevenlabs.SoundGenerationEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/jina.go
+++ b/core/http/routes/jina.go
@@ -3,22 +3,16 @@ package routes
 import (
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/jina"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 func RegisterJINARoutes(app *fiber.App,
 	re *middleware.RequestExtractor,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig) {
 	// POST endpoint to mimic the reranking
-	app.Post("/v1/rerank",
+	app.Post("/v1/rerank", jina.JINARerankEndpoint(cl, ml, appConfig))
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_RERANK)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.JINARerankRequest) }),
 		jina.JINARerankEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/localai.go
+++ b/core/http/routes/localai.go
@@ -5,16 +5,13 @@ import (
 	"github.com/gofiber/swagger"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/p2p"
 	"github.com/mudler/LocalAI/core/schema"
 	"github.com/mudler/LocalAI/core/services"
 	"github.com/mudler/LocalAI/internal"
 	"github.com/mudler/LocalAI/pkg/model"
 )
 func RegisterLocalAIRoutes(router *fiber.App,
 	requestExtractor *middleware.RequestExtractor,
 	cl *config.BackendConfigLoader,
 	ml *model.ModelLoader,
 	appConfig *config.ApplicationConfig,
@@ -36,18 +33,8 @@ func RegisterLocalAIRoutes(router *fiber.App,
 		router.Get("/models/jobs", modelGalleryEndpointService.GetAllStatusEndpoint())
 	}
-	router.Post("/tts",
+	router.Post("/tts", localai.TTSEndpoint(cl, ml, appConfig))
-		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
+	router.Post("/vad", localai.VADEndpoint(cl, ml, appConfig))
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }),
 		localai.TTSEndpoint(cl, ml, appConfig))
 	vadChain := []fiber.Handler{
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_VAD)),
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.VADRequest) }),
 		localai.VADEndpoint(cl, ml, appConfig),
 	}
 	router.Post("/vad", vadChain...)
 	router.Post("/v1/vad", vadChain...)
 	// Stores
 	sl := model.NewModelLoader("")
@@ -60,14 +47,10 @@ func RegisterLocalAIRoutes(router *fiber.App,
 		router.Get("/metrics", localai.LocalAIMetricsEndpoint())
 	}
-	// Backend Statistics Module
+	// Experimental Backend Statistics Module
 	// TODO: Should these use standard middlewares? Refactor later, they are extremely simple.
 	backendMonitorService := services.NewBackendMonitorService(ml, cl, appConfig) // Split out for now
 	router.Get("/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService))
 	router.Post("/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService))
 	// The v1/* urls are exactly the same as above - makes local e2e testing easier if they are registered.
 	router.Get("/v1/backend/monitor", localai.BackendMonitorEndpoint(backendMonitorService))
 	router.Post("/v1/backend/shutdown", localai.BackendShutdownEndpoint(backendMonitorService))
 	// p2p
 	if p2p.IsP2PEnabled() {
@@ -84,9 +67,6 @@ func RegisterLocalAIRoutes(router *fiber.App,
 	router.Get("/system", localai.SystemInformations(ml, appConfig))
 	// misc
-	router.Post("/v1/tokenize",
+	router.Post("/v1/tokenize", localai.TokenizeEndpoint(cl, ml, appConfig))
 		requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TOKENIZE)),
 		requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TokenizeRequest) }),
 		localai.TokenizeEndpoint(cl, ml, appConfig))
 }
--- a/core/http/routes/openai.go
+++ b/core/http/routes/openai.go
@@ -3,50 +3,51 @@ package routes
 import (
 	"github.com/gofiber/fiber/v2"
 	"github.com/mudler/LocalAI/core/application"
 	"github.com/mudler/LocalAI/core/config"
 	"github.com/mudler/LocalAI/core/http/endpoints/localai"
 	"github.com/mudler/LocalAI/core/http/endpoints/openai"
 	"github.com/mudler/LocalAI/core/http/middleware"
 	"github.com/mudler/LocalAI/core/schema"
 )
 func RegisterOpenAIRoutes(app *fiber.App,
 	re *middleware.RequestExtractor,
 	application *application.Application) {
 	// openAI compatible API endpoint
 	// chat
-	chatChain := []fiber.Handler{
+	app.Post("/v1/chat/completions",
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
+		openai.ChatEndpoint(
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			application.BackendLoader(),
-		re.SetOpenAIRequest,
+			application.ModelLoader(),
-		openai.ChatEndpoint(application.BackendLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig()),
+			application.TemplatesEvaluator(),
-	}
+			application.ApplicationConfig(),
-	app.Post("/v1/chat/completions", chatChain...)
+		),
-	app.Post("/chat/completions", chatChain...)
+	)
 	app.Post("/chat/completions",
 		openai.ChatEndpoint(
 			application.BackendLoader(),
 			application.ModelLoader(),
 			application.TemplatesEvaluator(),
 			application.ApplicationConfig(),
 		),
 	)
 	// edit
-	editChain := []fiber.Handler{
+	app.Post("/v1/edits",
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EDIT)),
+		openai.EditEndpoint(
-		re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
+			application.BackendLoader(),
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			application.ModelLoader(),
-		re.SetOpenAIRequest,
+			application.TemplatesEvaluator(),
-		openai.EditEndpoint(application.BackendLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig()),
+			application.ApplicationConfig(),
-	}
+		),
-	app.Post("/v1/edits", editChain...)
+	)
 	app.Post("/edits", editChain...)
-	// completion
+	app.Post("/edits",
-	completionChain := []fiber.Handler{
+		openai.EditEndpoint(
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_COMPLETION)),
+			application.BackendLoader(),
-		re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
+			application.ModelLoader(),
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			application.TemplatesEvaluator(),
-		re.SetOpenAIRequest,
+			application.ApplicationConfig(),
-		openai.CompletionEndpoint(application.BackendLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig()),
+		),
-	}
+	)
 	app.Post("/v1/completions", completionChain...)
 	app.Post("/completions", completionChain...)
 	app.Post("/v1/engines/:model/completions", completionChain...)
 	// assistant
 	app.Get("/v1/assistants", openai.ListAssistantsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
@@ -80,37 +81,45 @@ func RegisterOpenAIRoutes(app *fiber.App,
 	app.Get("/v1/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig()))
 	app.Get("/files/:file_id/content", openai.GetFilesContentsEndpoint(application.BackendLoader(), application.ApplicationConfig()))
-	// embeddings
+	// completion
-	embeddingChain := []fiber.Handler{
+	app.Post("/v1/completions",
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EMBEDDINGS)),
+		openai.CompletionEndpoint(
-		re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
+			application.BackendLoader(),
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
+			application.ModelLoader(),
-		re.SetOpenAIRequest,
+			application.TemplatesEvaluator(),
-		openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()),
+			application.ApplicationConfig(),
-	}
+		),
 	app.Post("/v1/embeddings", embeddingChain...)
 	app.Post("/embeddings", embeddingChain...)
 	app.Post("/v1/engines/:model/embeddings", embeddingChain...)
 	// audio
 	app.Post("/v1/audio/transcriptions",
 		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
 		re.SetOpenAIRequest,
 		openai.TranscriptEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()),
 	)
-	app.Post("/v1/audio/speech",
+	app.Post("/completions",
-		re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
+		openai.CompletionEndpoint(
-		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }),
+			application.BackendLoader(),
-		localai.TTSEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
+			application.ModelLoader(),
 			application.TemplatesEvaluator(),
 			application.ApplicationConfig(),
 		),
 	)
 	app.Post("/v1/engines/:model/completions",
 		openai.CompletionEndpoint(
 			application.BackendLoader(),
 			application.ModelLoader(),
 			application.TemplatesEvaluator(),
 			application.ApplicationConfig(),
 		),
 	)
 	// embeddings
 	app.Post("/v1/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/v1/engines/:model/embeddings", openai.EmbeddingsEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	// audio
 	app.Post("/v1/audio/transcriptions", openai.TranscriptEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	app.Post("/v1/audio/speech", localai.TTSEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	// images
-	app.Post("/v1/images/generations",
+	app.Post("/v1/images/generations", openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 		re.BuildConstantDefaultModelNameMiddleware("stablediffusion"),
 		re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
 		re.SetOpenAIRequest,
 		openai.ImageEndpoint(application.BackendLoader(), application.ModelLoader(), application.ApplicationConfig()))
 	if application.ApplicationConfig().ImageDir != "" {
 		app.Static("/generated-images", application.ApplicationConfig().ImageDir)
--- a/core/http/views/partials/footer.html
+++ b/core/http/views/partials/footer.html
@@ -1,5 +1,5 @@
 <footer class="text-center py-8">
    LocalAI Version {{.Version}}<br>
-    <a href='https://github.com/mudler/LocalAI' class="text-blue-400 hover:text-blue-600" target="_blank">LocalAI</a> © 2023-2025 <a href='https://mudler.pm' class="text-blue-400 hover:text-blue-600" target="_blank">Ettore Di Giacinto</a>
+    <a href='https://github.com/mudler/LocalAI' class="text-blue-400 hover:text-blue-600" target="_blank">LocalAI</a> © 2023-2024 <a href='https://mudler.pm' class="text-blue-400 hover:text-blue-600" target="_blank">Ettore Di Giacinto</a>
 </footer>
 <script src="static/assets/tw-elements.js"></script>
--- a/Show More
+++ b/Show More