Compare commits

...

52 Commits

Author SHA1 Message Date
Ettore Di Giacinto
2adddef5fe Address feedback from review
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-02 21:34:23 +01:00
majiayu000
d89c7b731a fix: resolve duplicate MCP route registration causing 50% failure rate
Fixes #7772

The issue was caused by duplicate registration of the MCP endpoint
/mcp/v1/chat/completions in both openai.go and localai.go, leading
to a race condition where requests would randomly hit different
handlers with incompatible behaviors.

Changes:
- Removed duplicate MCP route registration from openai.go
- Kept the localai.MCPStreamEndpoint as the canonical handler
- Added all three MCP route patterns for backward compatibility:
  * /v1/mcp/chat/completions
  * /mcp/v1/chat/completions
  * /mcp/chat/completions
- Added comments to clarify route ownership and prevent future conflicts
- Fixed formatting in ui_api.go

The localai.MCPStreamEndpoint handler is more feature-complete as it
supports both streaming and non-streaming modes, while the removed
openai.MCPCompletionEndpoint only supported synchronous requests.

This eliminates the ~50% failure rate where the cogito library would
receive "Invalid http method" errors when internal HTTP requests were
routed to the wrong handler.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Signed-off-by: majiayu000 <1835304752@qq.com>
2026-01-02 21:29:05 +01:00
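For illustration, a minimal sketch of the single-registration pattern this commit describes: one canonical handler (standing in for localai.MCPStreamEndpoint) registered under all three route aliases, so requests can no longer race between handlers with incompatible behaviors. The sketch uses net/http purely as an assumption; it is not the project's actual router code.

```go
// Minimal sketch (not the project's actual code): a single canonical MCP
// handler registered once under every backward-compatible route alias.
package main

import (
	"fmt"
	"log"
	"net/http"
)

// mcpChatCompletions stands in for the canonical localai.MCPStreamEndpoint handler.
func mcpChatCompletions(w http.ResponseWriter, r *http.Request) {
	if r.Method != http.MethodPost {
		// The error string quoted in the commit; returned only for non-POST calls.
		http.Error(w, "Invalid http method", http.StatusMethodNotAllowed)
		return
	}
	fmt.Fprintln(w, `{"status":"ok"}`)
}

func main() {
	mux := http.NewServeMux()
	// Register the same handler for all aliases; registering a second, different
	// handler on any of these patterns is what led to the ~50% failure rate.
	for _, pattern := range []string{
		"/v1/mcp/chat/completions",
		"/mcp/v1/chat/completions",
		"/mcp/chat/completions",
	} {
		mux.HandleFunc(pattern, mcpChatCompletions)
	}
	log.Fatal(http.ListenAndServe(":8080", mux))
}
```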
Ettore Di Giacinto
5f6c941399 fix(llama.cpp/mmproj): fix loading mmproj in nested sub-dirs different from model path (#7832)
fix(mmproj): fix loading mmproj in nested sub-dirs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-02 20:17:30 +01:00
LocalAI [bot]
1639fc6309 chore(model gallery): 🤖 add 1 new models via gallery agent (#7831)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-02 15:10:00 +01:00
Ettore Di Giacinto
841e8f6d47 fix(image-gen): fix scrolling issues (#7829)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-02 09:05:49 +01:00
LocalAI [bot]
fd152c97c0 chore(model gallery): 🤖 add 1 new models via gallery agent (#7826)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-02 08:45:43 +01:00
LocalAI [bot]
949de04052 chore: ⬆️ Update ggml-org/llama.cpp to ced765be44ce173c374f295b3c6f4175f8fd109b (#7822)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-02 08:44:49 +01:00
Ettore Di Giacinto
76cfe1f367 feat(image-gen/UI): move controls to the left, make the page more compact (#7823)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2026-01-01 22:07:42 +01:00
LocalAI [bot]
5ee6c1810b feat(swagger): update swagger (#7820)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2026-01-01 21:16:38 +01:00
LocalAI [bot]
7db79aadfa chore(model-gallery): ⬆️ update checksum (#7821)
⬆️ Checksum updates in gallery/index.yaml

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-01 21:16:11 +01:00
nold
dee48679b4 Fix(gallery): Updated checksums for qwen3-vl-30b instruct & thinking (#7819)
* Fix(gallery): SHA256 hashes for qwen3-vl-30b-instruct

Signed-off-by: nold <Nold360@users.noreply.github.com>

* Fix(gallery): SHA256 checksums for qwen3-vl-30b-thinking

Signed-off-by: nold <Nold360@users.noreply.github.com>

---------

Signed-off-by: nold <Nold360@users.noreply.github.com>
2026-01-01 20:33:55 +01:00
LocalAI [bot]
94b47a9310 chore(model gallery): 🤖 add 1 new models via gallery agent (#7816)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2026-01-01 19:20:26 +01:00
LocalAI [bot]
bc3e8793ed chore: ⬆️ Update ggml-org/llama.cpp to 13814eb370d2f0b70e1830cc577b6155b17aee47 (#7809)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-31 23:04:01 +01:00
LocalAI [bot]
91978bb3a5 chore: ⬆️ Update ggml-org/whisper.cpp to e9898ddfb908ffaa7026c66852a023889a5a7202 (#7810)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-31 22:59:05 +01:00
Ettore Di Giacinto
797f27f09f feat(UI): image generation improvements (#7804)
* chore: drop mode from image generation(unused)

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(UI): improve image generation front-end

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat(UI): only ref images. files is to be deprecated

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* do not override default steps

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-31 21:59:46 +01:00
LocalAI [bot]
3f1631aa87 chore(model gallery): 🤖 add 1 new models via gallery agent (#7807)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-31 19:29:59 +01:00
LocalAI [bot]
dad509637e chore(model gallery): 🤖 add 1 new models via gallery agent (#7801)
chore(model gallery): 🤖 add new models via gallery agent

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-31 09:18:35 +01:00
LocalAI [bot]
218f3a126a chore: ⬆️ Update ggml-org/llama.cpp to 0f89d2ecf14270f45f43c442e90ae433fd82dab1 (#7795)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-31 08:53:41 +01:00
Ettore Di Giacinto
be77a845fa fix(gallery agent): change model
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 22:34:25 +00:00
Ettore Di Giacinto
ca32286022 fix(gallery agent): change model
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 22:27:48 +00:00
Ettore Di Giacinto
1f592505dd fix(gallery agent): change model
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 22:22:45 +00:00
Ettore Di Giacinto
b3bc623eb3 fix(gallery agent): fixups
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 22:18:02 +00:00
Ettore Di Giacinto
e56391cf14 Add individual sponsors acknowledgment in README
Added a section to acknowledge individual sponsors and their contributions.

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-12-30 23:01:22 +01:00
Ettore Di Giacinto
ef3ffe4a4e fix(gallery agent): fixups
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 21:56:54 +00:00
Ettore Di Giacinto
3cffde2cd5 fix(gallery agent): skip model selection if only one
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-30 21:53:37 +00:00
LocalAI [bot]
234bf7e2ad feat(swagger): update swagger (#7794)
Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-30 21:05:01 +00:00
lif
ba73d2e759 fix: Failed to download checksums.txt when using launch to install localai (#7788)
* fix: add retry logic and fallback for checksums.txt download

- Add HTTP client with 30s timeout to ReleaseManager
- Implement downloadFileWithRetry with 3 attempts and exponential backoff
- Allow manual checksum placement at ~/.localai/checksums/checksums-<version>.txt
- Continue installation with warning if checksum download/verification fails
- Add test for HTTPClient initialization
- Fix linter error in systray_manager.go

Fixes #7385

Signed-off-by: majiayu000 <1835304752@qq.com>

* fix: add retry logic and improve checksums.txt download handling

This commit addresses issue #7385 by implementing:
- Retry logic (3 attempts) for checksum file downloads
- Fallback to manually placed checksum files
- Option to proceed with installation if checksums unavailable (with warnings)
- Fixed resource leaks in download retry loop
- Added configurable HTTP client with 30s timeout

The installation will now be more resilient to network issues while
maintaining security through checksum verification when available.

Signed-off-by: majiayu000 <1835304752@qq.com>

* fix: check for existing checksum file before downloading

This commit addresses the review feedback from mudler on PR #7788.
The code now checks whether a checksum file already exists (either manually
placed or previously downloaded) and honors it, skipping the download
entirely in that case.

Changes:
- Check for existing checksum file at ~/.localai/checksums/checksums-<version>.txt first
- Check for existing downloaded checksum file at binary path
- Only attempt to download if no existing checksum file is found
- This prevents unnecessary network requests and honors user-placed checksums

Signed-off-by: majiayu000 <1835304752@qq.com>

🤖 Generated with Claude Code

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

---------

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 18:33:44 +01:00
Ettore Di Giacinto
592697216b Revert "chore(deps): bump securego/gosec from 2.22.9 to 2.22.11" (#7789)
Revert "chore(deps): bump securego/gosec from 2.22.9 to 2.22.11 (#7774)"

This reverts commit 0c16f55b45.
2025-12-30 09:58:13 +01:00
lif
8bd7143a44 fix: propagate validation errors (#7787)
fix: validate MCP configuration in model config

Fixes #7334

The Validate() function was not checking if MCP configuration
(mcp.stdio and mcp.remote) contains valid JSON. This caused
malformed JSON with missing commas to be silently accepted.

Changes:
- Add MCP configuration validation to ModelConfig.Validate()
- Properly report validation errors instead of discarding them
- Add test cases for valid and invalid MCP configurations

The fix ensures that malformed JSON in MCP config sections
will now be caught and reported during validation.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 09:54:27 +01:00
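A hedged sketch of the kind of check this commit describes: treat the MCP config sections as raw JSON and have Validate() reject malformed content instead of discarding the error. The type and field names below are assumptions for illustration, not the project's exact ones.

```go
// Illustrative only: validate that MCP config sections contain well-formed JSON.
package main

import (
	"encoding/json"
	"fmt"
)

type MCPConfig struct {
	Stdio  string `yaml:"stdio"`  // JSON document configuring stdio MCP servers
	Remote string `yaml:"remote"` // JSON document configuring remote MCP servers
}

// Validate reports an error instead of silently accepting malformed JSON.
func (m MCPConfig) Validate() error {
	if m.Stdio != "" && !json.Valid([]byte(m.Stdio)) {
		return fmt.Errorf("mcp.stdio is not valid JSON")
	}
	if m.Remote != "" && !json.Valid([]byte(m.Remote)) {
		return fmt.Errorf("mcp.remote is not valid JSON")
	}
	return nil
}

func main() {
	// A missing comma between fields: previously accepted silently, now reported.
	bad := MCPConfig{Stdio: `{"command": "server" "args": []}`}
	fmt.Println(bad.Validate())
}
```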
lif
0d0ef0121c fix: Usage for image generation is incorrect (and causes error in LiteLLM) (#7786)
* fix: Add usage fields to image generation response for OpenAI API compatibility

Fixes #7354

Added input_tokens, output_tokens, and input_tokens_details fields to the
image generation API response to comply with OpenAI's image generation API
specification. This resolves validation errors in LiteLLM and the OpenAI SDK.

Changes:
- Added InputTokensDetails struct with text_tokens and image_tokens fields
- Extended OpenAIUsage struct with input_tokens, output_tokens, and input_tokens_details
- Updated ImageEndpoint to populate usage object with required fields
- Updated InpaintingEndpoint to populate usage object with required fields
- All fields initialized to 0 as per current behavior

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Signed-off-by: majiayu000 <1835304752@qq.com>

* fix: Correct usage field types for image generation API compatibility

Changed InputTokens and OutputTokens from pointer types (*int) to
regular int types to match OpenAI API specification. This fixes
validation errors with LiteLLM and OpenAI SDK when parsing image
generation responses.

Fixes #7354

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
Signed-off-by: majiayu000 <1835304752@qq.com>

---------

Signed-off-by: majiayu000 <1835304752@qq.com>
Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-30 09:53:05 +01:00
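As a rough illustration of the usage shape described above, the sketch below uses plain int fields (not pointers) so the values always serialize as 0 rather than null; the struct names are assumptions mirroring the commit text.

```go
// Illustrative usage structs for image generation responses (names assumed).
package main

import (
	"encoding/json"
	"fmt"
)

type InputTokensDetails struct {
	TextTokens  int `json:"text_tokens"`
	ImageTokens int `json:"image_tokens"`
}

type OpenAIUsage struct {
	InputTokens        int                `json:"input_tokens"`
	OutputTokens       int                `json:"output_tokens"`
	InputTokensDetails InputTokensDetails `json:"input_tokens_details"`
}

func main() {
	// Zero values serialize as 0 (per current behavior), which keeps strict
	// clients such as LiteLLM and the OpenAI SDK happy during validation.
	b, _ := json.Marshal(OpenAIUsage{})
	fmt.Println(string(b))
}
```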
lif
d7b2eee08f fix: add nil checks before mergo.Merge to prevent panic in gallery model installation (#7785)
Fixes #7420

Added nil checks before calling mergo.Merge in InstallModelFromGallery and InstallModel
functions to prevent panic when req.Overrides or configOverrides are nil. The panic was
occurring at models.go:248 during Qwen-Image-Edit gallery model download.

Changes:
- Added nil check for req.Overrides before merging in InstallModelFromGallery (line 126)
- Added nil check for configOverrides before merging in InstallModel (line 248)
- Added test case to verify nil configOverrides are handled without panic

Signed-off-by: majiayu000 <1835304752@qq.com>
2025-12-30 09:51:45 +01:00
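A minimal sketch of the guard this commit describes, assuming the overrides are plain maps; the import path shown is the mergo library's current one and the function and variable names are illustrative, not LocalAI's actual code.

```go
// Illustrative nil guard before merging gallery overrides with mergo.
package main

import (
	"fmt"

	"dario.cat/mergo"
)

func applyOverrides(config map[string]interface{}, overrides map[string]interface{}) error {
	if overrides == nil {
		// Skipping the merge entirely is what prevents the panic described above.
		return nil
	}
	return mergo.Merge(&config, overrides, mergo.WithOverride)
}

func main() {
	cfg := map[string]interface{}{"backend": "llama-cpp"}
	// A nil overrides value previously reached mergo.Merge directly.
	fmt.Println(applyOverrides(cfg, nil), cfg)

	// With non-nil overrides the merge proceeds as before.
	fmt.Println(applyOverrides(cfg, map[string]interface{}{"f16": true}), cfg)
}
```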
LocalAI [bot]
bc8ec5cb39 chore: ⬆️ Update ggml-org/llama.cpp to c9a3b40d6578f2381a1373d10249403d58c3c5bd (#7778)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-30 08:27:16 +01:00
dependabot[bot]
3f38fecdfc chore(deps): bump github.com/modelcontextprotocol/go-sdk from 1.1.0 to 1.2.0 (#7776)
chore(deps): bump github.com/modelcontextprotocol/go-sdk

Bumps [github.com/modelcontextprotocol/go-sdk](https://github.com/modelcontextprotocol/go-sdk) from 1.1.0 to 1.2.0.
- [Release notes](https://github.com/modelcontextprotocol/go-sdk/releases)
- [Commits](https://github.com/modelcontextprotocol/go-sdk/compare/v1.1.0...v1.2.0)

---
updated-dependencies:
- dependency-name: github.com/modelcontextprotocol/go-sdk
  dependency-version: 1.2.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-12-29 22:15:29 +01:00
dependabot[bot]
20a4199229 chore(deps): bump github.com/schollz/progressbar/v3 from 3.18.0 to 3.19.0 (#7775)
chore(deps): bump github.com/schollz/progressbar/v3

Bumps [github.com/schollz/progressbar/v3](https://github.com/schollz/progressbar) from 3.18.0 to 3.19.0.
- [Release notes](https://github.com/schollz/progressbar/releases)
- [Commits](https://github.com/schollz/progressbar/compare/v3.18.0...v3.19.0)

---
updated-dependencies:
- dependency-name: github.com/schollz/progressbar/v3
  dependency-version: 3.19.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-12-29 22:15:11 +01:00
Ettore Di Giacinto
ded9955881 chore(ci): do not select models if we have only 1 result
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-29 22:14:14 +01:00
dependabot[bot]
cf78f9a2a8 chore(deps): bump google.golang.org/grpc from 1.77.0 to 1.78.0 (#7777)
Bumps [google.golang.org/grpc](https://github.com/grpc/grpc-go) from 1.77.0 to 1.78.0.
- [Release notes](https://github.com/grpc/grpc-go/releases)
- [Commits](https://github.com/grpc/grpc-go/compare/v1.77.0...v1.78.0)

---
updated-dependencies:
- dependency-name: google.golang.org/grpc
  dependency-version: 1.78.0
  dependency-type: direct:production
  update-type: version-update:semver-minor
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-12-29 21:03:57 +01:00
dependabot[bot]
0c16f55b45 chore(deps): bump securego/gosec from 2.22.9 to 2.22.11 (#7774)
Bumps [securego/gosec](https://github.com/securego/gosec) from 2.22.9 to 2.22.11.
- [Release notes](https://github.com/securego/gosec/releases)
- [Commits](https://github.com/securego/gosec/compare/v2.22.9...v2.22.11)

---
updated-dependencies:
- dependency-name: securego/gosec
  dependency-version: 2.22.11
  dependency-type: direct:production
  update-type: version-update:semver-patch
...

Signed-off-by: dependabot[bot] <support@github.com>
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
2025-12-29 19:18:29 +00:00
Richard Palethorpe
0b80167912 chore: ⬆️ Update leejet/stable-diffusion.cpp to 4ff2c8c74bd17c2cfffe3a01be77743fb3efba2f (#7771)
* ⬆️ Update leejet/stable-diffusion.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>

* fix: Add KL_OPTIMAL scheduler, pass sampler to default scheduler for LCM and fixup other refactorings from upstream

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* Delete backend/go/stablediffusion-ggml/compile_commands.json

Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>

---------

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Signed-off-by: Richard Palethorpe <io@richiejp.com>
Signed-off-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
Co-authored-by: Ettore Di Giacinto <mudler@users.noreply.github.com>
2025-12-29 19:06:35 +01:00
Richard Palethorpe
99b5c5f156 feat(api): Allow tracing of requests and responses (#7609)
* feat(api): Allow tracing of requests and responses

Signed-off-by: Richard Palethorpe <io@richiejp.com>

* feat(traces): Add traces UI

Signed-off-by: Richard Palethorpe <io@richiejp.com>

---------

Signed-off-by: Richard Palethorpe <io@richiejp.com>
2025-12-29 11:06:06 +01:00
Ettore Di Giacinto
9ab812a8e8 chore(ci): be more precise when detecting existing models (#7767)
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-29 10:06:42 +01:00
Ettore Di Giacinto
185a685211 fix(amd-gpu): correctly show total and used vram (#7761)
An example output of `rocm-smi --showproductname --showmeminfo vram --showuniqueid --csv`:

```
device,Unique ID,VRAM Total Memory (B),VRAM Total Used Memory (B),Card Series,Card Model,Card Vendor,Card SKU,Subsystem ID,Device Rev,Node ID,GUID,GFX Version
card0,0x9246____________,17163091968,692142080,Navi 21 [Radeon RX 6800/6800 XT / 6900 XT],0x73bf,Advanced Micro Devices Inc. [AMD/ATI],001,0x2406,0xc1,1,45534,gfx1030
card1,N/A,67108864,26079232,Raphael,0x164e,Advanced Micro Devices Inc. [AMD/ATI],RAPHAEL,0x364e,0xc6,2,52156,gfx1036
```

Total memory is actually shown before the total used memory, as can be seen in https://github.com/LostRuins/koboldcpp/issues/1104#issuecomment-2321143507.

This PR fixes https://github.com/mudler/LocalAI/issues/7724

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-29 07:57:07 +01:00
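A hedged sketch of parsing that CSV by header name, so the total and used VRAM columns cannot be swapped by position again; this is illustration only, not the project's actual parser.

```go
// Parse rocm-smi CSV output by column header rather than by position.
package main

import (
	"encoding/csv"
	"fmt"
	"strconv"
	"strings"
)

const sample = `device,Unique ID,VRAM Total Memory (B),VRAM Total Used Memory (B),Card Series
card0,0x9246,17163091968,692142080,Navi 21`

func main() {
	rows, _ := csv.NewReader(strings.NewReader(sample)).ReadAll()
	cols := map[string]int{}
	for i, name := range rows[0] {
		cols[name] = i
	}
	for _, row := range rows[1:] {
		total, _ := strconv.ParseUint(row[cols["VRAM Total Memory (B)"]], 10, 64)
		used, _ := strconv.ParseUint(row[cols["VRAM Total Used Memory (B)"]], 10, 64)
		fmt.Printf("%s: used %d / total %d bytes\n", row[cols["device"]], used, total)
	}
}
```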
LocalAI [bot]
1a6fd0f7fc chore: ⬆️ Update ggml-org/llama.cpp to 4ffc47cb2001e7d523f9ff525335bbe34b1a2858 (#7760)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-28 21:10:39 +00:00
LocalAI [bot]
c95c482f36 chore: ⬆️ Update ggml-org/llama.cpp to a4bf35889eda36d3597cd0f8f333f5b8a2fcaefc (#7751)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-27 21:09:12 +00:00
Ettore Di Giacinto
21c464c34f fix(cli): import via CLI needs system state (#7746)
pass system state to application config to avoid nil pointer exception
during import.

Fixes: https://github.com/mudler/LocalAI/issues/7728

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-27 11:10:28 +01:00
LocalAI [bot]
ddf0281785 chore: ⬆️ Update ggml-org/llama.cpp to 7ac8902133da6eb390c4d8368a7d252279123942 (#7740)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-26 21:44:34 +00:00
LocalAI [bot]
86c68c9623 chore: ⬆️ Update ggml-org/llama.cpp to 85c40c9b02941ebf1add1469af75f1796d513ef4 (#7731)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-25 21:10:28 +00:00
Ettore Di Giacinto
c844b7ac58 feat: disable force eviction (#7725)
* feat: allow to set forcing backends eviction while requests are in flight

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: try to make the request sit and retry if eviction couldn't be done

Otherwise, calls that would need to shut down other backends in order
to proceed would simply fail.

Instead, the request now waits and retries eviction until it succeeds.
The retry thresholds can be configured by the user.

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* add tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* expose settings to CLI

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Update docs

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-25 14:26:18 +01:00
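To illustrate the behavior this commit describes, a rough sketch of the retry loop under assumed names: the request waits for busy backends instead of failing outright, bounded by a configurable retry count and interval (the defaults mirror the LOCALAI_LRU_EVICTION_MAX_RETRIES and LOCALAI_LRU_EVICTION_RETRY_INTERVAL flags added in the diff below).

```go
// Illustrative retry loop for eviction; not the project's actual implementation.
package main

import (
	"errors"
	"fmt"
	"time"
)

var errBusy = errors.New("model busy")

// tryEvict pretends the model stays busy for the first couple of attempts.
func tryEvict(attempt int) error {
	if attempt < 3 {
		return errBusy
	}
	return nil
}

// evictWithRetry waits and retries eviction until it succeeds or retries run out.
func evictWithRetry(maxRetries int, interval time.Duration) error {
	var err error
	for attempt := 1; attempt <= maxRetries; attempt++ {
		if err = tryEvict(attempt); err == nil {
			return nil
		}
		time.Sleep(interval)
	}
	return fmt.Errorf("eviction failed after %d retries: %w", maxRetries, err)
}

func main() {
	// Defaults matching the new CLI flags: 30 retries, 1s interval.
	fmt.Println(evictWithRetry(30, time.Second))
}
```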
Ettore Di Giacinto
bb459e671f fix(ui): correctly parse import errors (#7726)
errors are nested

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-25 10:43:12 +01:00
LocalAI [bot]
2fe6e278c8 chore: ⬆️ Update ggml-org/llama.cpp to c18428423018ed214c004e6ecaedb0cbdda06805 (#7718)
⬆️ Update ggml-org/llama.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-25 10:00:40 +01:00
LocalAI [bot]
ae69921d77 chore: ⬆️ Update ggml-org/whisper.cpp to 6114e692136bea917dc88a5eb2e532c3d133d963 (#7717)
⬆️ Update ggml-org/whisper.cpp

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-25 10:00:24 +01:00
Ettore Di Giacinto
bf2f95c684 chore(docs): update docs with cuda 13 instructions and the new vibevoice backend
Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
2025-12-25 10:00:07 +01:00
LocalAI [bot]
94069f2751 docs: ⬆️ update docs version mudler/LocalAI (#7716)
⬆️ Update docs version mudler/LocalAI

Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>
2025-12-24 21:06:02 +00:00
69 changed files with 2827 additions and 693 deletions

View File

@@ -11,6 +11,7 @@ import (
"slices"
"strings"
"github.com/ghodss/yaml"
hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
cogito "github.com/mudler/cogito"
@@ -52,6 +53,11 @@ func cleanTextContent(text string) string {
return stripThinkingTags(strings.TrimRight(result, "\n"))
}
type galleryModel struct {
Name string `yaml:"name"`
Urls []string `yaml:"urls"`
}
// isModelExisting checks if a specific model ID exists in the gallery using text search
func isModelExisting(modelID string) (bool, error) {
indexPath := getGalleryIndexPath()
@@ -60,9 +66,20 @@ func isModelExisting(modelID string) (bool, error) {
return false, fmt.Errorf("failed to read %s: %w", indexPath, err)
}
contentStr := string(content)
// Simple text search - if the model ID appears anywhere in the file, it exists
return strings.Contains(contentStr, modelID), nil
var galleryModels []galleryModel
err = yaml.Unmarshal(content, &galleryModels)
if err != nil {
return false, fmt.Errorf("failed to unmarshal %s: %w", indexPath, err)
}
for _, galleryModel := range galleryModels {
if slices.Contains(galleryModel.Urls, modelID) {
return true, nil
}
}
return false, nil
}
// filterExistingModels removes models that already exist in the gallery
@@ -134,6 +151,11 @@ func getRealReadme(ctx context.Context, repository string) (string, error) {
}
func selectMostInterestingModels(ctx context.Context, searchResult *SearchResult) ([]ProcessedModel, error) {
if len(searchResult.Models) == 1 {
return searchResult.Models, nil
}
// Create a conversation fragment
fragment := cogito.NewEmptyFragment().
AddMessage("user",

View File

@@ -119,14 +119,24 @@ func main() {
}
fmt.Println(result.FormattedOutput)
var models []ProcessedModel
// Use AI agent to select the most interesting models
fmt.Println("Using AI agent to select the most interesting models...")
models, err := selectMostInterestingModels(context.Background(), result)
if err != nil {
fmt.Fprintf(os.Stderr, "Error in model selection: %v\n", err)
// Continue with original result if selection fails
if len(result.Models) > 1 {
fmt.Println("More than one model found (", len(result.Models), "), using AI agent to select the most interesting models")
for _, model := range result.Models {
fmt.Println("Model: ", model.ModelID)
}
// Use AI agent to select the most interesting models
fmt.Println("Using AI agent to select the most interesting models...")
models, err = selectMostInterestingModels(context.Background(), result)
if err != nil {
fmt.Fprintf(os.Stderr, "Error in model selection: %v\n", err)
// Continue with original result if selection fails
models = result.Models
}
} else if len(result.Models) == 1 {
models = result.Models
fmt.Println("Only one model found, using it directly")
}
fmt.Print(models)
@@ -315,7 +325,7 @@ func searchAndProcessModels(searchTerm string, limit int, quantization string) (
outputBuilder.WriteString(fmt.Sprintf(" README Content Preview: %s\n",
processedModel.ReadmeContentPreview))
} else {
continue
fmt.Printf(" Warning: Failed to get real readme: %v\n", err)
}
fmt.Println("Real readme got", readmeContent)

View File

@@ -49,12 +49,12 @@ jobs:
PATH="$PATH:$HOME/go/bin" make protogen-go
- uses: mudler/localai-github-action@v1.1
with:
model: 'qwen3-4b'
model: 'https://huggingface.co/bartowski/Qwen_Qwen3-1.7B-GGUF'
- name: Run gallery agent
env:
#OPENAI_MODEL: ${{ secrets.OPENAI_MODEL }}
OPENAI_MODE: qwen3-4b
OPENAI_MODE: Qwen_Qwen3-1.7B-GGUF
OPENAI_BASE_URL: "http://localhost:8080"
OPENAI_KEY: ${{ secrets.OPENAI_KEY }}
#OPENAI_BASE_URL: ${{ secrets.OPENAI_BASE_URL }}

View File

@@ -146,6 +146,9 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
### NVIDIA GPU Images:
```bash
# CUDA 13.0
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-13
# CUDA 12.0
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
@@ -153,7 +156,11 @@ docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gp
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-11
# NVIDIA Jetson (L4T) ARM64
# CUDA 12 (for Nvidia AGX Orin and similar platforms)
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64
# CUDA 13 (for Nvidia DGX Spark)
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-nvidia-l4t-arm64-cuda-13
```
### AMD GPU Images (ROCm):
@@ -180,6 +187,9 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
# CPU version
docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
# NVIDIA CUDA 13 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-13
# NVIDIA CUDA 12 version
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
@@ -269,39 +279,40 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
### Text Generation & Language Models
| Backend | Description | Acceleration Support |
|---------|-------------|---------------------|
| **llama.cpp** | LLM inference in C/C++ | CUDA 11/12, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| **vLLM** | Fast LLM inference with PagedAttention | CUDA 12, ROCm, Intel |
| **transformers** | HuggingFace transformers framework | CUDA 11/12, ROCm, Intel, CPU |
| **exllama2** | GPTQ inference library | CUDA 12 |
| **llama.cpp** | LLM inference in C/C++ | CUDA 11/12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| **vLLM** | Fast LLM inference with PagedAttention | CUDA 12/13, ROCm, Intel |
| **transformers** | HuggingFace transformers framework | CUDA 11/12/13, ROCm, Intel, CPU |
| **exllama2** | GPTQ inference library | CUDA 12/13 |
| **MLX** | Apple Silicon LLM inference | Metal (M1/M2/M3+) |
| **MLX-VLM** | Apple Silicon Vision-Language Models | Metal (M1/M2/M3+) |
### Audio & Speech Processing
| Backend | Description | Acceleration Support |
|---------|-------------|---------------------|
| **whisper.cpp** | OpenAI Whisper in C/C++ | CUDA 12, ROCm, Intel SYCL, Vulkan, CPU |
| **faster-whisper** | Fast Whisper with CTranslate2 | CUDA 12, ROCm, Intel, CPU |
| **bark** | Text-to-audio generation | CUDA 12, ROCm, Intel |
| **whisper.cpp** | OpenAI Whisper in C/C++ | CUDA 12/13, ROCm, Intel SYCL, Vulkan, CPU |
| **faster-whisper** | Fast Whisper with CTranslate2 | CUDA 12/13, ROCm, Intel, CPU |
| **bark** | Text-to-audio generation | CUDA 12/13, ROCm, Intel |
| **bark-cpp** | C++ implementation of Bark | CUDA, Metal, CPU |
| **coqui** | Advanced TTS with 1100+ languages | CUDA 12, ROCm, Intel, CPU |
| **kokoro** | Lightweight TTS model | CUDA 12, ROCm, Intel, CPU |
| **chatterbox** | Production-grade TTS | CUDA 11/12, CPU |
| **coqui** | Advanced TTS with 1100+ languages | CUDA 12/13, ROCm, Intel, CPU |
| **kokoro** | Lightweight TTS model | CUDA 12/13, ROCm, Intel, CPU |
| **chatterbox** | Production-grade TTS | CUDA 11/12/13, CPU |
| **piper** | Fast neural TTS system | CPU |
| **kitten-tts** | Kitten TTS models | CPU |
| **silero-vad** | Voice Activity Detection | CPU |
| **neutts** | Text-to-speech with voice cloning | CUDA 12, ROCm, CPU |
| **neutts** | Text-to-speech with voice cloning | CUDA 12/13, ROCm, CPU |
| **vibevoice** | Real-time TTS with voice cloning | CUDA 12/13, ROCm, Intel, CPU |
### Image & Video Generation
| Backend | Description | Acceleration Support |
|---------|-------------|---------------------|
| **stablediffusion.cpp** | Stable Diffusion in C/C++ | CUDA 12, Intel SYCL, Vulkan, CPU |
| **diffusers** | HuggingFace diffusion models | CUDA 11/12, ROCm, Intel, Metal, CPU |
| **stablediffusion.cpp** | Stable Diffusion in C/C++ | CUDA 12/13, Intel SYCL, Vulkan, CPU |
| **diffusers** | HuggingFace diffusion models | CUDA 11/12/13, ROCm, Intel, Metal, CPU |
### Specialized AI Tasks
| Backend | Description | Acceleration Support |
|---------|-------------|---------------------|
| **rfdetr** | Real-time object detection | CUDA 12, Intel, CPU |
| **rerankers** | Document reranking API | CUDA 11/12, ROCm, Intel, CPU |
| **rfdetr** | Real-time object detection | CUDA 12/13, Intel, CPU |
| **rerankers** | Document reranking API | CUDA 11/12/13, ROCm, Intel, CPU |
| **local-store** | Vector database | CPU |
| **huggingface** | HuggingFace API integration | API-based |
@@ -311,11 +322,13 @@ LocalAI supports a comprehensive range of AI backends with multiple acceleration
|-------------------|-------------------|------------------|
| **NVIDIA CUDA 11** | llama.cpp, whisper, stablediffusion, diffusers, rerankers, bark, chatterbox | Nvidia hardware |
| **NVIDIA CUDA 12** | All CUDA-compatible backends | Nvidia hardware |
| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts | AMD Graphics |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark | Intel Arc, Intel iGPUs |
| **NVIDIA CUDA 13** | All CUDA-compatible backends | Nvidia hardware |
| **AMD ROCm** | llama.cpp, whisper, vllm, transformers, diffusers, rerankers, coqui, kokoro, bark, neutts, vibevoice | AMD Graphics |
| **Intel oneAPI** | llama.cpp, whisper, stablediffusion, vllm, transformers, diffusers, rfdetr, rerankers, exllama2, coqui, kokoro, bark, vibevoice | Intel Arc, Intel iGPUs |
| **Apple Metal** | llama.cpp, whisper, diffusers, MLX, MLX-VLM, bark-cpp | Apple M1/M2/M3+ |
| **Vulkan** | llama.cpp, whisper, stablediffusion | Cross-platform GPUs |
| **NVIDIA Jetson** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI |
| **NVIDIA Jetson (CUDA 12)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (AGX Orin, etc.) |
| **NVIDIA Jetson (CUDA 13)** | llama.cpp, whisper, stablediffusion, diffusers, rfdetr | ARM64 embedded AI (DGX Spark) |
| **CPU Optimized** | All backends | AVX/AVX2/AVX512, quantization support |
### 🔗 Community and integrations
@@ -408,6 +421,10 @@ A huge thank you to our generous sponsors who support this project covering CI e
</a>
</p>
### Individual sponsors
A special thanks to the individual sponsors who contribute to the project. A full list is on [Github](https://github.com/sponsors/mudler) and [buymeacoffee](https://buymeacoffee.com/mudler); a special shout-out goes to [drikster80](https://github.com/drikster80) for being generous. Thank you, everyone!
## 🌟 Star history
[![LocalAI Star history Chart](https://api.star-history.com/svg?repos=go-skynet/LocalAI&type=Date)](https://star-history.com/#go-skynet/LocalAI&Date)

View File

@@ -301,7 +301,6 @@ message TranscriptSegment {
message GenerateImageRequest {
int32 height = 1;
int32 width = 2;
int32 mode = 3;
int32 step = 4;
int32 seed = 5;
string positive_prompt = 6;

View File

@@ -1,5 +1,5 @@
LLAMA_VERSION?=5b6c9bc0f3c8f55598b9999b65aff7ce4119bc15
LLAMA_VERSION?=ced765be44ce173c374f295b3c6f4175f8fd109b
LLAMA_REPO?=https://github.com/ggerganov/llama.cpp
CMAKE_ARGS?=

View File

@@ -358,9 +358,7 @@ static void params_parse(server_context& /*ctx_server*/, const backend::ModelOpt
params.model.path = request->modelfile();
if (!request->mmproj().empty()) {
// get the directory of modelfile
std::string model_dir = params.model.path.substr(0, params.model.path.find_last_of("/\\"));
params.mmproj.path = model_dir + "/"+ request->mmproj();
params.mmproj.path = request->mmproj();
}
// params.model_alias ??
params.model_alias = request->modelfile();

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# stablediffusion.cpp (ggml)
STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
STABLEDIFFUSION_GGML_VERSION?=bda7fab9f208dff4b67179a68f694b6ddec13326
STABLEDIFFUSION_GGML_VERSION?=4ff2c8c74bd17c2cfffe3a01be77743fb3efba2f
CMAKE_ARGS+=-DGGML_MAX_NAME=128

View File

@@ -55,6 +55,7 @@ const char* schedulers[] = {
"sgm_uniform",
"simple",
"smoothstep",
"kl_optimal",
"lcm",
};
@@ -147,26 +148,26 @@ static std::string lora_dir_path;
static void build_embedding_vec(const char* embedding_dir) {
embedding_vec.clear();
embedding_strings.clear();
if (!embedding_dir || strlen(embedding_dir) == 0) {
return;
}
if (!std::filesystem::exists(embedding_dir) || !std::filesystem::is_directory(embedding_dir)) {
fprintf(stderr, "Embedding directory does not exist or is not a directory: %s\n", embedding_dir);
return;
}
static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
for (const auto& entry : std::filesystem::directory_iterator(embedding_dir)) {
if (!entry.is_regular_file()) {
continue;
}
auto path = entry.path();
std::string ext = path.extension().string();
bool valid = false;
for (const auto& e : valid_ext) {
if (ext == e) {
@@ -177,51 +178,51 @@ static void build_embedding_vec(const char* embedding_dir) {
if (!valid) {
continue;
}
std::string name = path.stem().string();
std::string full_path = path.string();
// Store strings in persistent storage
embedding_strings.push_back(name);
embedding_strings.push_back(full_path);
sd_embedding_t item;
item.name = embedding_strings[embedding_strings.size() - 2].c_str();
item.path = embedding_strings[embedding_strings.size() - 1].c_str();
embedding_vec.push_back(item);
fprintf(stderr, "Found embedding: %s -> %s\n", item.name, item.path);
}
fprintf(stderr, "Loaded %zu embeddings from %s\n", embedding_vec.size(), embedding_dir);
}
// Discover LoRA files in directory and build a map of name -> path
static std::map<std::string, std::string> discover_lora_files(const char* lora_dir) {
std::map<std::string, std::string> lora_map;
if (!lora_dir || strlen(lora_dir) == 0) {
fprintf(stderr, "LoRA directory not specified\n");
return lora_map;
}
if (!std::filesystem::exists(lora_dir) || !std::filesystem::is_directory(lora_dir)) {
fprintf(stderr, "LoRA directory does not exist or is not a directory: %s\n", lora_dir);
return lora_map;
}
static const std::vector<std::string> valid_ext = {".safetensors", ".ckpt", ".pt", ".gguf"};
fprintf(stderr, "Discovering LoRA files in: %s\n", lora_dir);
for (const auto& entry : std::filesystem::directory_iterator(lora_dir)) {
if (!entry.is_regular_file()) {
continue;
}
auto path = entry.path();
std::string ext = path.extension().string();
bool valid = false;
for (const auto& e : valid_ext) {
if (ext == e) {
@@ -232,17 +233,17 @@ static std::map<std::string, std::string> discover_lora_files(const char* lora_d
if (!valid) {
continue;
}
std::string name = path.stem().string(); // stem() already removes extension
std::string full_path = path.string();
// Store the name (without extension) -> full path mapping
// This allows users to specify just the name in <lora:name:strength>
lora_map[name] = full_path;
fprintf(stderr, "Found LoRA file: %s -> %s\n", name.c_str(), full_path.c_str());
}
fprintf(stderr, "Discovered %zu LoRA files in %s\n", lora_map.size(), lora_dir);
return lora_map;
}
@@ -264,31 +265,31 @@ static bool is_absolute_path(const std::string& p) {
static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(const std::string& prompt, const char* lora_dir) {
std::vector<sd_lora_t> loras;
std::string cleaned_prompt = prompt;
if (!lora_dir || strlen(lora_dir) == 0) {
fprintf(stderr, "LoRA directory not set, cannot parse LoRAs from prompt\n");
return {loras, cleaned_prompt};
}
// Discover LoRA files for name-based lookup
std::map<std::string, std::string> discovered_lora_map = discover_lora_files(lora_dir);
// Map to accumulate multipliers for the same LoRA (matches upstream)
std::map<std::string, float> lora_map;
std::map<std::string, float> high_noise_lora_map;
static const std::regex re(R"(<lora:([^:>]+):([^>]+)>)");
static const std::vector<std::string> valid_ext = {".pt", ".safetensors", ".gguf"};
std::smatch m;
std::string tmp = prompt;
fprintf(stderr, "Parsing LoRAs from prompt: %s\n", prompt.c_str());
while (std::regex_search(tmp, m, re)) {
std::string raw_path = m[1].str();
const std::string raw_mul = m[2].str();
float mul = 0.f;
try {
mul = std::stof(raw_mul);
@@ -298,14 +299,14 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
fprintf(stderr, "Invalid LoRA multiplier '%s', skipping\n", raw_mul.c_str());
continue;
}
bool is_high_noise = false;
static const std::string prefix = "|high_noise|";
if (raw_path.rfind(prefix, 0) == 0) {
raw_path.erase(0, prefix.size());
is_high_noise = true;
}
std::filesystem::path final_path;
if (is_absolute_path(raw_path)) {
final_path = raw_path;
@@ -334,7 +335,7 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
}
}
}
// Try adding extensions if file doesn't exist
if (!std::filesystem::exists(final_path)) {
bool found = false;
@@ -354,24 +355,24 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
continue;
}
}
// Normalize path (matches upstream)
const std::string key = final_path.lexically_normal().string();
// Accumulate multiplier if same LoRA appears multiple times (matches upstream)
if (is_high_noise) {
high_noise_lora_map[key] += mul;
} else {
lora_map[key] += mul;
}
fprintf(stderr, "Parsed LoRA: path='%s', multiplier=%.2f, is_high_noise=%s\n",
fprintf(stderr, "Parsed LoRA: path='%s', multiplier=%.2f, is_high_noise=%s\n",
key.c_str(), mul, is_high_noise ? "true" : "false");
cleaned_prompt = std::regex_replace(cleaned_prompt, re, "", std::regex_constants::format_first_only);
tmp = m.suffix().str();
}
// Build final LoRA vector from accumulated maps (matches upstream)
// Store all path strings first to ensure they persist
for (const auto& kv : lora_map) {
@@ -380,7 +381,7 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
for (const auto& kv : high_noise_lora_map) {
lora_strings.push_back(kv.first);
}
// Now build the LoRA vector with pointers to the stored strings
size_t string_idx = 0;
for (const auto& kv : lora_map) {
@@ -391,7 +392,7 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
loras.push_back(item);
string_idx++;
}
for (const auto& kv : high_noise_lora_map) {
sd_lora_t item;
item.is_high_noise = true;
@@ -400,7 +401,7 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
loras.push_back(item);
string_idx++;
}
// Clean up extra spaces
std::regex space_regex(R"(\s+)");
cleaned_prompt = std::regex_replace(cleaned_prompt, space_regex, " ");
@@ -413,9 +414,9 @@ static std::pair<std::vector<sd_lora_t>, std::string> parse_loras_from_prompt(co
if (last != std::string::npos) {
cleaned_prompt.erase(last + 1);
}
fprintf(stderr, "Parsed %zu LoRA(s) from prompt. Cleaned prompt: %s\n", loras.size(), cleaned_prompt.c_str());
return {loras, cleaned_prompt};
}
@@ -752,7 +753,7 @@ int load_model(const char *model, char *model_path, char* options[], int threads
}
}
if (scheduler == SCHEDULER_COUNT) {
scheduler = sd_get_default_scheduler(sd_ctx);
scheduler = sd_get_default_scheduler(sd_ctx, sample_method);
fprintf(stderr, "Invalid scheduler, using default: %s\n", schedulers[scheduler]);
}
@@ -787,7 +788,7 @@ sd_img_gen_params_t* sd_img_gen_params_new(void) {
sd_img_gen_params_t *params = (sd_img_gen_params_t *)std::malloc(sizeof(sd_img_gen_params_t));
sd_img_gen_params_init(params);
sd_sample_params_init(&params->sample_params);
sd_easycache_params_init(&params->easycache);
sd_cache_params_init(&params->cache);
params->control_strength = 0.9f;
return params;
}
@@ -819,18 +820,18 @@ void sd_img_gen_params_set_prompts(sd_img_gen_params_t *params, const char *prom
fprintf(stderr, "Note: Found %zu LoRAs in negative prompt (may not be supported)\n", neg_loras.size());
}
cleaned_negative_prompt_storage = cleaned_negative;
// Set the cleaned prompts
params->prompt = cleaned_prompt_storage.c_str();
params->negative_prompt = cleaned_negative_prompt_storage.c_str();
// Set LoRAs in params
params->loras = lora_vec.empty() ? nullptr : lora_vec.data();
params->lora_count = static_cast<uint32_t>(lora_vec.size());
fprintf(stderr, "Set prompts with %zu LoRAs. Original prompt: %s\n", lora_vec.size(), prompt ? prompt : "(null)");
fprintf(stderr, "Cleaned prompt: %s\n", cleaned_prompt_storage.c_str());
// Debug: Verify LoRAs are set correctly
if (params->loras && params->lora_count > 0) {
fprintf(stderr, "DEBUG: LoRAs set in params structure:\n");
@@ -1042,7 +1043,7 @@ int gen_image(sd_img_gen_params_t *p, int steps, char *dst, float cfg_scale, cha
fprintf(stderr, "Using %u LoRA(s) in generation:\n", p->lora_count);
for (uint32_t i = 0; i < p->lora_count; i++) {
fprintf(stderr, " LoRA[%u]: path='%s', multiplier=%.2f, is_high_noise=%s\n",
i,
i,
p->loras[i].path ? p->loras[i].path : "(null)",
p->loras[i].multiplier,
p->loras[i].is_high_noise ? "true" : "false");

View File

@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)
# whisper.cpp version
WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
WHISPER_CPP_VERSION?=6c22e792cb0ee155b6587ce71a8410c3aeb06949
WHISPER_CPP_VERSION?=e9898ddfb908ffaa7026c66852a023889a5a7202
SO_TARGET?=libgowhisper.so
CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF

View File

@@ -49,6 +49,8 @@ type ReleaseManager struct {
ChecksumsPath string
// MetadataPath is where version metadata is stored
MetadataPath string
// HTTPClient is the HTTP client used for downloads
HTTPClient *http.Client
}
// NewReleaseManager creates a new release manager
@@ -65,6 +67,9 @@ func NewReleaseManager() *ReleaseManager {
CurrentVersion: internal.PrintableVersion(),
ChecksumsPath: checksumsPath,
MetadataPath: metadataPath,
HTTPClient: &http.Client{
Timeout: 30 * time.Second,
},
}
}
@@ -72,7 +77,7 @@ func NewReleaseManager() *ReleaseManager {
func (rm *ReleaseManager) GetLatestRelease() (*Release, error) {
url := fmt.Sprintf("https://api.github.com/repos/%s/%s/releases/latest", rm.GitHubOwner, rm.GitHubRepo)
resp, err := http.Get(url)
resp, err := rm.HTTPClient.Get(url)
if err != nil {
return nil, fmt.Errorf("failed to fetch latest release: %w", err)
}
@@ -125,18 +130,43 @@ func (rm *ReleaseManager) DownloadRelease(version string, progressCallback func(
rm.GitHubOwner, rm.GitHubRepo, version, version)
checksumPath := filepath.Join(rm.BinaryPath, "checksums.txt")
if err := rm.downloadFile(checksumURL, checksumPath, nil); err != nil {
return fmt.Errorf("failed to download checksums: %w", err)
manualChecksumPath := filepath.Join(rm.ChecksumsPath, fmt.Sprintf("checksums-%s.txt", version))
// First, check if there's already a checksum file (either manually placed or previously downloaded)
// and honor that, skipping download entirely in such case
var downloadErr error
if _, err := os.Stat(manualChecksumPath); err == nil {
log.Printf("Using existing checksums from: %s", manualChecksumPath)
checksumPath = manualChecksumPath
} else if _, err := os.Stat(checksumPath); err == nil {
log.Printf("Using existing checksums from: %s", checksumPath)
} else {
// No existing checksum file found, try to download
downloadErr = rm.downloadFile(checksumURL, checksumPath, nil)
if downloadErr != nil {
log.Printf("Warning: failed to download checksums: %v", downloadErr)
log.Printf("Warning: Checksum verification will be skipped. For security, you can manually place checksums at: %s", manualChecksumPath)
log.Printf("Download checksums from: %s", checksumURL)
// Continue without verification - log warning but don't fail
}
}
// Verify the checksum
if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
return fmt.Errorf("checksum verification failed: %w", err)
}
// Verify the checksum if we have a checksum file
if _, err := os.Stat(checksumPath); err == nil {
if err := rm.VerifyChecksum(localPath, checksumPath, binaryName); err != nil {
return fmt.Errorf("checksum verification failed: %w", err)
}
log.Printf("Checksum verification successful")
// Save checksums persistently for future verification
if err := rm.saveChecksums(version, checksumPath, binaryName); err != nil {
log.Printf("Warning: failed to save checksums: %v", err)
// Save checksums persistently for future verification
if downloadErr == nil {
if err := rm.saveChecksums(version, checksumPath, binaryName); err != nil {
log.Printf("Warning: failed to save checksums: %v", err)
}
}
} else {
log.Printf("Warning: Proceeding without checksum verification")
}
// Make the binary executable
@@ -168,34 +198,61 @@ func (rm *ReleaseManager) GetBinaryName(version string) string {
// downloadFile downloads a file from a URL to a local path with optional progress callback
func (rm *ReleaseManager) downloadFile(url, filepath string, progressCallback func(float64)) error {
resp, err := http.Get(url)
if err != nil {
return err
}
defer resp.Body.Close()
return rm.downloadFileWithRetry(url, filepath, progressCallback, 3)
}
if resp.StatusCode != http.StatusOK {
return fmt.Errorf("bad status: %s", resp.Status)
}
// downloadFileWithRetry downloads a file from a URL with retry logic
func (rm *ReleaseManager) downloadFileWithRetry(url, filepath string, progressCallback func(float64), maxRetries int) error {
var lastErr error
out, err := os.Create(filepath)
if err != nil {
return err
}
defer out.Close()
// Create a progress reader if callback is provided
var reader io.Reader = resp.Body
if progressCallback != nil && resp.ContentLength > 0 {
reader = &progressReader{
Reader: resp.Body,
Total: resp.ContentLength,
Callback: progressCallback,
for attempt := 1; attempt <= maxRetries; attempt++ {
if attempt > 1 {
log.Printf("Retrying download (attempt %d/%d): %s", attempt, maxRetries, url)
time.Sleep(time.Duration(attempt) * time.Second)
}
resp, err := rm.HTTPClient.Get(url)
if err != nil {
lastErr = err
continue
}
if resp.StatusCode != http.StatusOK {
resp.Body.Close()
lastErr = fmt.Errorf("bad status: %s", resp.Status)
continue
}
out, err := os.Create(filepath)
if err != nil {
resp.Body.Close()
return err
}
// Create a progress reader if callback is provided
var reader io.Reader = resp.Body
if progressCallback != nil && resp.ContentLength > 0 {
reader = &progressReader{
Reader: resp.Body,
Total: resp.ContentLength,
Callback: progressCallback,
}
}
_, err = io.Copy(out, reader)
resp.Body.Close()
out.Close()
if err != nil {
lastErr = err
os.Remove(filepath)
continue
}
return nil
}
_, err = io.Copy(out, reader)
return err
return fmt.Errorf("failed after %d attempts: %w", maxRetries, lastErr)
}
// saveChecksums saves checksums persistently for future verification

View File

@@ -4,6 +4,7 @@ import (
"os"
"path/filepath"
"runtime"
"time"
. "github.com/onsi/ginkgo/v2"
. "github.com/onsi/gomega"
@@ -37,6 +38,8 @@ var _ = Describe("ReleaseManager", func() {
Expect(newRM.GitHubOwner).To(Equal("mudler"))
Expect(newRM.GitHubRepo).To(Equal("LocalAI"))
Expect(newRM.BinaryPath).To(ContainSubstring(".localai"))
Expect(newRM.HTTPClient).ToNot(BeNil())
Expect(newRM.HTTPClient.Timeout).To(Equal(30 * time.Second))
})
})

View File

@@ -382,7 +382,7 @@ func (sm *SystrayManager) showStatusDetails(status, version string) {
// showErrorDialog shows a simple error dialog
func (sm *SystrayManager) showErrorDialog(title, message string) {
fyne.DoAndWait(func() {
dialog.ShowError(fmt.Errorf(message), sm.window)
dialog.ShowError(fmt.Errorf("%s", message), sm.window)
})
}

View File

@@ -214,6 +214,9 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
envAutoloadGalleries := appConfig.AutoloadGalleries == startupAppConfig.AutoloadGalleries
envAutoloadBackendGalleries := appConfig.AutoloadBackendGalleries == startupAppConfig.AutoloadBackendGalleries
envAgentJobRetentionDays := appConfig.AgentJobRetentionDays == startupAppConfig.AgentJobRetentionDays
envForceEvictionWhenBusy := appConfig.ForceEvictionWhenBusy == startupAppConfig.ForceEvictionWhenBusy
envLRUEvictionMaxRetries := appConfig.LRUEvictionMaxRetries == startupAppConfig.LRUEvictionMaxRetries
envLRUEvictionRetryInterval := appConfig.LRUEvictionRetryInterval == startupAppConfig.LRUEvictionRetryInterval
if len(fileContent) > 0 {
var settings config.RuntimeSettings
@@ -277,6 +280,20 @@ func readRuntimeSettingsJson(startupAppConfig config.ApplicationConfig) fileHand
if settings.MemoryReclaimerThreshold != nil && !envMemoryReclaimerThreshold {
appConfig.MemoryReclaimerThreshold = *settings.MemoryReclaimerThreshold
}
if settings.ForceEvictionWhenBusy != nil && !envForceEvictionWhenBusy {
appConfig.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
}
if settings.LRUEvictionMaxRetries != nil && !envLRUEvictionMaxRetries {
appConfig.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
}
if settings.LRUEvictionRetryInterval != nil && !envLRUEvictionRetryInterval {
dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval)
if err == nil {
appConfig.LRUEvictionRetryInterval = dur
} else {
xlog.Warn("invalid LRU eviction retry interval in runtime_settings.json", "error", err, "interval", *settings.LRUEvictionRetryInterval)
}
}
if settings.Threads != nil && !envThreads {
appConfig.Threads = *settings.Threads
}

View File

@@ -350,9 +350,16 @@ func initializeWatchdog(application *Application, options *config.ApplicationCon
model.WithIdleCheck(options.WatchDogIdle),
model.WithLRULimit(lruLimit),
model.WithMemoryReclaimer(options.MemoryReclaimerEnabled, options.MemoryReclaimerThreshold),
model.WithForceEvictionWhenBusy(options.ForceEvictionWhenBusy),
)
application.ModelLoader().SetWatchDog(wd)
// Initialize ModelLoader LRU eviction retry settings
application.ModelLoader().SetLRUEvictionRetrySettings(
options.LRUEvictionMaxRetries,
options.LRUEvictionRetryInterval,
)
// Start watchdog goroutine if any periodic checks are enabled
// LRU eviction doesn't need the Run() loop - it's triggered on model load
// But memory reclaimer needs the Run() loop for periodic checking

View File

@@ -35,6 +35,7 @@ func (a *Application) startWatchdog() error {
model.WithIdleCheck(appConfig.WatchDogIdle),
model.WithLRULimit(lruLimit),
model.WithMemoryReclaimer(appConfig.MemoryReclaimerEnabled, appConfig.MemoryReclaimerThreshold),
model.WithForceEvictionWhenBusy(appConfig.ForceEvictionWhenBusy),
)
a.modelLoader.SetWatchDog(wd)

View File

@@ -7,7 +7,7 @@ import (
model "github.com/mudler/LocalAI/pkg/model"
)
func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {
func ImageGeneration(height, width, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {
opts := ModelOptions(modelConfig, appConfig)
inferenceModel, err := loader.Load(
@@ -23,7 +23,6 @@ func ImageGeneration(height, width, mode, step, seed int, positive_prompt, negat
&proto.GenerateImageRequest{
Height: int32(height),
Width: int32(width),
Mode: int32(mode),
Step: int32(step),
Seed: int32(seed),
CLIPSkip: int32(modelConfig.Diffusers.ClipSkip),

View File

@@ -36,7 +36,7 @@ func ModelOptions(c config.ModelConfig, so *config.ApplicationConfig, opts ...mo
c.Threads = &threads
grpcOpts := grpcModelOpts(c)
grpcOpts := grpcModelOpts(c, so.SystemState.Model.ModelsPath)
defOpts = append(defOpts, model.WithLoadGRPCLoadModelOpts(grpcOpts))
if so.ParallelBackendRequests {
@@ -72,7 +72,7 @@ func getSeed(c config.ModelConfig) int32 {
return seed
}
func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
func grpcModelOpts(c config.ModelConfig, modelPath string) *pb.ModelOptions {
b := 512
if c.Batch != 0 {
b = c.Batch
@@ -131,7 +131,7 @@ func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
})
}
return &pb.ModelOptions{
opts := &pb.ModelOptions{
CUDA: c.CUDA || c.Diffusers.CUDA,
SchedulerType: c.Diffusers.SchedulerType,
GrammarTriggers: triggers,
@@ -170,7 +170,6 @@ func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
LimitImagePerPrompt: int32(c.LimitMMPerPrompt.LimitImagePerPrompt),
LimitVideoPerPrompt: int32(c.LimitMMPerPrompt.LimitVideoPerPrompt),
LimitAudioPerPrompt: int32(c.LimitMMPerPrompt.LimitAudioPerPrompt),
MMProj: c.MMProj,
FlashAttention: flashAttention,
CacheTypeKey: c.CacheTypeK,
CacheTypeValue: c.CacheTypeV,
@@ -198,6 +197,12 @@ func grpcModelOpts(c config.ModelConfig) *pb.ModelOptions {
// RWKV
Tokenizer: c.Tokenizer,
}
if c.MMProj != "" {
opts.MMProj = filepath.Join(modelPath, c.MMProj)
}
return opts
}
func gRPCPredictOpts(c config.ModelConfig, modelPath string) *pb.PredictOptions {

View File

@@ -80,7 +80,9 @@ func (mi *ModelsInstall) Run(ctx *cliContext.Context) error {
return err
}
galleryService := services.NewGalleryService(&config.ApplicationConfig{}, model.NewModelLoader(systemState))
galleryService := services.NewGalleryService(&config.ApplicationConfig{
SystemState: systemState,
}, model.NewModelLoader(systemState))
err = galleryService.Start(context.Background(), config.NewModelConfigLoader(mi.ModelsPath), systemState)
if err != nil {
return err

View File

@@ -73,10 +73,15 @@ type RunCMD struct {
WatchdogBusyTimeout string `env:"LOCALAI_WATCHDOG_BUSY_TIMEOUT,WATCHDOG_BUSY_TIMEOUT" default:"5m" help:"Threshold beyond which a busy backend should be stopped" group:"backends"`
EnableMemoryReclaimer bool `env:"LOCALAI_MEMORY_RECLAIMER,MEMORY_RECLAIMER,LOCALAI_GPU_RECLAIMER,GPU_RECLAIMER" default:"false" help:"Enable memory threshold monitoring to auto-evict backends when memory usage exceeds threshold (uses GPU VRAM if available, otherwise RAM)" group:"backends"`
MemoryReclaimerThreshold float64 `env:"LOCALAI_MEMORY_RECLAIMER_THRESHOLD,MEMORY_RECLAIMER_THRESHOLD,LOCALAI_GPU_RECLAIMER_THRESHOLD,GPU_RECLAIMER_THRESHOLD" default:"0.95" help:"Memory usage threshold (0.0-1.0) that triggers backend eviction (default 0.95 = 95%%)" group:"backends"`
ForceEvictionWhenBusy bool `env:"LOCALAI_FORCE_EVICTION_WHEN_BUSY,FORCE_EVICTION_WHEN_BUSY" default:"false" help:"Force eviction even when models have active API calls (default: false for safety)" group:"backends"`
LRUEvictionMaxRetries int `env:"LOCALAI_LRU_EVICTION_MAX_RETRIES,LRU_EVICTION_MAX_RETRIES" default:"30" help:"Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)" group:"backends"`
LRUEvictionRetryInterval string `env:"LOCALAI_LRU_EVICTION_RETRY_INTERVAL,LRU_EVICTION_RETRY_INTERVAL" default:"1s" help:"Interval between retries when waiting for busy models to become idle (e.g., 1s, 2s) (default: 1s)" group:"backends"`
Federated bool `env:"LOCALAI_FEDERATED,FEDERATED" help:"Enable federated instance" group:"federated"`
DisableGalleryEndpoint bool `env:"LOCALAI_DISABLE_GALLERY_ENDPOINT,DISABLE_GALLERY_ENDPOINT" help:"Disable the gallery endpoints" group:"api"`
MachineTag string `env:"LOCALAI_MACHINE_TAG,MACHINE_TAG" help:"Add Machine-Tag header to each response which is useful to track the machine in the P2P network" group:"api"`
LoadToMemory []string `env:"LOCALAI_LOAD_TO_MEMORY,LOAD_TO_MEMORY" help:"A list of models to load into memory at startup" group:"models"`
EnableTracing bool `env:"LOCALAI_ENABLE_TRACING,ENABLE_TRACING" help:"Enable API tracing" group:"api"`
TracingMaxItems int `env:"LOCALAI_TRACING_MAX_ITEMS" default:"1024" help:"Maximum number of traces to keep" group:"api"`
AgentJobRetentionDays int `env:"LOCALAI_AGENT_JOB_RETENTION_DAYS,AGENT_JOB_RETENTION_DAYS" default:"30" help:"Number of days to keep agent job history (default: 30)" group:"api"`
Version bool
@@ -149,6 +154,15 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
opts = append(opts, config.DisableRuntimeSettings)
}
if r.EnableTracing {
opts = append(opts, config.EnableTracing)
}
opts = append(opts, config.WithTracingMaxItems(r.TracingMaxItems))
token := ""
if r.Peer2Peer || r.Peer2PeerToken != "" {
xlog.Info("P2P mode enabled")
@@ -220,6 +234,21 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
opts = append(opts, config.EnableSingleBackend)
}
// Handle LRU eviction settings
if r.ForceEvictionWhenBusy {
opts = append(opts, config.WithForceEvictionWhenBusy(true))
}
if r.LRUEvictionMaxRetries > 0 {
opts = append(opts, config.WithLRUEvictionMaxRetries(r.LRUEvictionMaxRetries))
}
if r.LRUEvictionRetryInterval != "" {
dur, err := time.ParseDuration(r.LRUEvictionRetryInterval)
if err != nil {
return fmt.Errorf("invalid LRU eviction retry interval: %w", err)
}
opts = append(opts, config.WithLRUEvictionRetryInterval(dur))
}
// split ":" to get backend name and the uri
for _, v := range r.ExternalGRPCBackends {
backend := v[:strings.IndexByte(v, ':')]

View File

@@ -19,6 +19,8 @@ type ApplicationConfig struct {
UploadLimitMB, Threads, ContextSize int
F16 bool
Debug bool
EnableTracing bool
TracingMaxItems int
GeneratedContentDir string
UploadDir string
@@ -64,6 +66,11 @@ type ApplicationConfig struct {
MemoryReclaimerEnabled bool // Enable memory threshold monitoring
MemoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
ForceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
LRUEvictionMaxRetries int // Maximum number of retries when waiting for busy models to become idle (default: 30)
LRUEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models (default: 1s)
ModelsURL []string
WatchDogBusyTimeout, WatchDogIdleTimeout time.Duration
@@ -86,10 +93,13 @@ type AppOption func(*ApplicationConfig)
func NewApplicationConfig(o ...AppOption) *ApplicationConfig {
opt := &ApplicationConfig{
Context: context.Background(),
UploadLimitMB: 15,
Debug: true,
AgentJobRetentionDays: 30, // Default: 30 days
Context: context.Background(),
UploadLimitMB: 15,
Debug: true,
AgentJobRetentionDays: 30, // Default: 30 days
LRUEvictionMaxRetries: 30, // Default: 30 retries
LRUEvictionRetryInterval: 1 * time.Second, // Default: 1 second
TracingMaxItems: 1024,
PathWithoutAuth: []string{
"/static/",
"/generated-audio/",
@@ -158,6 +168,10 @@ var EnableWatchDog = func(o *ApplicationConfig) {
o.WatchDog = true
}
var EnableTracing = func(o *ApplicationConfig) {
o.EnableTracing = true
}
var EnableWatchDogIdleCheck = func(o *ApplicationConfig) {
o.WatchDog = true
o.WatchDogIdle = true
@@ -259,6 +273,31 @@ func (o *ApplicationConfig) GetEffectiveMaxActiveBackends() int {
return 0
}
// WithForceEvictionWhenBusy sets whether to force eviction even when models have active API calls
func WithForceEvictionWhenBusy(enabled bool) AppOption {
return func(o *ApplicationConfig) {
o.ForceEvictionWhenBusy = enabled
}
}
// WithLRUEvictionMaxRetries sets the maximum number of retries when waiting for busy models to become idle
func WithLRUEvictionMaxRetries(maxRetries int) AppOption {
return func(o *ApplicationConfig) {
if maxRetries > 0 {
o.LRUEvictionMaxRetries = maxRetries
}
}
}
// WithLRUEvictionRetryInterval sets the interval between retries when waiting for busy models
func WithLRUEvictionRetryInterval(interval time.Duration) AppOption {
return func(o *ApplicationConfig) {
if interval > 0 {
o.LRUEvictionRetryInterval = interval
}
}
}
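These new knobs are plain functional options, so they compose with the existing NewApplicationConfig builder. A minimal sketch of applying them, with invented values:

package main

import (
	"time"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	// Illustrative values only; the defaults (30 retries, 1s interval) apply when the options are omitted.
	appConfig := config.NewApplicationConfig(
		config.EnableTracing,
		config.WithTracingMaxItems(1024),
		config.WithForceEvictionWhenBusy(true),
		config.WithLRUEvictionMaxRetries(10),
		config.WithLRUEvictionRetryInterval(2*time.Second),
	)
	_ = appConfig
}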
var EnableParallelBackendRequests = func(o *ApplicationConfig) {
o.ParallelBackendRequests = true
}
@@ -386,6 +425,12 @@ func WithDebug(debug bool) AppOption {
}
}
func WithTracingMaxItems(items int) AppOption {
return func(o *ApplicationConfig) {
o.TracingMaxItems = items
}
}
func WithGeneratedContentDir(generatedContentDir string) AppOption {
return func(o *ApplicationConfig) {
o.GeneratedContentDir = generatedContentDir
@@ -505,10 +550,14 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
parallelBackendRequests := o.ParallelBackendRequests
memoryReclaimerEnabled := o.MemoryReclaimerEnabled
memoryReclaimerThreshold := o.MemoryReclaimerThreshold
forceEvictionWhenBusy := o.ForceEvictionWhenBusy
lruEvictionMaxRetries := o.LRUEvictionMaxRetries
threads := o.Threads
contextSize := o.ContextSize
f16 := o.F16
debug := o.Debug
tracingMaxItems := o.TracingMaxItems
enableTracing := o.EnableTracing
cors := o.CORS
csrf := o.CSRF
corsAllowOrigins := o.CORSAllowOrigins
@@ -539,6 +588,12 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
} else {
watchdogInterval = "2s" // default
}
var lruEvictionRetryInterval string
if o.LRUEvictionRetryInterval > 0 {
lruEvictionRetryInterval = o.LRUEvictionRetryInterval.String()
} else {
lruEvictionRetryInterval = "1s" // default
}
return RuntimeSettings{
WatchdogEnabled: &watchdogEnabled,
@@ -552,10 +607,15 @@ func (o *ApplicationConfig) ToRuntimeSettings() RuntimeSettings {
ParallelBackendRequests: &parallelBackendRequests,
MemoryReclaimerEnabled: &memoryReclaimerEnabled,
MemoryReclaimerThreshold: &memoryReclaimerThreshold,
ForceEvictionWhenBusy: &forceEvictionWhenBusy,
LRUEvictionMaxRetries: &lruEvictionMaxRetries,
LRUEvictionRetryInterval: &lruEvictionRetryInterval,
Threads: &threads,
ContextSize: &contextSize,
F16: &f16,
Debug: &debug,
TracingMaxItems: &tracingMaxItems,
EnableTracing: &enableTracing,
CORS: &cors,
CSRF: &csrf,
CORSAllowOrigins: &corsAllowOrigins,
@@ -644,6 +704,20 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
requireRestart = true
}
}
if settings.ForceEvictionWhenBusy != nil {
o.ForceEvictionWhenBusy = *settings.ForceEvictionWhenBusy
// This setting doesn't require a restart; it can be updated dynamically
}
if settings.LRUEvictionMaxRetries != nil {
o.LRUEvictionMaxRetries = *settings.LRUEvictionMaxRetries
// This setting doesn't require a restart; it can be updated dynamically
}
if settings.LRUEvictionRetryInterval != nil {
if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
o.LRUEvictionRetryInterval = dur
// This setting doesn't require a restart; it can be updated dynamically
}
}
if settings.Threads != nil {
o.Threads = *settings.Threads
}
@@ -656,6 +730,12 @@ func (o *ApplicationConfig) ApplyRuntimeSettings(settings *RuntimeSettings) (req
if settings.Debug != nil {
o.Debug = *settings.Debug
}
if settings.EnableTracing != nil {
o.EnableTracing = *settings.EnableTracing
}
if settings.TracingMaxItems != nil {
o.TracingMaxItems = *settings.TracingMaxItems
}
if settings.CORS != nil {
o.CORS = *settings.CORS
}

View File

@@ -501,7 +501,13 @@ func (c *ModelConfig) Validate() (bool, error) {
if !re.MatchString(c.Backend) {
return false, fmt.Errorf("invalid backend name: %s", c.Backend)
}
return true, nil
}
// Validate MCP configuration if present
if c.MCP.Servers != "" || c.MCP.Stdio != "" {
if _, _, err := c.MCP.MCPConfigFromYAML(); err != nil {
return false, fmt.Errorf("invalid MCP configuration: %w", err)
}
}
return true, nil

View File

@@ -169,8 +169,10 @@ func (bcl *ModelConfigLoader) LoadMultipleModelConfigsSingleFile(file string, op
}
for _, cc := range c {
if valid, _ := cc.Validate(); valid {
if valid, err := cc.Validate(); valid {
bcl.configs[cc.Name] = *cc
} else {
xlog.Warn("skipping invalid model config", "name", cc.Name, "error", err)
}
}
return nil
@@ -184,9 +186,12 @@ func (bcl *ModelConfigLoader) ReadModelConfig(file string, opts ...ConfigLoaderO
return fmt.Errorf("ReadModelConfig cannot read config file %q: %w", file, err)
}
if valid, _ := c.Validate(); valid {
if valid, err := c.Validate(); valid {
bcl.configs[c.Name] = *c
} else {
if err != nil {
return fmt.Errorf("config is not valid: %w", err)
}
return fmt.Errorf("config is not valid")
}
@@ -364,10 +369,10 @@ func (bcl *ModelConfigLoader) LoadModelConfigsFromPath(path string, opts ...Conf
xlog.Error("LoadModelConfigsFromPath cannot read config file", "error", err, "File Name", file.Name())
continue
}
if valid, _ := c.Validate(); valid {
if valid, validationErr := c.Validate(); valid {
bcl.configs[c.Name] = *c
} else {
xlog.Error("config is not valid", "error", err, "Name", c.Name)
xlog.Error("config is not valid", "error", validationErr, "Name", c.Name)
}
}

View File

@@ -166,4 +166,63 @@ parameters:
Expect(i.HasUsecases(FLAG_COMPLETION)).To(BeTrue())
Expect(i.HasUsecases(FLAG_CHAT)).To(BeTrue())
})
It("Test Validate with invalid MCP config", func() {
tmp, err := os.CreateTemp("", "config.yaml")
Expect(err).To(BeNil())
defer os.Remove(tmp.Name())
_, err = tmp.WriteString(
`name: test-mcp
backend: "llama-cpp"
mcp:
stdio: |
{
"mcpServers": {
"ddg": {
"command": "/docker/docker",
"args": ["run", "-i"]
}
"weather": {
"command": "/docker/docker",
"args": ["run", "-i"]
}
}
}`)
Expect(err).ToNot(HaveOccurred())
config, err := readModelConfigFromFile(tmp.Name())
Expect(err).To(BeNil())
Expect(config).ToNot(BeNil())
valid, err := config.Validate()
Expect(err).To(HaveOccurred())
Expect(valid).To(BeFalse())
Expect(err.Error()).To(ContainSubstring("invalid MCP configuration"))
})
It("Test Validate with valid MCP config", func() {
tmp, err := os.CreateTemp("", "config.yaml")
Expect(err).To(BeNil())
defer os.Remove(tmp.Name())
_, err = tmp.WriteString(
`name: test-mcp-valid
backend: "llama-cpp"
mcp:
stdio: |
{
"mcpServers": {
"ddg": {
"command": "/docker/docker",
"args": ["run", "-i"]
},
"weather": {
"command": "/docker/docker",
"args": ["run", "-i"]
}
}
}`)
Expect(err).ToNot(HaveOccurred())
config, err := readModelConfigFromFile(tmp.Name())
Expect(err).To(BeNil())
Expect(config).ToNot(BeNil())
valid, err := config.Validate()
Expect(err).To(BeNil())
Expect(valid).To(BeTrue())
})
})

View File

@@ -26,11 +26,18 @@ type RuntimeSettings struct {
MemoryReclaimerEnabled *bool `json:"memory_reclaimer_enabled,omitempty"` // Enable memory threshold monitoring
MemoryReclaimerThreshold *float64 `json:"memory_reclaimer_threshold,omitempty"` // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
ForceEvictionWhenBusy *bool `json:"force_eviction_when_busy,omitempty"` // Force eviction even when models have active API calls (default: false for safety)
LRUEvictionMaxRetries *int `json:"lru_eviction_max_retries,omitempty"` // Maximum number of retries when waiting for busy models to become idle (default: 30)
LRUEvictionRetryInterval *string `json:"lru_eviction_retry_interval,omitempty"` // Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)
// Performance settings
Threads *int `json:"threads,omitempty"`
ContextSize *int `json:"context_size,omitempty"`
F16 *bool `json:"f16,omitempty"`
Debug *bool `json:"debug,omitempty"`
Threads *int `json:"threads,omitempty"`
ContextSize *int `json:"context_size,omitempty"`
F16 *bool `json:"f16,omitempty"`
Debug *bool `json:"debug,omitempty"`
EnableTracing *bool `json:"enable_tracing,omitempty"`
TracingMaxItems *int `json:"tracing_max_items,omitempty"`
// Security/CORS settings
CORS *bool `json:"cors,omitempty"`

View File

@@ -123,8 +123,10 @@ func InstallModelFromGallery(
config.Files = append(config.Files, model.AdditionalFiles...)
// TODO model.Overrides could be merged with user overrides (not defined yet)
if err := mergo.Merge(&model.Overrides, req.Overrides, mergo.WithOverride); err != nil {
return err
if req.Overrides != nil {
if err := mergo.Merge(&model.Overrides, req.Overrides, mergo.WithOverride); err != nil {
return err
}
}
installedModel, err := InstallModel(ctx, systemState, installName, &config, model.Overrides, downloadStatus, enforceScan)
@@ -245,8 +247,10 @@ func InstallModel(ctx context.Context, systemState *system.SystemState, nameOver
configMap["name"] = name
if err := mergo.Merge(&configMap, configOverrides, mergo.WithOverride); err != nil {
return nil, err
if configOverrides != nil {
if err := mergo.Merge(&configMap, configOverrides, mergo.WithOverride); err != nil {
return nil, err
}
}
// Write updated config file

View File

@@ -184,6 +184,26 @@ var _ = Describe("Model test", func() {
Expect(err).To(HaveOccurred())
})
It("handles nil configOverrides without panic", func() {
tempdir, err := os.MkdirTemp("", "test")
Expect(err).ToNot(HaveOccurred())
defer os.RemoveAll(tempdir)
c, err := ReadConfigFile[ModelConfig](filepath.Join(os.Getenv("FIXTURES"), "gallery_simple.yaml"))
Expect(err).ToNot(HaveOccurred())
systemState, err := system.GetSystemState(
system.WithModelPath(tempdir),
)
Expect(err).ToNot(HaveOccurred())
_, err = InstallModel(context.TODO(), systemState, "test-model", c, nil, func(string, string, string, float64) {}, true)
Expect(err).ToNot(HaveOccurred())
for _, f := range []string{"cerebras", "cerebras-completion.tmpl", "cerebras-chat.tmpl", "test-model.yaml"} {
_, err = os.Stat(filepath.Join(tempdir, f))
Expect(err).ToNot(HaveOccurred())
}
})
It("does not delete shared model files when one config is deleted", func() {
tempdir, err := os.MkdirTemp("", "test")
Expect(err).ToNot(HaveOccurred())

View File

@@ -53,12 +53,12 @@ type MCPErrorEvent struct {
Message string `json:"message"`
}
// MCPStreamEndpoint is the SSE streaming endpoint for MCP chat completions
// MCPEndpoint is the endpoint for MCP chat completions. It supports an SSE streaming mode, which is not compatible with the OpenAI APIs.
// @Summary Stream MCP chat completions with reasoning, tool calls, and results
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /v1/mcp/chat/completions [post]
func MCPStreamEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
func MCPEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
return func(c echo.Context) error {
ctx := c.Request().Context()
created := int(time.Now().Unix())

View File

@@ -76,6 +76,14 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
})
}
}
if settings.LRUEvictionRetryInterval != nil {
if _, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err != nil {
return c.JSON(http.StatusBadRequest, schema.SettingsResponse{
Success: false,
Error: "Invalid lru_eviction_retry_interval format: " + err.Error(),
})
}
}
// Save to file
if appConfig.DynamicConfigsDir == "" {
@@ -111,6 +119,31 @@ func UpdateSettingsEndpoint(app *application.Application) echo.HandlerFunc {
appConfig.ApiKeys = append(envKeys, runtimeKeys...)
}
// Update watchdog dynamically for settings that don't require restart
if settings.ForceEvictionWhenBusy != nil {
currentWD := app.ModelLoader().GetWatchDog()
if currentWD != nil {
currentWD.SetForceEvictionWhenBusy(*settings.ForceEvictionWhenBusy)
xlog.Info("Updated watchdog force eviction when busy setting", "forceEvictionWhenBusy", *settings.ForceEvictionWhenBusy)
}
}
// Update ModelLoader LRU eviction retry settings dynamically
maxRetries := appConfig.LRUEvictionMaxRetries
retryInterval := appConfig.LRUEvictionRetryInterval
if settings.LRUEvictionMaxRetries != nil {
maxRetries = *settings.LRUEvictionMaxRetries
}
if settings.LRUEvictionRetryInterval != nil {
if dur, err := time.ParseDuration(*settings.LRUEvictionRetryInterval); err == nil {
retryInterval = dur
}
}
if settings.LRUEvictionMaxRetries != nil || settings.LRUEvictionRetryInterval != nil {
app.ModelLoader().SetLRUEvictionRetrySettings(maxRetries, retryInterval)
xlog.Info("Updated LRU eviction retry settings", "maxRetries", maxRetries, "retryInterval", retryInterval)
}
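For reference, the same three fields can also be updated at runtime through the settings endpoint registered in the UI API routes (POST /api/settings). A minimal sketch of the payload, with invented values:

package main

import (
	"encoding/json"
	"fmt"

	"github.com/mudler/LocalAI/core/config"
)

func main() {
	force := true
	retries := 10
	interval := "2s"
	// Only the pointer fields that are set end up in the JSON, thanks to omitempty.
	payload, _ := json.Marshal(config.RuntimeSettings{
		ForceEvictionWhenBusy:    &force,
		LRUEvictionMaxRetries:    &retries,
		LRUEvictionRetryInterval: &interval,
	})
	fmt.Println(string(payload))
	// {"force_eviction_when_busy":true,"lru_eviction_max_retries":10,"lru_eviction_retry_interval":"2s"}
}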
// Check if agent job retention changed
agentJobChanged := settings.AgentJobRetentionDays != nil

View File

@@ -157,16 +157,11 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
negative_prompt = prompts[1]
}
mode := 0
step := config.Step
if step == 0 {
step = 15
}
if input.Mode != 0 {
mode = input.Mode
}
if input.Step != 0 {
step = input.Step
}
@@ -197,7 +192,7 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
inputSrc = inputImages[0]
}
fn, err := backend.ImageGeneration(height, width, mode, step, *config.Seed, positive_prompt, negative_prompt, inputSrc, output, ml, *config, appConfig, refImages)
fn, err := backend.ImageGeneration(height, width, step, *config.Seed, positive_prompt, negative_prompt, inputSrc, output, ml, *config, appConfig, refImages)
if err != nil {
return err
}
@@ -232,6 +227,17 @@ func ImageEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfi
ID: id,
Created: created,
Data: result,
Usage: schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,
InputTokens: 0,
OutputTokens: 0,
InputTokensDetails: &schema.InputTokensDetails{
TextTokens: 0,
ImageTokens: 0,
},
},
}
jsonResult, _ := json.Marshal(resp)

View File

@@ -231,7 +231,7 @@ func InpaintingEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
// Note: ImageGenerationFunc will call into the loaded model's GenerateImage which expects src JSON
// Also pass ref images (orig + mask) so backends that support ref images can use them.
refImages := []string{origRef, maskRef}
fn, err := backend.ImageGenerationFunc(height, width, 0, steps, 0, prompt, "", jsonPath, dst, ml, *cfg, appConfig, refImages)
fn, err := backend.ImageGenerationFunc(height, width, steps, 0, prompt, "", jsonPath, dst, ml, *cfg, appConfig, refImages)
if err != nil {
return err
}
@@ -258,6 +258,17 @@ func InpaintingEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, app
Data: []schema.Item{{
URL: imgPath,
}},
Usage: schema.OpenAIUsage{
PromptTokens: 0,
CompletionTokens: 0,
TotalTokens: 0,
InputTokens: 0,
OutputTokens: 0,
InputTokensDetails: &schema.InputTokensDetails{
TextTokens: 0,
ImageTokens: 0,
},
},
}
// mark success so defer cleanup will not remove output files

View File

@@ -10,9 +10,9 @@ import (
"testing"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/backend"
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/http/middleware"
model "github.com/mudler/LocalAI/pkg/model"
"github.com/stretchr/testify/require"
)
@@ -58,7 +58,7 @@ func TestInpainting_HappyPath(t *testing.T) {
// stub the backend.ImageGenerationFunc
orig := backend.ImageGenerationFunc
backend.ImageGenerationFunc = func(height, width, mode, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {
backend.ImageGenerationFunc = func(height, width, step, seed int, positive_prompt, negative_prompt, src, dst string, loader *model.ModelLoader, modelConfig config.ModelConfig, appConfig *config.ApplicationConfig, refImages []string) (func() error, error) {
fn := func() error {
// write a fake png file to dst
return os.WriteFile(dst, []byte("PNGDATA"), 0644)

View File

@@ -1,148 +0,0 @@
package openai
import (
"context"
"encoding/json"
"errors"
"fmt"
"net"
"time"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/config"
mcpTools "github.com/mudler/LocalAI/core/http/endpoints/mcp"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/google/uuid"
"github.com/mudler/LocalAI/core/schema"
"github.com/mudler/LocalAI/core/templates"
"github.com/mudler/LocalAI/pkg/model"
"github.com/mudler/cogito"
"github.com/mudler/xlog"
)
// MCPCompletionEndpoint is the OpenAI Completion API endpoint https://platform.openai.com/docs/api-reference/completions
// @Summary Generate completions for a given prompt and model.
// @Param request body schema.OpenAIRequest true "query params"
// @Success 200 {object} schema.OpenAIResponse "Response"
// @Router /mcp/v1/completions [post]
func MCPCompletionEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, evaluator *templates.Evaluator, appConfig *config.ApplicationConfig) echo.HandlerFunc {
// We do not support streaming mode (Yet?)
return func(c echo.Context) error {
created := int(time.Now().Unix())
ctx := c.Request().Context()
// Handle Correlation
id := c.Request().Header.Get("X-Correlation-ID")
if id == "" {
id = uuid.New().String()
}
input, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_LOCALAI_REQUEST).(*schema.OpenAIRequest)
if !ok || input.Model == "" {
return echo.ErrBadRequest
}
config, ok := c.Get(middleware.CONTEXT_LOCALS_KEY_MODEL_CONFIG).(*config.ModelConfig)
if !ok || config == nil {
return echo.ErrBadRequest
}
if config.MCP.Servers == "" && config.MCP.Stdio == "" {
return fmt.Errorf("no MCP servers configured")
}
// Get MCP config from model config
remote, stdio, err := config.MCP.MCPConfigFromYAML()
if err != nil {
return fmt.Errorf("failed to get MCP config: %w", err)
}
// Check if we have tools in cache, or we have to have an initial connection
sessions, err := mcpTools.SessionsFromMCPConfig(config.Name, remote, stdio)
if err != nil {
return fmt.Errorf("failed to get MCP sessions: %w", err)
}
if len(sessions) == 0 {
return fmt.Errorf("no working MCP servers found")
}
fragment := cogito.NewEmptyFragment()
for _, message := range input.Messages {
fragment = fragment.AddMessage(message.Role, message.StringContent)
}
_, port, err := net.SplitHostPort(appConfig.APIAddress)
if err != nil {
return err
}
apiKey := ""
if appConfig.ApiKeys != nil {
apiKey = appConfig.ApiKeys[0]
}
ctxWithCancellation, cancel := context.WithCancel(ctx)
defer cancel()
// TODO: instead of connecting to the API, we should just wire this internally
// and act like completion.go.
// We can do this as cogito expects an interface and we can create one that
// we satisfy to just call internally ComputeChoices
defaultLLM := cogito.NewOpenAILLM(config.Name, apiKey, "http://127.0.0.1:"+port)
// Build cogito options using the consolidated method
cogitoOpts := config.BuildCogitoOptions()
cogitoOpts = append(
cogitoOpts,
cogito.WithContext(ctxWithCancellation),
cogito.WithMCPs(sessions...),
cogito.WithStatusCallback(func(s string) {
xlog.Debug("[model agent] Status", "model", config.Name, "status", s)
}),
cogito.WithReasoningCallback(func(s string) {
xlog.Debug("[model agent] Reasoning", "model", config.Name, "reasoning", s)
}),
cogito.WithToolCallBack(func(t *cogito.ToolChoice, state *cogito.SessionState) cogito.ToolCallDecision {
xlog.Debug("[model agent] Tool call", "model", config.Name, "tool", t.Name, "reasoning", t.Reasoning, "arguments", t.Arguments)
return cogito.ToolCallDecision{
Approved: true,
}
}),
cogito.WithToolCallResultCallback(func(t cogito.ToolStatus) {
xlog.Debug("[model agent] Tool call result", "model", config.Name, "tool", t.Name, "result", t.Result, "tool_arguments", t.ToolArguments)
}),
)
f, err := cogito.ExecuteTools(
defaultLLM, fragment,
cogitoOpts...,
)
if err != nil && !errors.Is(err, cogito.ErrNoToolSelected) {
return err
}
f, err = defaultLLM.Ask(ctx, f)
if err != nil {
return err
}
resp := &schema.OpenAIResponse{
ID: id,
Created: created,
Model: input.Model, // we have to return what the user sent here, due to OpenAI spec.
Choices: []schema.Choice{{Message: &schema.Message{Role: "assistant", Content: &f.LastMessage().Content}}},
Object: "text_completion",
}
jsonResult, _ := json.Marshal(resp)
xlog.Debug("Response", "response", string(jsonResult))
// Return the prediction in the response body
return c.JSON(200, resp)
}
}

View File

@@ -0,0 +1,156 @@
package middleware
import (
"bytes"
"github.com/emirpasic/gods/v2/queues/circularbuffer"
"io"
"net/http"
"sort"
"sync"
"time"
"github.com/labstack/echo/v4"
"github.com/mudler/LocalAI/core/application"
"github.com/mudler/xlog"
)
type APIExchangeRequest struct {
Method string `json:"method"`
Path string `json:"path"`
Headers *http.Header `json:"headers"`
Body *[]byte `json:"body"`
}
type APIExchangeResponse struct {
Status int `json:"status"`
Headers *http.Header `json:"headers"`
Body *[]byte `json:"body"`
}
type APIExchange struct {
Timestamp time.Time `json:"timestamp"`
Request APIExchangeRequest `json:"request"`
Response APIExchangeResponse `json:"response"`
}
var traceBuffer *circularbuffer.Queue[APIExchange]
var mu sync.Mutex
var logChan = make(chan APIExchange, 100)
type bodyWriter struct {
http.ResponseWriter
body *bytes.Buffer
}
func (w *bodyWriter) Write(b []byte) (int, error) {
w.body.Write(b)
return w.ResponseWriter.Write(b)
}
func (w *bodyWriter) Flush() {
if flusher, ok := w.ResponseWriter.(http.Flusher); ok {
flusher.Flush()
}
}
// TraceMiddleware intercepts and logs JSON API requests and responses
func TraceMiddleware(app *application.Application) echo.MiddlewareFunc {
if app.ApplicationConfig().EnableTracing && traceBuffer == nil {
traceBuffer = circularbuffer.New[APIExchange](app.ApplicationConfig().TracingMaxItems)
go func() {
for exchange := range logChan {
mu.Lock()
traceBuffer.Enqueue(exchange)
mu.Unlock()
}
}()
}
return func(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
if !app.ApplicationConfig().EnableTracing {
return next(c)
}
if c.Request().Header.Get("Content-Type") != "application/json" {
return next(c)
}
body, err := io.ReadAll(c.Request().Body)
if err != nil {
xlog.Error("Failed to read request body")
return err
}
// Restore the body for downstream handlers
c.Request().Body = io.NopCloser(bytes.NewBuffer(body))
startTime := time.Now()
// Wrap response writer to capture body
resBody := new(bytes.Buffer)
mw := &bodyWriter{
ResponseWriter: c.Response().Writer,
body: resBody,
}
c.Response().Writer = mw
err = next(c)
if err != nil {
c.Response().Writer = mw.ResponseWriter // Restore original writer if error
return err
}
// Create exchange log
requestHeaders := c.Request().Header.Clone()
requestBody := make([]byte, len(body))
copy(requestBody, body)
responseHeaders := c.Response().Header().Clone()
responseBody := make([]byte, resBody.Len())
copy(responseBody, resBody.Bytes())
exchange := APIExchange{
Timestamp: startTime,
Request: APIExchangeRequest{
Method: c.Request().Method,
Path: c.Path(),
Headers: &requestHeaders,
Body: &requestBody,
},
Response: APIExchangeResponse{
Status: c.Response().Status,
Headers: &responseHeaders,
Body: &responseBody,
},
}
select {
case logChan <- exchange:
default:
xlog.Warn("Trace channel full, dropping trace")
}
return nil
}
}
}
// GetTraces returns a copy of the logged API exchanges for display
func GetTraces() []APIExchange {
mu.Lock()
// Guard against tracing being disabled: the buffer is only initialised by TraceMiddleware
if traceBuffer == nil {
mu.Unlock()
return nil
}
traces := traceBuffer.Values()
mu.Unlock()
sort.Slice(traces, func(i, j int) bool {
return traces[i].Timestamp.Before(traces[j].Timestamp)
})
return traces
}
// ClearTraces clears the in-memory traces
func ClearTraces() {
mu.Lock()
if traceBuffer != nil {
traceBuffer.Clear()
}
mu.Unlock()
}
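A note for consumers of GET /api/traces: request and response bodies are held as []byte, which encoding/json serializes as base64 strings, and headers serialize as maps of string arrays. A minimal sketch of how one exchange marshals (all values invented):

package main

import (
	"encoding/json"
	"fmt"
	"net/http"
	"time"

	"github.com/mudler/LocalAI/core/http/middleware"
)

func main() {
	reqBody := []byte(`{"model":"gpt-4o"}`)
	resBody := []byte(`{"id":"123"}`)
	headers := http.Header{"Content-Type": {"application/json"}}
	exchange := middleware.APIExchange{
		Timestamp: time.Date(2026, 1, 2, 21, 0, 0, 0, time.UTC),
		Request:   middleware.APIExchangeRequest{Method: "POST", Path: "/v1/chat/completions", Headers: &headers, Body: &reqBody},
		Response:  middleware.APIExchangeResponse{Status: 200, Headers: &headers, Body: &resBody},
	}
	out, _ := json.Marshal(exchange)
	fmt.Println(string(out))
	// ..."request":{...,"body":"eyJtb2RlbCI6ImdwdC00byJ9"}... (bodies appear base64-encoded)
}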

View File

@@ -137,9 +137,10 @@ func RegisterLocalAIRoutes(router *echo.Echo,
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TOKENIZE)),
requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TokenizeRequest) }))
// MCP Stream endpoint
// MCP endpoint - supports both streaming and non-streaming modes
// Note: streaming mode is NOT compatible with the OpenAI APIs: it streams a richer set of states (reasoning, tool calls, and tool results).
if evaluator != nil {
mcpStreamHandler := localai.MCPStreamEndpoint(cl, ml, evaluator, appConfig)
mcpStreamHandler := localai.MCPEndpoint(cl, ml, evaluator, appConfig)
mcpStreamMiddleware := []echo.MiddlewareFunc{
requestExtractor.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
requestExtractor.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
@@ -154,6 +155,7 @@ func RegisterLocalAIRoutes(router *echo.Echo,
}
router.POST("/v1/mcp/chat/completions", mcpStreamHandler, mcpStreamMiddleware...)
router.POST("/mcp/v1/chat/completions", mcpStreamHandler, mcpStreamMiddleware...)
router.POST("/mcp/chat/completions", mcpStreamHandler, mcpStreamMiddleware...)
}
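Since the consolidated handler accepts the same OpenAI-style request schema on all three paths, a non-streaming call can be sketched as below (address, port, and model name are invented for the example):

package main

import (
	"bytes"
	"fmt"
	"io"
	"net/http"
)

func main() {
	body := []byte(`{"model":"test-mcp","messages":[{"role":"user","content":"What is the weather like?"}]}`)
	// Any of the three routes registered above would work equally well.
	resp, err := http.Post("http://127.0.0.1:8080/v1/mcp/chat/completions", "application/json", bytes.NewReader(body))
	if err != nil {
		panic(err)
	}
	defer resp.Body.Close()
	out, _ := io.ReadAll(resp.Body)
	fmt.Println(string(out))
}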
// Agent job routes

View File

@@ -14,16 +14,18 @@ func RegisterOpenAIRoutes(app *echo.Echo,
re *middleware.RequestExtractor,
application *application.Application) {
// openAI compatible API endpoint
traceMiddleware := middleware.TraceMiddleware(application)
// realtime
// TODO: Modify/disable the API key middleware for this endpoint to allow ephemeral keys created by sessions
app.GET("/v1/realtime", openai.Realtime(application))
app.POST("/v1/realtime/sessions", openai.RealtimeTranscriptionSession(application))
app.POST("/v1/realtime/transcription_session", openai.RealtimeTranscriptionSession(application))
app.POST("/v1/realtime/sessions", openai.RealtimeTranscriptionSession(application), traceMiddleware)
app.POST("/v1/realtime/transcription_session", openai.RealtimeTranscriptionSession(application), traceMiddleware)
// chat
chatHandler := openai.ChatEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
chatMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
func(next echo.HandlerFunc) echo.HandlerFunc {
@@ -41,6 +43,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
// edit
editHandler := openai.EditEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
editMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EDIT)),
re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
@@ -59,6 +62,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
// completion
completionHandler := openai.CompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
completionMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_COMPLETION)),
re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
@@ -75,26 +79,10 @@ func RegisterOpenAIRoutes(app *echo.Echo,
app.POST("/completions", completionHandler, completionMiddleware...)
app.POST("/v1/engines/:model/completions", completionHandler, completionMiddleware...)
// MCPcompletion
mcpCompletionHandler := openai.MCPCompletionEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.TemplatesEvaluator(), application.ApplicationConfig())
mcpCompletionMiddleware := []echo.MiddlewareFunc{
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_CHAT)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
func(next echo.HandlerFunc) echo.HandlerFunc {
return func(c echo.Context) error {
if err := re.SetOpenAIRequest(c); err != nil {
return err
}
return next(c)
}
},
}
app.POST("/mcp/v1/chat/completions", mcpCompletionHandler, mcpCompletionMiddleware...)
app.POST("/mcp/chat/completions", mcpCompletionHandler, mcpCompletionMiddleware...)
// embeddings
embeddingHandler := openai.EmbeddingsEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
embeddingMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_EMBEDDINGS)),
re.BuildConstantDefaultModelNameMiddleware("gpt-4o"),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
@@ -113,6 +101,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
audioHandler := openai.TranscriptEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
audioMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TRANSCRIPT)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
func(next echo.HandlerFunc) echo.HandlerFunc {
@@ -130,6 +119,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
audioSpeechHandler := localai.TTSEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
audioSpeechMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_TTS)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.TTSRequest) }),
}
@@ -140,6 +130,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
// images
imageHandler := openai.ImageEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
imageMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
// Default: use the first available image generation model
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_IMAGE)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
@@ -164,6 +155,7 @@ func RegisterOpenAIRoutes(app *echo.Echo,
// videos (OpenAI-compatible endpoints mapped to LocalAI video handler)
videoHandler := openai.VideoEndpoint(application.ModelConfigLoader(), application.ModelLoader(), application.ApplicationConfig())
videoMiddleware := []echo.MiddlewareFunc{
traceMiddleware,
re.BuildFilteredFirstAvailableDefaultModel(config.BuildUsecaseFilterFn(config.FLAG_VIDEO)),
re.SetModelAndConfig(func() schema.LocalAIRequest { return new(schema.OpenAIRequest) }),
func(next echo.HandlerFunc) echo.HandlerFunc {

View File

@@ -317,4 +317,24 @@ func RegisterUIRoutes(app *echo.Echo,
// Render index
return c.Render(200, "views/tts", summary)
})
// Traces UI
app.GET("/traces", func(c echo.Context) error {
summary := map[string]interface{}{
"Title": "LocalAI - Traces",
"BaseURL": middleware.BaseURL(c),
"Version": internal.PrintableVersion(),
}
return c.Render(200, "views/traces", summary)
})
app.GET("/api/traces", func(c echo.Context) error {
return c.JSON(200, middleware.GetTraces())
})
app.POST("/api/traces/clear", func(c echo.Context) error {
middleware.ClearTraces()
return c.NoContent(204)
})
}

View File

@@ -16,6 +16,7 @@ import (
"github.com/mudler/LocalAI/core/config"
"github.com/mudler/LocalAI/core/gallery"
"github.com/mudler/LocalAI/core/http/endpoints/localai"
"github.com/mudler/LocalAI/core/http/middleware"
"github.com/mudler/LocalAI/core/p2p"
"github.com/mudler/LocalAI/core/services"
"github.com/mudler/LocalAI/pkg/model"
@@ -947,4 +948,24 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model
app.GET("/api/settings", localai.GetSettingsEndpoint(applicationInstance))
app.POST("/api/settings", localai.UpdateSettingsEndpoint(applicationInstance))
}
// Traces API
app.GET("/api/traces", func(c echo.Context) error {
if !appConfig.EnableTracing {
return c.JSON(503, map[string]any{
"error": "Tracing disabled",
})
}
traces := middleware.GetTraces()
return c.JSON(200, map[string]interface{}{
"traces": traces,
})
})
app.POST("/api/traces/clear", func(c echo.Context) error {
middleware.ClearTraces()
return c.JSON(200, map[string]interface{}{
"message": "Traces cleared",
})
})
}

View File

@@ -1,61 +1,281 @@
// Helper function to convert file to base64
function fileToBase64(file) {
return new Promise((resolve, reject) => {
const reader = new FileReader();
reader.onload = () => {
// Remove data:image/...;base64, prefix if present
const base64 = reader.result.split(',')[1] || reader.result;
resolve(base64);
};
reader.onerror = reject;
reader.readAsDataURL(file);
});
}
// Helper function to read multiple files
async function filesToBase64Array(fileList) {
const base64Array = [];
for (let i = 0; i < fileList.length; i++) {
const base64 = await fileToBase64(fileList[i]);
base64Array.push(base64);
}
return base64Array;
}
function genImage(event) {
event.preventDefault();
const input = document.getElementById("input").value;
promptDallE(input);
promptDallE();
}
async function promptDallE(input) {
document.getElementById("loader").style.display = "block";
document.getElementById("input").value = "";
document.getElementById("input").disabled = true;
const model = document.getElementById("image-model").value;
const size = document.getElementById("image-size").value;
const response = await fetch("v1/images/generations", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify({
model: model,
steps: 10,
prompt: input,
n: 1,
size: size,
}),
});
const json = await response.json();
if (json.error) {
// Display error if there is one
var div = document.getElementById('result'); // Get the div by its ID
div.innerHTML = '<p style="color:red;">' + json.error.message + '</p>';
async function promptDallE() {
const loader = document.getElementById("loader");
const input = document.getElementById("input");
const generateBtn = document.getElementById("generate-btn");
const resultDiv = document.getElementById("result");
const resultPlaceholder = document.getElementById("result-placeholder");
// Show loader and disable form
loader.classList.remove("hidden");
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
input.disabled = true;
generateBtn.disabled = true;
// Store the prompt for later restoration
const prompt = input.value.trim();
if (!prompt) {
alert("Please enter a prompt");
loader.classList.add("hidden");
if (resultPlaceholder) {
resultPlaceholder.style.display = "flex";
}
input.disabled = false;
generateBtn.disabled = false;
return;
}
const url = json.data[0].url;
var div = document.getElementById('result'); // Get the div by its ID
var img = document.createElement('img'); // Create a new img element
img.src = url; // Set the source of the image
img.alt = 'Generated image'; // Set the alt text of the image
// Collect all form values
const model = document.getElementById("image-model").value;
const size = document.getElementById("image-size").value;
const negativePrompt = document.getElementById("negative-prompt").value.trim();
const n = parseInt(document.getElementById("image-count").value) || 1;
const stepInput = document.getElementById("image-steps").value.trim();
const step = stepInput ? parseInt(stepInput) : undefined;
const seedInput = document.getElementById("image-seed").value.trim();
const seed = seedInput ? parseInt(seedInput) : undefined;
div.innerHTML = ''; // Clear the existing content of the div
div.appendChild(img); // Add the new img element to the div
// Prepare request body
// Combine prompt and negative prompt with "|" separator (backend expects this format)
let combinedPrompt = prompt;
if (negativePrompt) {
combinedPrompt = prompt + "|" + negativePrompt;
}
document.getElementById("loader").style.display = "none";
document.getElementById("input").disabled = false;
document.getElementById("input").focus();
const requestBody = {
model: model,
prompt: combinedPrompt,
n: n,
size: size,
};
if (step !== undefined) {
requestBody.step = step;
}
if (seed !== undefined) {
requestBody.seed = seed;
}
// Handle file inputs
try {
// Source image (single file for img2img)
const sourceImageInput = document.getElementById("source-image");
if (sourceImageInput.files.length > 0) {
const base64 = await fileToBase64(sourceImageInput.files[0]);
requestBody.file = base64;
}
// Reference images (collect from all dynamic inputs)
const refImageInputs = document.querySelectorAll('.reference-image-file');
const refImageFiles = [];
for (const input of refImageInputs) {
if (input.files.length > 0) {
refImageFiles.push(input.files[0]);
}
}
if (refImageFiles.length > 0) {
const base64Array = await filesToBase64Array(refImageFiles);
requestBody.ref_images = base64Array;
}
} catch (error) {
console.error("Error processing image files:", error);
resultDiv.innerHTML = '<p class="text-xs text-red-500 p-2">Error processing image files: ' + error.message + '</p>';
loader.classList.add("hidden");
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
input.disabled = false;
generateBtn.disabled = false;
return;
}
// Make API request
try {
const response = await fetch("v1/images/generations", {
method: "POST",
headers: {
"Content-Type": "application/json",
},
body: JSON.stringify(requestBody),
});
const json = await response.json();
if (json.error) {
// Display error
resultDiv.innerHTML = '<p class="text-xs text-red-500 p-2">Error: ' + json.error.message + '</p>';
loader.classList.add("hidden");
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
input.disabled = false;
generateBtn.disabled = false;
return;
}
// Clear result div and hide placeholder
resultDiv.innerHTML = '';
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
// Display all generated images
if (json.data && json.data.length > 0) {
json.data.forEach((item, index) => {
const imageContainer = document.createElement("div");
imageContainer.className = "mb-4 bg-[var(--color-bg-primary)]/50 border border-[#1E293B] rounded-lg p-2";
// Create image element
const img = document.createElement("img");
if (item.url) {
img.src = item.url;
} else if (item.b64_json) {
img.src = "data:image/png;base64," + item.b64_json;
} else {
return; // Skip invalid items
}
img.alt = prompt;
img.className = "w-full h-auto rounded-lg mb-2";
imageContainer.appendChild(img);
// Create caption container
const captionDiv = document.createElement("div");
captionDiv.className = "mt-2 p-2 bg-[var(--color-bg-secondary)] rounded-lg";
// Prompt caption
const promptCaption = document.createElement("p");
promptCaption.className = "text-xs text-[var(--color-text-primary)] mb-1.5";
promptCaption.innerHTML = '<strong>Prompt:</strong> ' + escapeHtml(prompt);
captionDiv.appendChild(promptCaption);
// Negative prompt if provided
if (negativePrompt) {
const negativeCaption = document.createElement("p");
negativeCaption.className = "text-xs text-[var(--color-text-secondary)] mb-1.5";
negativeCaption.innerHTML = '<strong>Negative Prompt:</strong> ' + escapeHtml(negativePrompt);
captionDiv.appendChild(negativeCaption);
}
// Generation details
const detailsDiv = document.createElement("div");
detailsDiv.className = "flex flex-wrap gap-3 text-[10px] text-[var(--color-text-secondary)] mt-1.5";
detailsDiv.innerHTML = `
<span><strong>Size:</strong> ${size}</span>
${step !== undefined ? `<span><strong>Steps:</strong> ${step}</span>` : ''}
${seed !== undefined ? `<span><strong>Seed:</strong> ${seed}</span>` : ''}
`;
captionDiv.appendChild(detailsDiv);
// Copy prompt button
const copyBtn = document.createElement("button");
copyBtn.className = "mt-1.5 px-2 py-0.5 text-[10px] bg-[var(--color-primary)] text-white rounded hover:opacity-80";
copyBtn.innerHTML = '<i class="fas fa-copy mr-1"></i>Copy Prompt';
copyBtn.onclick = () => {
navigator.clipboard.writeText(prompt).then(() => {
copyBtn.innerHTML = '<i class="fas fa-check mr-1"></i>Copied!';
setTimeout(() => {
copyBtn.innerHTML = '<i class="fas fa-copy mr-1"></i>Copy Prompt';
}, 2000);
});
};
captionDiv.appendChild(copyBtn);
imageContainer.appendChild(captionDiv);
resultDiv.appendChild(imageContainer);
});
// Hide placeholder when images are displayed
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
} else {
resultDiv.innerHTML = '<p class="text-xs text-[var(--color-text-secondary)] p-2">No images were generated.</p>';
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
}
// Preserve prompt in input field (don't clear it)
// The prompt is already in the input field, so we don't need to restore it
} catch (error) {
console.error("Error generating image:", error);
resultDiv.innerHTML = '<p class="text-xs text-red-500 p-2">Error: ' + error.message + '</p>';
if (resultPlaceholder) {
resultPlaceholder.style.display = "none";
}
} finally {
// Hide loader and re-enable form
loader.classList.add("hidden");
input.disabled = false;
generateBtn.disabled = false;
input.focus();
}
}
document.getElementById("input").focus();
document.getElementById("genimage").addEventListener("submit", genImage);
// Helper function to escape HTML
function escapeHtml(text) {
const div = document.createElement("div");
div.textContent = text;
return div.innerHTML;
}
// Handle Enter key press in the prompt input
document.getElementById("input").addEventListener("keypress", function(event) {
if (event.key === "Enter") {
// Initialize
document.addEventListener("DOMContentLoaded", function() {
const input = document.getElementById("input");
const form = document.getElementById("genimage");
if (input) {
input.focus();
}
if (form) {
form.addEventListener("submit", genImage);
}
// Handle Enter key press in the prompt input (but allow Shift+Enter for new lines)
if (input) {
input.addEventListener("keydown", function(event) {
if (event.key === "Enter" && !event.shiftKey) {
event.preventDefault();
genImage(event);
}
});
}
});
}
document.getElementById("loader").style.display = "none";
// Hide loader initially
const loader = document.getElementById("loader");
if (loader) {
loader.classList.add("hidden");
}
});

View File

@@ -833,8 +833,24 @@ function importModel() {
});
if (!response.ok) {
const error = await response.json().catch(() => ({ error: 'Failed to start import' }));
throw new Error(error.error || 'Failed to start import');
const errorData = await response.json().catch(() => ({ message: 'Failed to start import' }));
// Extract error message from various possible formats
// Handle nested error object: {"error": {"message": "...", "code": 500}}
let errorMessage = 'Failed to start import';
if (errorData.error) {
if (typeof errorData.error === 'object' && errorData.error.message) {
errorMessage = errorData.error.message;
} else if (typeof errorData.error === 'string') {
errorMessage = errorData.error;
}
} else if (errorData.message) {
errorMessage = errorData.message;
} else if (errorData.Error) {
errorMessage = errorData.Error;
} else {
errorMessage = JSON.stringify(errorData);
}
throw new Error(errorMessage);
}
const result = await response.json();
@@ -1053,6 +1069,27 @@ parameters:
body: yamlContent
});
if (!response.ok) {
const errorData = await response.json().catch(() => ({ message: 'Failed to save configuration' }));
// Extract error message from various possible formats
// Handle nested error object: {"error": {"message": "...", "code": 500}}
let errorMessage = 'Failed to save configuration';
if (errorData.error) {
if (typeof errorData.error === 'object' && errorData.error.message) {
errorMessage = errorData.error.message;
} else if (typeof errorData.error === 'string') {
errorMessage = errorData.error;
}
} else if (errorData.message) {
errorMessage = errorData.message;
} else if (errorData.Error) {
errorMessage = errorData.Error;
} else {
errorMessage = JSON.stringify(errorData);
}
throw new Error(errorMessage);
}
const result = await response.json();
if (result.success) {
@@ -1063,7 +1100,8 @@ parameters:
}, 2000);
}
} else {
this.showAlert('error', result.error || 'Failed to save configuration');
const errorMessage = result.message || result.error || result.Error || 'Failed to save configuration';
this.showAlert('error', errorMessage);
}
} catch (error) {
this.showAlert('error', 'Failed to save: ' + error.message);

View File

@@ -37,6 +37,9 @@
<a href="agent-jobs" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fas fa-tasks text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>Agent Jobs
</a>
<a href="traces/" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fas fa-chart-line text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>Traces
</a>
<a href="swagger/" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] px-2 py-2 rounded-lg transition duration-300 ease-in-out hover:bg-[var(--color-bg-secondary)] flex items-center group text-sm">
<i class="fas fa-code text-[var(--color-primary)] mr-1.5 text-sm group-hover:scale-110 transition-transform"></i>API
</a>
@@ -94,6 +97,9 @@
<a href="agent-jobs" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fas fa-tasks text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>Agent Jobs
</a>
<a href="traces/" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fas fa-chart-line text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>Traces
</a>
<a href="swagger/" class="block text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] px-3 py-2 rounded-lg transition duration-300 ease-in-out flex items-center text-sm">
<i class="fas fa-code text-[var(--color-primary)] mr-3 w-5 text-center text-sm"></i>API
</a>

View File

@@ -10,7 +10,7 @@
<!-- Notifications -->
<div class="fixed top-20 right-4 z-50 space-y-2" style="max-width: 400px;">
<template x-for="notification in notifications" :key="notification.id">
<div x-show="true"
<div x-show="true"
x-transition:enter="transition ease-out duration-200"
x-transition:enter-start="opacity-0"
x-transition:enter-end="opacity-100"
@@ -39,7 +39,7 @@
<h1 class="h2">
Application Settings
</h1>
<a href="/manage"
<a href="/manage"
class="inline-flex items-center text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] transition-colors">
<i class="fas fa-arrow-left mr-2 text-sm"></i>
<span class="text-sm">Back to Manage</span>
@@ -68,7 +68,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable automatic monitoring of backend processes</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.watchdog_enabled"
<input type="checkbox" x-model="settings.watchdog_enabled"
@change="updateWatchdogEnabled()"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
@@ -82,7 +82,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Automatically stop backends that are idle for too long</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.watchdog_idle_enabled"
<input type="checkbox" x-model="settings.watchdog_idle_enabled"
:disabled="!settings.watchdog_enabled"
class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
@@ -93,7 +93,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Idle Timeout</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Time before an idle backend is stopped (e.g., 15m, 1h)</p>
<input type="text" x-model="settings.watchdog_idle_timeout"
<input type="text" x-model="settings.watchdog_idle_timeout"
:disabled="!settings.watchdog_idle_enabled"
placeholder="15m"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
@@ -107,7 +107,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Automatically stop backends that are busy for too long (stuck processes)</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.watchdog_busy_enabled"
<input type="checkbox" x-model="settings.watchdog_busy_enabled"
:disabled="!settings.watchdog_enabled"
class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
@@ -118,7 +118,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Busy Timeout</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Time before a busy backend is stopped (e.g., 5m, 30m)</p>
<input type="text" x-model="settings.watchdog_busy_timeout"
<input type="text" x-model="settings.watchdog_busy_timeout"
:disabled="!settings.watchdog_busy_enabled"
placeholder="5m"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
@@ -129,13 +129,50 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Check Interval</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">How often the watchdog checks backends and memory usage (e.g., 2s, 30s)</p>
<input type="text" x-model="settings.watchdog_interval"
<input type="text" x-model="settings.watchdog_interval"
:disabled="!settings.watchdog_enabled"
placeholder="2s"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- Force Eviction When Busy -->
<div class="flex items-center justify-between">
<div>
<label class="text-sm font-medium text-[var(--color-text-primary)]">Force Eviction When Busy</label>
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Allow evicting models even when they have active API calls (default: disabled for safety)</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.force_eviction_when_busy"
:disabled="!settings.watchdog_enabled"
class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
</label>
</div>
<!-- LRU Eviction Max Retries -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Max Retries</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of retries when waiting for busy models to become idle (default: 30)</p>
<input type="number" x-model="settings.lru_eviction_max_retries"
:disabled="!settings.watchdog_enabled"
min="1"
placeholder="30"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- LRU Eviction Retry Interval -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">LRU Eviction Retry Interval</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Interval between retries when waiting for busy models (e.g., 1s, 2s) (default: 1s)</p>
<input type="text" x-model="settings.lru_eviction_retry_interval"
:disabled="!settings.watchdog_enabled"
placeholder="1s"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary-border)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary-border)]"
:class="!settings.watchdog_enabled ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- Memory Reclaimer Subsection -->
<div class="mt-6 pt-4 border-t border-[var(--color-primary-border)]/20">
<h3 class="text-md font-medium text-[var(--color-text-primary)] mb-3 flex items-center">
@@ -159,7 +196,7 @@
<template x-for="gpu in resourceData.gpus" :key="gpu.index">
<div class="flex items-center justify-between text-xs">
<span class="text-[var(--color-text-primary)] truncate max-w-[200px]" x-text="gpu.name"></span>
<span class="font-mono"
<span class="font-mono"
:class="gpu.usage_percent > 90 ? 'text-red-400' : gpu.usage_percent > 70 ? 'text-yellow-400' : 'text-green-400'"
x-text="`${gpu.usage_percent.toFixed(1)}%`"></span>
</div>
@@ -169,7 +206,7 @@
<template x-if="resourceData && resourceData.available && resourceData.type === 'ram'">
<div class="flex items-center justify-between text-xs">
<span class="text-[var(--color-text-primary)]">System RAM</span>
<span class="font-mono"
<span class="font-mono"
:class="resourceData.ram.usage_percent > 90 ? 'text-red-400' : resourceData.ram.usage_percent > 70 ? 'text-yellow-400' : 'text-green-400'"
x-text="`${resourceData.ram.usage_percent.toFixed(1)}%`"></span>
</div>
@@ -186,7 +223,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Evict backends when memory usage exceeds threshold</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.memory_reclaimer_enabled"
<input type="checkbox" x-model="settings.memory_reclaimer_enabled"
:disabled="!settings.watchdog_enabled"
class="sr-only peer" :class="!settings.watchdog_enabled ? 'opacity-50' : ''">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
@@ -198,12 +235,12 @@
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Memory Threshold (%)</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">When memory usage exceeds this, backends will be evicted (50-100%)</p>
<div class="flex items-center gap-3">
<input type="range" x-model="settings.memory_reclaimer_threshold_percent"
<input type="range" x-model="settings.memory_reclaimer_threshold_percent"
min="50" max="100" step="1"
:disabled="!settings.memory_reclaimer_enabled || !settings.watchdog_enabled"
class="flex-1 h-2 bg-[var(--color-bg-primary)] rounded-lg appearance-none cursor-pointer"
:class="(!settings.memory_reclaimer_enabled || !settings.watchdog_enabled) ? 'opacity-50' : ''">
<span class="text-sm font-mono text-[var(--color-text-primary)] w-12 text-right"
<span class="text-sm font-mono text-[var(--color-text-primary)] w-12 text-right"
x-text="`${settings.memory_reclaimer_threshold_percent}%`"></span>
</div>
</div>
@@ -226,7 +263,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Max Active Backends</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of models to keep loaded at once (0 = unlimited, 1 = single backend mode). Least recently used models are evicted when limit is reached.</p>
<input type="number" x-model="settings.max_active_backends"
<input type="number" x-model="settings.max_active_backends"
min="0"
placeholder="0"
@change="updateMaxActiveBackends()"
@@ -240,7 +277,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable backends to handle multiple requests in parallel (if supported)</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.parallel_backend_requests"
<input type="checkbox" x-model="settings.parallel_backend_requests"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-accent-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-accent)]"></div>
</label>
@@ -263,7 +300,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Default Threads</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Number of threads to use for model inference (0 = auto)</p>
<input type="number" x-model="settings.threads"
<input type="number" x-model="settings.threads"
min="0"
placeholder="0"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-success-light)] rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-success-light)]">
@@ -273,7 +310,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Default Context Size</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Default context window size for models</p>
<input type="number" x-model="settings.context_size"
<input type="number" x-model="settings.context_size"
min="0"
placeholder="512"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-success-light)] rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-success-light)]">
@@ -286,7 +323,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Use 16-bit floating point precision</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.f16"
<input type="checkbox" x-model="settings.f16"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-success-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-success)]"></div>
</label>
@@ -299,11 +336,37 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable debug logging</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.debug"
<input type="checkbox" x-model="settings.debug"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-success-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-success)]"></div>
</label>
</div>
<!-- Enable Tracing -->
<div class="flex items-center justify-between">
<div>
<label class="text-sm font-medium text-[var(--color-text-primary)]">Enable Tracing</label>
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable tracing of requests and responses</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.enable_tracing"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-success-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-success)]"></div>
</label>
</div>
<!-- Tracing Max Items -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Tracing Max Items</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of tracing items to keep</p>
<input type="number" x-model="settings.tracing_max_items"
min="0"
placeholder="0"
:disabled="!settings.enable_tracing"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-success-light)] rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-success-light)]"
:class="!settings.enable_tracing ? 'opacity-50 cursor-not-allowed' : ''">
</div>
</div>
</div>
@@ -325,7 +388,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable Cross-Origin Resource Sharing</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.cors"
<input type="checkbox" x-model="settings.cors"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-warning-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-warning)]"></div>
</label>
@@ -335,7 +398,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">CORS Allow Origins</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Comma-separated list of allowed origins</p>
<input type="text" x-model="settings.cors_allow_origins"
<input type="text" x-model="settings.cors_allow_origins"
placeholder="*"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-warning-light)] rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-warning-light)]">
</div>
@@ -347,7 +410,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable Cross-Site Request Forgery protection</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.csrf"
<input type="checkbox" x-model="settings.csrf"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-warning-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-warning)]"></div>
</label>
@@ -370,7 +433,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">P2P Token</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Authentication token for P2P network (set to 0 to generate a new token)</p>
<input type="text" x-model="settings.p2p_token"
<input type="text" x-model="settings.p2p_token"
placeholder=""
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-accent)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-accent)]/50">
</div>
@@ -379,7 +442,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">P2P Network ID</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Network identifier for P2P connections</p>
<input type="text" x-model="settings.p2p_network_id"
<input type="text" x-model="settings.p2p_network_id"
placeholder=""
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-accent)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-accent)]/50">
</div>
@@ -391,7 +454,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable federated instance mode</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.federated"
<input type="checkbox" x-model="settings.federated"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-accent)]/20 rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-accent)]"></div>
</label>
@@ -414,7 +477,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Job Retention Days</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Number of days to keep job history (default: 30)</p>
<input type="number" x-model="settings.agent_job_retention_days"
<input type="number" x-model="settings.agent_job_retention_days"
min="0"
placeholder="30"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary)]/50">
@@ -437,7 +500,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">API Keys</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">List of API keys (one per line or comma-separated)</p>
<textarea x-model="settings.api_keys_text"
<textarea x-model="settings.api_keys_text"
rows="4"
placeholder="sk-1234567890abcdef&#10;sk-0987654321fedcba"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-error-light)] rounded text-sm text-[var(--color-text-primary)] font-mono focus:outline-none focus:ring-2 focus:ring-[var(--color-error-light)]"></textarea>
@@ -464,7 +527,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Automatically load model galleries on startup</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.autoload_galleries"
<input type="checkbox" x-model="settings.autoload_galleries"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-accent)]/20 rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-accent)]"></div>
</label>
@@ -477,7 +540,7 @@
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Automatically load backend galleries on startup</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.autoload_backend_galleries"
<input type="checkbox" x-model="settings.autoload_backend_galleries"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-accent)]/20 rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-accent)]"></div>
</label>
@@ -487,7 +550,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Model Galleries (JSON)</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Array of gallery objects with 'url' and 'name' fields</p>
<textarea x-model="settings.galleries_json"
<textarea x-model="settings.galleries_json"
rows="4"
placeholder='[{"url": "https://example.com", "name": "Example Gallery"}]'
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-accent)]/20 rounded text-sm text-[var(--color-text-primary)] font-mono focus:outline-none focus:ring-2 focus:ring-[var(--color-accent)]/50"></textarea>
@@ -497,7 +560,7 @@
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Backend Galleries (JSON)</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Array of backend gallery objects with 'url' and 'name' fields</p>
<textarea x-model="settings.backend_galleries_json"
<textarea x-model="settings.backend_galleries_json"
rows="4"
placeholder='[{"url": "https://example.com", "name": "Example Backend Gallery"}]'
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-accent)]/20 rounded text-sm text-[var(--color-text-primary)] font-mono focus:outline-none focus:ring-2 focus:ring-[var(--color-accent)]/50"></textarea>
@@ -521,7 +584,7 @@
<!-- Save Button -->
<div class="flex justify-end">
<button type="submit"
<button type="submit"
:disabled="saving"
class="btn-primary">
<i class="fas fa-save mr-2" :class="saving ? 'fa-spin fa-spinner' : ''"></i>
@@ -545,6 +608,9 @@ function settingsDashboard() {
watchdog_idle_timeout: '15m',
watchdog_busy_timeout: '5m',
watchdog_interval: '2s',
force_eviction_when_busy: false,
lru_eviction_max_retries: 30,
lru_eviction_retry_interval: '1s',
max_active_backends: 0,
parallel_backend_requests: false,
memory_reclaimer_enabled: false,
@@ -554,6 +620,8 @@ function settingsDashboard() {
context_size: 0,
f16: false,
debug: false,
enable_tracing: false,
tracing_max_items: 0,
cors: false,
csrf: false,
cors_allow_origins: '',
@@ -569,16 +637,16 @@ function settingsDashboard() {
},
sourceInfo: '',
saving: false,
init() {
this.loadSettings();
},
async loadSettings() {
try {
const response = await fetch('/api/settings');
const data = await response.json();
if (response.ok) {
this.settings = {
watchdog_enabled: data.watchdog_enabled,
@@ -587,6 +655,9 @@ function settingsDashboard() {
watchdog_idle_timeout: data.watchdog_idle_timeout || '15m',
watchdog_busy_timeout: data.watchdog_busy_timeout || '5m',
watchdog_interval: data.watchdog_interval || '2s',
force_eviction_when_busy: data.force_eviction_when_busy || false,
lru_eviction_max_retries: data.lru_eviction_max_retries || 30,
lru_eviction_retry_interval: data.lru_eviction_retry_interval || '1s',
max_active_backends: data.max_active_backends || 0,
parallel_backend_requests: data.parallel_backend_requests,
memory_reclaimer_enabled: data.memory_reclaimer_enabled || false,
@@ -596,6 +667,8 @@ function settingsDashboard() {
context_size: data.context_size || 0,
f16: data.f16 || false,
debug: data.debug || false,
enable_tracing: data.enable_tracing || false,
tracing_max_items: data.tracing_max_items || 0,
cors: data.cors || false,
csrf: data.csrf || false,
cors_allow_origins: data.cors_allow_origins || '',
@@ -618,7 +691,7 @@ function settingsDashboard() {
this.addNotification('Failed to load settings: ' + error.message, 'error');
}
},
updateWatchdogEnabled() {
if (!this.settings.watchdog_enabled) {
this.settings.watchdog_idle_enabled = false;
@@ -626,21 +699,27 @@ function settingsDashboard() {
this.settings.memory_reclaimer_enabled = false;
}
},
updateMaxActiveBackends() {
// Ensure max_active_backends is a non-negative integer
const value = parseInt(this.settings.max_active_backends) || 0;
this.settings.max_active_backends = Math.max(0, value);
},
updateTracingEnabled() {
if (!this.settings.enable_tracing) {
this.settings.tracing_max_items = 0;
}
},
async saveSettings() {
if (this.saving) return;
this.saving = true;
try {
const payload = {};
// Only include values that are set
if (this.settings.watchdog_enabled !== undefined) {
payload.watchdog_enabled = this.settings.watchdog_enabled;
@@ -660,6 +739,15 @@ function settingsDashboard() {
if (this.settings.watchdog_interval) {
payload.watchdog_interval = this.settings.watchdog_interval;
}
if (this.settings.force_eviction_when_busy !== undefined) {
payload.force_eviction_when_busy = this.settings.force_eviction_when_busy;
}
if (this.settings.lru_eviction_max_retries !== undefined) {
payload.lru_eviction_max_retries = parseInt(this.settings.lru_eviction_max_retries) || 30;
}
if (this.settings.lru_eviction_retry_interval) {
payload.lru_eviction_retry_interval = this.settings.lru_eviction_retry_interval;
}
if (this.settings.max_active_backends !== undefined) {
payload.max_active_backends = parseInt(this.settings.max_active_backends) || 0;
}
@@ -685,6 +773,12 @@ function settingsDashboard() {
if (this.settings.debug !== undefined) {
payload.debug = this.settings.debug;
}
if (this.settings.enable_tracing !== undefined) {
payload.enable_tracing = this.settings.enable_tracing;
}
if (this.settings.tracing_max_items !== undefined) {
payload.tracing_max_items = parseInt(this.settings.tracing_max_items) || 0;
}
if (this.settings.cors !== undefined) {
payload.cors = this.settings.cors;
}
@@ -744,7 +838,7 @@ function settingsDashboard() {
if (this.settings.agent_job_retention_days !== undefined) {
payload.agent_job_retention_days = parseInt(this.settings.agent_job_retention_days) || 30;
}
const response = await fetch('/api/settings', {
method: 'POST',
headers: {
@@ -752,9 +846,9 @@ function settingsDashboard() {
},
body: JSON.stringify(payload)
});
const data = await response.json();
if (response.ok && data.success) {
this.addNotification('Settings saved successfully!', 'success');
// Reload settings to get updated source info
@@ -769,13 +863,13 @@ function settingsDashboard() {
this.saving = false;
}
},
addNotification(message, type = 'success') {
const id = Date.now();
this.notifications.push({ id, message, type });
setTimeout(() => this.dismissNotification(id), 5000);
},
dismissNotification(id) {
this.notifications = this.notifications.filter(n => n.id !== id);
}
@@ -786,7 +880,7 @@ function settingsDashboard() {
function resourceStatus() {
return {
resourceData: null,
async fetchResource() {
try {
const response = await fetch('/api/resources');

View File

@@ -3,126 +3,329 @@
{{template "views/partials/head" .}}
<script defer src="static/image.js"></script>
<body class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">
<div class="flex flex-col min-h-screen">
<body class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)] flex flex-col h-screen">
<div class="flex flex-col flex-1 overflow-hidden">
{{template "views/partials/navbar" .}}
<div class="container mx-auto px-4 py-8 flex-grow" x-data="{ component: 'menu' }">
<!-- Hero Section -->
<div class="hero-section">
<div class="hero-content">
<h1 class="hero-title">
Image Generation {{ if .Model }} with {{.Model}} {{ end }}
</h1>
<p class="hero-subtitle">Create stunning images from text descriptions</p>
</div>
</div>
<!-- Model Selection - Positioned between hero and generation form -->
<div class="card p-5 mb-6">
<div class="flex items-center">
<div class="text-lg font-medium text-[var(--color-primary)] mr-4">
<i class="fas fa-palette mr-2"></i>Select Model:
</div>
<div class="flex-grow">
<select x-data="{ link : '' }" x-model="link" x-init="$watch('link', value => window.location = link)"
id="model-select"
class="input w-full max-w-md p-2.5 pr-10"
>
<option value="" disabled class="text-[var(--color-text-secondary)]">Select a model</option>
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ $cfg := . }}
{{ range .KnownUsecaseStrings }}
{{ if eq . "FLAG_IMAGE" }}
<option value="text2image/{{$cfg.Name}}" {{ if eq $cfg.Name $model }} selected {{end}} class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">{{$cfg.Name}}</option>
<div class="flex flex-1 overflow-hidden">
<!-- Two Column Layout: Settings on Left, Preview on Right -->
<div class="flex flex-col lg:flex-row flex-1 gap-4 p-4 overflow-hidden">
<!-- Left Column: Generation Settings -->
<div class="flex-shrink-0 lg:w-1/4 flex flex-col min-h-0">
<div class="card p-3 space-y-3 overflow-y-auto flex-1">
<!-- Model Selection - Compact -->
<div class="space-y-1.5">
<div class="flex items-center justify-between gap-2">
<label class="text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide flex-shrink-0">Model</label>
</div>
<select x-data="{ link : '' }" x-model="link" x-init="$watch('link', value => window.location = link)"
id="model-select"
class="input w-full p-1.5 text-xs"
>
<option value="" disabled class="text-[var(--color-text-secondary)]">Select a model</option>
{{ $model:=.Model}}
{{ range .ModelsConfig }}
{{ $cfg := . }}
{{ range .KnownUsecaseStrings }}
{{ if eq . "FLAG_IMAGE" }}
<option value="text2image/{{$cfg.Name}}" {{ if eq $cfg.Name $model }} selected {{end}} class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">{{$cfg.Name}}</option>
{{ end }}
{{ end }}
{{ end }}
{{ end }}
{{ end }}
{{ range .ModelsWithoutConfig }}
<option value="text2image/{{.}}" {{ if eq . $model }} selected {{ end }} class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">{{.}}</option>
{{end}}
</select>
</div>
<div class="relative">
<input id="image-model" type="hidden" value="{{.Model}}">
<form id="genimage" action="text2image/{{.Model}}" method="get">
<!-- Basic Settings -->
<div class="space-y-2">
<!-- Prompt -->
<div class="space-y-1">
<label for="input" class="block text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide">
<i class="fas fa-magic mr-1.5 text-[var(--color-primary)]"></i>Prompt
</label>
<textarea
id="input"
name="input"
placeholder="Describe the image you want to generate..."
autocomplete="off"
rows="3"
class="input w-full p-1.5 text-xs resize-y"
required
></textarea>
</div>
<!-- Negative Prompt -->
<div class="space-y-1">
<label for="negative-prompt" class="block text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide">
<i class="fas fa-ban mr-1.5 text-[var(--color-primary)]"></i>Negative Prompt
</label>
<textarea
id="negative-prompt"
name="negative-prompt"
placeholder="Things to avoid in the image..."
rows="2"
class="input w-full p-1.5 text-xs resize-y"
></textarea>
</div>
<!-- Size Selection with Presets -->
<div class="space-y-1">
<label for="image-size" class="block text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide">
<i class="fas fa-expand-arrows-alt mr-1.5 text-[var(--color-primary)]"></i>Image Size
</label>
<div class="flex flex-wrap gap-1.5 mb-1.5">
<button type="button" class="size-preset px-2 py-0.5 text-[10px] rounded border border-[var(--color-border)] hover:bg-[var(--color-bg-secondary)]" data-size="256x256">256×256</button>
<button type="button" class="size-preset px-2 py-0.5 text-[10px] rounded border border-[var(--color-border)] hover:bg-[var(--color-bg-secondary)]" data-size="512x512">512×512</button>
<button type="button" class="size-preset px-2 py-0.5 text-[10px] rounded border border-[var(--color-border)] hover:bg-[var(--color-bg-secondary)]" data-size="768x768">768×768</button>
<button type="button" class="size-preset px-2 py-0.5 text-[10px] rounded border border-[var(--color-border)] hover:bg-[var(--color-bg-secondary)]" data-size="1024x1024">1024×1024</button>
</div>
<input
type="text"
id="image-size"
value="512x512"
placeholder="e.g., 256x256, 512x512, 1024x1024"
class="input p-1.5 text-xs w-full"
/>
</div>
<!-- Number of Images -->
<div class="space-y-1">
<label for="image-count" class="block text-xs font-medium text-[var(--color-text-secondary)] uppercase tracking-wide">
<i class="fas fa-images mr-1.5 text-[var(--color-primary)]"></i>Number of Images
</label>
<input
type="number"
id="image-count"
name="n"
min="1"
max="4"
value="1"
class="input p-1.5 text-xs w-full"
/>
</div>
</div>
<!-- Advanced Settings (Collapsible) -->
<div class="space-y-2">
<button type="button" id="advanced-toggle" class="w-full flex items-center justify-between px-2 py-1.5 text-xs rounded text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] transition-colors">
<span><i class="fa-solid fa-sliders mr-1.5 text-[var(--color-primary)]"></i> Advanced Settings</span>
<i class="fas fa-chevron-down text-[10px]" id="advanced-chevron"></i>
</button>
<div id="advanced-settings" class="hidden p-2 bg-[var(--color-bg-secondary)] border border-[var(--color-primary-border)]/20 rounded pl-4 border-l-2 border-[var(--color-bg-secondary)] space-y-2">
<!-- Steps -->
<div class="space-y-1">
<label for="image-steps" class="block text-xs text-[var(--color-text-secondary)]">
<i class="fas fa-step-forward mr-1.5 text-[var(--color-primary)]"></i>Steps
</label>
<input
type="number"
id="image-steps"
name="step"
min="1"
max="100"
placeholder="Leave empty for default"
class="input p-1.5 text-xs w-full"
/>
</div>
<!-- Seed -->
<div class="space-y-1">
<label for="image-seed" class="block text-xs text-[var(--color-text-secondary)]">
<i class="fas fa-seedling mr-1.5 text-[var(--color-primary)]"></i>Seed
</label>
<input
type="number"
id="image-seed"
name="seed"
min="0"
placeholder="Leave empty for random"
class="input p-1.5 text-xs w-full"
/>
</div>
</div>
</div>
<!-- Image Inputs (Collapsible) -->
<div class="space-y-2">
<button type="button" id="image-inputs-toggle" class="w-full flex items-center justify-between px-2 py-1.5 text-xs rounded text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)] hover:bg-[var(--color-bg-secondary)] transition-colors">
<span><i class="fa-solid fa-image mr-1.5 text-[var(--color-primary)]"></i> Image Inputs</span>
<i class="fas fa-chevron-down text-[10px]" id="image-inputs-chevron"></i>
</button>
<div id="image-inputs-settings" class="hidden p-2 bg-[var(--color-bg-secondary)] border border-[var(--color-primary-border)]/20 rounded pl-4 border-l-2 border-[var(--color-bg-secondary)] space-y-2">
<!-- Source Image (img2img) -->
<div class="space-y-1">
<label for="source-image" class="block text-xs text-[var(--color-text-secondary)]">
<i class="fas fa-file-image mr-1.5 text-[var(--color-primary)]"></i>Source Image (img2img)
</label>
<input
type="file"
id="source-image"
name="file"
accept="image/*"
class="input p-1.5 text-xs w-full"
/>
</div>
<!-- Reference Images (Dynamic) -->
<div class="space-y-1">
<div class="flex items-center justify-between mb-1">
<label class="block text-xs text-[var(--color-text-secondary)]">
<i class="fas fa-images mr-1.5 text-[var(--color-primary)]"></i>Multiple Input Images
</label>
<button type="button" id="add-reference-image" class="px-2 py-0.5 text-[10px] bg-[var(--color-primary)] text-white rounded hover:opacity-80">
<i class="fas fa-plus mr-1"></i>Add
</button>
</div>
<div id="reference-images-container" class="space-y-1.5">
<div class="reference-image-item flex items-center gap-1.5">
<input
type="file"
class="reference-image-file input p-1.5 text-xs flex-1"
accept="image/*"
data-type="ref_images"
/>
<button type="button" class="remove-reference-image px-1.5 py-1.5 text-[10px] bg-red-500 text-white rounded hover:opacity-80 hidden">
<i class="fas fa-times"></i>
</button>
</div>
</div>
</div>
</div>
</div>
<!-- Submit Button -->
<div>
<button
type="submit"
id="generate-btn"
class="w-full px-2 py-1.5 text-xs rounded text-[var(--color-bg-primary)] bg-[var(--color-primary)] hover:bg-[var(--color-primary)]/90 transition-colors font-medium"
>
<i class="fas fa-magic mr-1.5"></i>Generate Image
</button>
</div>
</form>
</div>
</div>
</div>
</div>
<!-- Image Generation Form -->
<div class="card p-6">
<h2 class="h3 mb-6">Generate an Image</h2>
<div class="relative">
<input id="image-model" type="hidden" value="{{.Model}}">
<form id="genimage" action="text2image/{{.Model}}" method="get" class="mb-8">
<div class="relative">
<div class="absolute inset-y-0 left-0 flex items-center pl-4 pointer-events-none z-10">
<i class="fas fa-magic text-[var(--color-primary)]"></i>
<!-- Right Column: Image Preview -->
<div class="flex-grow lg:w-3/4 flex flex-col min-h-0">
<div class="card p-3 flex flex-col flex-1 min-h-0">
<h3 class="text-sm font-semibold text-[var(--color-text-primary)] mb-3 flex-shrink-0">Generated Images</h3>
<div class="relative flex-1 min-h-0 overflow-y-auto">
<!-- Loading Animation -->
<div id="loader" class="hidden absolute inset-0 flex items-center justify-center bg-[var(--color-bg-primary)]/80 rounded-xl z-10">
<div class="text-center">
<svg class="animate-spin h-10 w-10 text-[var(--color-primary)] mx-auto mb-3" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg>
<p class="text-xs text-[var(--color-text-secondary)]">Generating image...</p>
</div>
</div>
<input
type="text"
id="input"
name="input"
placeholder="Describe the image you want to generate..."
autocomplete="off"
class="input w-full pr-12 py-4 text-lg"
style="padding-left: 3.5rem !important;"
required
/>
<span id="loader" class="my-2 loader absolute right-4 top-4 hidden">
<svg class="animate-spin h-6 w-6 text-[var(--color-primary)]" xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24">
<circle class="opacity-25" cx="12" cy="12" r="10" stroke="currentColor" stroke-width="4"></circle>
<path class="opacity-75" fill="currentColor" d="M4 12a8 8 0 018-8V0C5.373 0 0 5.373 0 12h4zm2 5.291A7.962 7.962 0 014 12H0c0 3.042 1.135 5.824 3 7.938l3-2.647z"></path>
</svg>
</span>
</div>
<!-- Size Selection -->
<div class="mt-4">
<label for="image-size" class="block text-sm font-medium text-[var(--color-text-secondary)] mb-2">
<i class="fas fa-expand-arrows-alt mr-2 text-[var(--color-primary)]"></i>Image Size:
</label>
<input
type="text"
id="image-size"
value="256x256"
placeholder="e.g., 256x256, 512x512, 1024x1024"
class="input p-2.5 w-full max-w-xs"
/>
</div>
<!-- Submit Button -->
<div class="mt-6">
<button
type="submit"
class="btn-primary w-full"
>
<i class="fas fa-magic mr-2"></i>Generate Image
</button>
</div>
</form>
<!-- Image Results Container -->
<div class="mt-6 border-t border-[#1E293B] pt-6">
<h3 class="text-xl font-semibold text-[var(--color-text-primary)] mb-4">Generated Image</h3>
<div class="container mx-auto flex justify-center">
<div id="result" class="mx-auto bg-[var(--color-bg-primary)]/50 border border-[#1E293B] rounded-xl p-4 min-h-[300px] w-full flex items-center justify-center">
<p class="text-[var(--color-text-secondary)] italic">Your generated image will appear here</p>
<!-- Placeholder when no images -->
<div id="result-placeholder" class="bg-[var(--color-bg-primary)]/50 border border-[#1E293B] rounded-xl p-6 min-h-[400px] flex items-center justify-center flex-shrink-0">
<p class="text-xs text-[var(--color-text-secondary)] italic text-center">Your generated images will appear here</p>
</div>
<!-- Results container -->
<div id="result" class="space-y-4 pb-4"></div>
</div>
</div>
</div>
</div>
</div>
{{template "views/partials/footer" .}}
</div>
<script>
// Show loader when form is submitted
document.getElementById('genimage').addEventListener('submit', function() {
document.getElementById('loader').classList.remove('hidden');
// Collapsible sections
document.getElementById('advanced-toggle').addEventListener('click', function() {
const settings = document.getElementById('advanced-settings');
const chevron = document.getElementById('advanced-chevron');
settings.classList.toggle('hidden');
chevron.classList.toggle('fa-chevron-down');
chevron.classList.toggle('fa-chevron-up');
});
document.getElementById('image-inputs-toggle').addEventListener('click', function() {
const settings = document.getElementById('image-inputs-settings');
const chevron = document.getElementById('image-inputs-chevron');
settings.classList.toggle('hidden');
chevron.classList.toggle('fa-chevron-down');
chevron.classList.toggle('fa-chevron-up');
});
// Size preset buttons
document.querySelectorAll('.size-preset').forEach(button => {
button.addEventListener('click', function() {
const size = this.getAttribute('data-size');
document.getElementById('image-size').value = size;
// Update active state
document.querySelectorAll('.size-preset').forEach(btn => {
btn.classList.remove('bg-[var(--color-primary)]', 'text-white');
});
this.classList.add('bg-[var(--color-primary)]', 'text-white');
});
});
// Set initial active size preset
document.querySelector('.size-preset[data-size="512x512"]').classList.add('bg-[var(--color-primary)]', 'text-white');
// Dynamic image inputs for Reference Images
function addReferenceImage() {
const container = document.getElementById('reference-images-container');
const newItem = document.createElement('div');
newItem.className = 'reference-image-item flex items-center gap-2';
newItem.innerHTML = `
<input
type="file"
class="reference-image-file input p-1.5 text-xs flex-1"
accept="image/*"
data-type="ref_images"
/>
<button type="button" class="remove-reference-image px-1.5 py-1.5 text-[10px] bg-red-500 text-white rounded hover:opacity-80">
<i class="fas fa-times"></i>
</button>
`;
container.appendChild(newItem);
updateRemoveButtons('reference-images-container', 'remove-reference-image');
}
function removeReferenceImage(button) {
const container = document.getElementById('reference-images-container');
if (container.children.length > 1) {
button.closest('.reference-image-item').remove();
updateRemoveButtons('reference-images-container', 'remove-reference-image');
}
}
// Update remove button visibility (hide if only one item, show if multiple)
function updateRemoveButtons(containerId, buttonClass) {
const container = document.getElementById(containerId);
const buttons = container.querySelectorAll('.' + buttonClass);
if (container.children.length > 1) {
buttons.forEach(btn => btn.classList.remove('hidden'));
} else {
buttons.forEach(btn => btn.classList.add('hidden'));
}
}
// Event listeners for dynamic inputs
document.getElementById('add-reference-image').addEventListener('click', addReferenceImage);
document.getElementById('reference-images-container').addEventListener('click', function(e) {
if (e.target.closest('.remove-reference-image')) {
removeReferenceImage(e.target.closest('.remove-reference-image'));
}
});
// Initialize remove button visibility
updateRemoveButtons('reference-images-container', 'remove-reference-image');
</script>
</body>

core/http/views/traces.html Normal file
View File

@@ -0,0 +1,334 @@
<!DOCTYPE html>
<html lang="en">
{{template "views/partials/head" .}}
<body class="bg-[var(--color-bg-primary)] text-[var(--color-text-primary)]">
<div class="flex flex-col min-h-screen" x-data="tracesApp()" x-init="init()">
{{template "views/partials/navbar" .}}
<!-- Notifications -->
<div class="fixed top-20 right-4 z-50 space-y-2" style="max-width: 400px;">
<template x-for="notification in notifications" :key="notification.id">
<div x-show="true"
x-transition:enter="transition ease-out duration-200"
x-transition:enter-start="opacity-0"
x-transition:enter-end="opacity-100"
x-transition:leave="transition ease-in duration-150"
x-transition:leave-start="opacity-100"
x-transition:leave-end="opacity-0"
:class="notification.type === 'error' ? 'bg-red-500' : 'bg-green-500'"
class="rounded-lg p-4 text-white flex items-start space-x-3">
<div class="flex-shrink-0">
<i :class="notification.type === 'error' ? 'fas fa-exclamation-circle' : 'fas fa-check-circle'" class="text-xl"></i>
</div>
<div class="flex-1 min-w-0">
<p class="text-sm font-medium break-words" x-text="notification.message"></p>
</div>
<button @click="dismissNotification(notification.id)" class="flex-shrink-0 text-white hover:opacity-80 transition-opacity">
<i class="fas fa-times"></i>
</button>
</div>
</template>
</div>
<div class="container mx-auto px-4 py-8 flex-grow">
<!-- Hero Header -->
<div class="hero-section">
<div class="hero-content">
<h1 class="hero-title">
API Traces
</h1>
<p class="hero-subtitle">View logged API requests and responses</p>
<div class="flex flex-wrap justify-center gap-3">
<button @click="clearTraces()" class="btn-secondary text-sm py-1.5 px-3">
<i class="fas fa-trash mr-1.5 text-[10px]"></i>
<span>Clear Traces</span>
</button>
<a href="/api/traces" download="traces.json" class="btn-secondary text-sm py-1.5 px-3">
<i class="fas fa-download mr-1.5 text-[10px]"></i>
<span>Export Traces</span>
</a>
</div>
</div>
</div>
<!-- Tracing Settings -->
<div class="bg-[var(--color-bg-secondary)] border border-[var(--color-primary)]/20 rounded-lg p-6 mb-8">
<h2 class="text-xl font-semibold text-[var(--color-text-primary)] mb-4 flex items-center">
<i class="fas fa-bug mr-2 text-[var(--color-primary)] text-sm"></i>
Tracing Settings
</h2>
<p class="text-xs text-[var(--color-text-secondary)] mb-4">Configure API tracing</p>
<div class="space-y-4">
<!-- Enable Tracing -->
<div class="flex items-center justify-between">
<div>
<label class="text-sm font-medium text-[var(--color-text-primary)]">Enable Tracing</label>
<p class="text-xs text-[var(--color-text-secondary)] mt-1">Enable tracing of requests and responses</p>
</div>
<label class="relative inline-flex items-center cursor-pointer">
<input type="checkbox" x-model="settings.enable_tracing"
@change="updateTracingEnabled()"
class="sr-only peer">
<div class="w-11 h-6 bg-[var(--color-bg-primary)] peer-focus:outline-none peer-focus:ring-4 peer-focus:ring-[var(--color-primary-light)] rounded-full peer peer-checked:after:translate-x-full peer-checked:after:border-white after:content-[''] after:absolute after:top-[2px] after:left-[2px] after:bg-white after:border-gray-300 after:border after:rounded-full after:h-5 after:w-5 after:transition-all peer-checked:bg-[var(--color-primary)]"></div>
</label>
</div>
<!-- Tracing Max Items -->
<div>
<label class="block text-sm font-medium text-[var(--color-text-primary)] mb-2">Tracing Max Items</label>
<p class="text-xs text-[var(--color-text-secondary)] mb-2">Maximum number of tracing items to keep (0 = unlimited)</p>
<input type="number" x-model="settings.tracing_max_items"
min="0"
placeholder="1000"
:disabled="!settings.enable_tracing"
class="w-full px-3 py-2 bg-[var(--color-bg-primary)] border border-[var(--color-primary)]/20 rounded text-sm text-[var(--color-text-primary)] focus:outline-none focus:ring-2 focus:ring-[var(--color-primary)]/50"
:class="!settings.enable_tracing ? 'opacity-50 cursor-not-allowed' : ''">
</div>
<!-- Save Button -->
<div class="flex justify-end pt-2">
<button @click="saveTracingSettings()"
:disabled="saving"
class="btn-primary px-4 py-2 text-sm">
<i class="fas fa-save mr-2" :class="saving ? 'fa-spin fa-spinner' : ''"></i>
<span x-text="saving ? 'Saving...' : 'Save Settings'"></span>
</button>
</div>
</div>
</div>
<!-- Traces Table -->
<div class="mt-8">
<div class="overflow-x-auto">
<table class="w-full border-collapse">
<thead>
<tr class="border-b border-[var(--color-bg-secondary)]">
<th class="text-left p-2 text-xs font-semibold text-[var(--color-text-secondary)]">Method</th>
<th class="text-left p-2 text-xs font-semibold text-[var(--color-text-secondary)]">Path</th>
<th class="text-left p-2 text-xs font-semibold text-[var(--color-text-secondary)]">Status</th>
<th class="text-right p-2 text-xs font-semibold text-[var(--color-text-secondary)]">Actions</th>
</tr>
</thead>
<tbody>
<template x-for="(trace, index) in traces" :key="index">
<tr class="hover:bg-[var(--color-bg-secondary)]/50 border-b border-[var(--color-bg-secondary)] transition-colors">
<td class="p-2" x-text="trace.request.method"></td>
<td class="p-2" x-text="trace.request.path"></td>
<td class="p-2" x-text="trace.response.status"></td>
<td class="p-2 text-right">
<button @click="showDetails(index)" class="text-[var(--color-primary)]/60 hover:text-[var(--color-primary)] hover:bg-[var(--color-primary)]/10 rounded p-1 transition-colors">
<i class="fas fa-eye text-xs"></i>
</button>
</td>
</tr>
</template>
</tbody>
</table>
</div>
</div>
<!-- Details Modal -->
<div x-show="selectedTrace !== null" class="fixed inset-0 bg-black/50 flex items-center justify-center z-50" @click="selectedTrace = null">
<div class="bg-[var(--color-bg-secondary)] rounded-lg p-6 max-w-4xl w-full max-h-[90vh] overflow-auto" @click.stop>
<div class="flex justify-between mb-4">
<h2 class="h3">Trace Details</h2>
<button @click="selectedTrace = null" class="text-[var(--color-text-secondary)] hover:text-[var(--color-text-primary)]">
<i class="fas fa-times"></i>
</button>
</div>
<div class="grid grid-cols-2 gap-4">
<div>
<h3 class="text-lg font-semibold mb-2">Request Body</h3>
<div id="requestEditor" class="h-96 border border-[var(--color-primary-border)]/20"></div>
</div>
<div>
<h3 class="text-lg font-semibold mb-2">Response Body</h3>
<div id="responseEditor" class="h-96 border border-[var(--color-primary-border)]/20"></div>
</div>
</div>
</div>
</div>
</div>
{{template "views/partials/footer" .}}
</div>
<!-- CodeMirror -->
<link rel="stylesheet" href="static/assets/codemirror.min.css">
<script src="static/assets/codemirror.min.js"></script>
<script src="static/assets/javascript.min.js"></script>
<!-- Styles from model-editor -->
<style>
.CodeMirror {
height: 100% !important;
font-family: monospace;
}
</style>
<script>
function tracesApp() {
return {
traces: [],
selectedTrace: null,
requestEditor: null,
responseEditor: null,
notifications: [],
settings: {
enable_tracing: false,
tracing_max_items: 0
},
saving: false,
init() {
this.loadTracingSettings();
this.fetchTraces();
setInterval(() => this.fetchTraces(), 5000);
},
async loadTracingSettings() {
try {
const response = await fetch('/api/settings');
const data = await response.json();
if (response.ok) {
this.settings.enable_tracing = data.enable_tracing || false;
this.settings.tracing_max_items = data.tracing_max_items || 0;
} else {
this.addNotification('Failed to load tracing settings: ' + (data.error || 'Unknown error'), 'error');
}
} catch (error) {
console.error('Error loading tracing settings:', error);
this.addNotification('Failed to load tracing settings: ' + error.message, 'error');
}
},
updateTracingEnabled() {
if (!this.settings.enable_tracing) {
this.settings.tracing_max_items = 0;
}
},
async saveTracingSettings() {
if (this.saving) return;
this.saving = true;
try {
const payload = {
enable_tracing: this.settings.enable_tracing,
tracing_max_items: parseInt(this.settings.tracing_max_items) || 0
};
const response = await fetch('/api/settings', {
method: 'POST',
headers: {
'Content-Type': 'application/json',
},
body: JSON.stringify(payload)
});
const data = await response.json();
if (response.ok && data.success) {
this.addNotification('Tracing settings saved successfully!', 'success');
} else {
this.addNotification('Failed to save tracing settings: ' + (data.error || 'Unknown error'), 'error');
}
} catch (error) {
console.error('Error saving tracing settings:', error);
this.addNotification('Failed to save tracing settings: ' + error.message, 'error');
} finally {
this.saving = false;
}
},
addNotification(message, type = 'success') {
const id = Date.now();
this.notifications.push({ id, message, type });
setTimeout(() => this.dismissNotification(id), 5000);
},
dismissNotification(id) {
this.notifications = this.notifications.filter(n => n.id !== id);
},
async fetchTraces() {
const response = await fetch('/api/traces');
this.traces = await response.json();
},
async clearTraces() {
if (confirm('Clear all traces?')) {
await fetch('/api/traces/clear', { method: 'POST' });
this.traces = [];
}
},
showDetails(index) {
this.selectedTrace = index;
this.$nextTick(() => {
const trace = this.traces[index];
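// Request/response bodies from /api/traces are base64-encoded; decode them and pretty-print JSON where possible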
const decodeBase64 = (base64) => {
const binaryString = atob(base64);
const bytes = new Uint8Array(binaryString.length);
for (let i = 0; i < binaryString.length; i++) {
bytes[i] = binaryString.charCodeAt(i);
}
return new TextDecoder().decode(bytes);
};
const formatBody = (bodyText) => {
try {
const json = JSON.parse(bodyText);
return JSON.stringify(json, null, 2);
} catch {
return bodyText;
}
};
const reqBody = formatBody(decodeBase64(trace.request.body));
const resBody = formatBody(decodeBase64(trace.response.body));
if (!this.requestEditor) {
this.requestEditor = CodeMirror(document.getElementById('requestEditor'), {
value: reqBody,
mode: 'javascript',
json: true,
theme: 'default',
lineNumbers: true,
readOnly: true,
lineWrapping: true
});
} else {
this.requestEditor.setValue(reqBody);
}
if (!this.responseEditor) {
this.responseEditor = CodeMirror(document.getElementById('responseEditor'), {
value: resBody,
mode: 'javascript',
json: true,
theme: 'default',
lineNumbers: true,
readOnly: true,
lineWrapping: true
});
} else {
this.responseEditor.setValue(resBody);
}
});
}
}
}
</script>
</body>
</html>

View File

@@ -18,10 +18,19 @@ type ErrorResponse struct {
Error *APIError `json:"error,omitempty"`
}
type InputTokensDetails struct {
TextTokens int `json:"text_tokens"`
ImageTokens int `json:"image_tokens"`
}
type OpenAIUsage struct {
PromptTokens int `json:"prompt_tokens"`
CompletionTokens int `json:"completion_tokens"`
TotalTokens int `json:"total_tokens"`
// Fields for image generation API compatibility
InputTokens int `json:"input_tokens,omitempty"`
OutputTokens int `json:"output_tokens,omitempty"`
InputTokensDetails *InputTokensDetails `json:"input_tokens_details,omitempty"`
// Extra timing data, disabled by default as it's not part of the OpenAI specification
TimingPromptProcessing float64 `json:"timing_prompt_processing,omitempty"`
TimingTokenGeneration float64 `json:"timing_token_generation,omitempty"`
@@ -49,11 +58,11 @@ type OpenAIResponse struct {
}
type Choice struct {
Index int `json:"index"`
FinishReason *string `json:"finish_reason"`
Message *Message `json:"message,omitempty"`
Delta *Message `json:"delta,omitempty"`
Text string `json:"text,omitempty"`
Logprobs *Logprobs `json:"logprobs,omitempty"`
}
@@ -151,7 +160,6 @@ type OpenAIRequest struct {
Stream bool `json:"stream"`
// Image (not supported by OpenAI)
Mode int `json:"mode"`
Quality string `json:"quality"`
Step int `json:"step"`

View File

@@ -52,6 +52,49 @@ Setting the limit to `1` is equivalent to single active backend mode (see below)
3. The LRU model(s) are automatically unloaded to make room for the new model
4. Concurrent requests for loading different models are handled safely - the system accounts for models currently being loaded when calculating evictions
### Eviction Behavior with Active Requests
By default, LocalAI will **skip evicting models that have active API calls** to prevent interrupting ongoing requests. This means:
- If all models are busy (have active requests), eviction will be skipped and the system will wait for models to become idle
- The loading request will retry eviction with configurable retry settings
- This ensures data integrity and prevents request failures
You can configure this behavior via WebUI or using the following settings:
#### Force Eviction When Busy
To allow evicting models even when they have active API calls (not recommended for production):
```bash
# Via CLI
./local-ai --force-eviction-when-busy
# Via environment variable
LOCALAI_FORCE_EVICTION_WHEN_BUSY=true ./local-ai
```
> **Warning:** Enabling force eviction can interrupt active requests and cause errors. Only use this if you understand the implications.
#### LRU Eviction Retry Settings
When models are busy and cannot be evicted, LocalAI will retry eviction with configurable settings:
```bash
# Configure maximum retries (default: 30)
./local-ai --lru-eviction-max-retries=50
# Configure retry interval (default: 1s)
./local-ai --lru-eviction-retry-interval=2s
# Using environment variables
LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
./local-ai
```
These settings control how long the system will wait for busy models to become idle before giving up. The retry mechanism allows busy models to complete their requests before being evicted, preventing request failures.
### Example
```bash
@@ -207,6 +250,33 @@ This configuration:
- Automatically unloads any model that hasn't been used for 15 minutes
- Provides both hard limits and time-based cleanup
### Example with Retry Settings
You can also configure retry behavior when models are busy:
```bash
# Allow up to 2 active backends with custom retry settings
LOCALAI_MAX_ACTIVE_BACKENDS=2 \
LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
./local-ai
```
Or using command line flags:
```bash
./local-ai \
--max-active-backends=2 \
--lru-eviction-max-retries=50 \
--lru-eviction-retry-interval=2s
```
This configuration:
- Limits to 2 active backends
- Will retry eviction up to 50 times if models are busy
- Waits 2 seconds between retry attempts
- Ensures busy models have time to complete their requests before eviction
## Limitations and Considerations
### VRAM Usage Estimation

View File

@@ -82,6 +82,7 @@ The image list is on [quay](https://quay.io/repository/go-skynet/local-ai?tab=ta
- CUDA `11` tags: `master-gpu-nvidia-cuda-11`, `v1.40.0-gpu-nvidia-cuda-11`, ...
- CUDA `12` tags: `master-gpu-nvidia-cuda-12`, `v1.40.0-gpu-nvidia-cuda-12`, ...
- CUDA `13` tags: `master-gpu-nvidia-cuda-13`, `v1.40.0-gpu-nvidia-cuda-13`, ...
In addition to the commands to run LocalAI normally, you need to specify `--gpus all` to docker, for example:
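A minimal sketch, assuming the default API port of 8080 and the CUDA 13 image tag listed above:
```bash
docker run --rm -ti --gpus all -p 8080:8080 localai/localai:master-gpu-nvidia-cuda-13
```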

View File

@@ -257,6 +257,7 @@ It might be handy to install packages before starting the container to setup the
services:
local-ai:
image: localai/localai:latest
#image: localai/localai:latest-gpu-nvidia-cuda-13
#image: localai/localai:latest-gpu-nvidia-cuda-12
container_name: local-ai
restart: always

View File

@@ -29,9 +29,23 @@ Changes to watchdog settings are applied immediately by restarting the watchdog
- **Max Active Backends**: Maximum number of active backends (loaded models). When exceeded, the least recently used model is automatically evicted. Set to `0` for unlimited, `1` for single-backend mode
- **Parallel Backend Requests**: Enable backends to handle multiple requests in parallel if supported
- **Force Eviction When Busy**: Allow evicting models even when they have active API calls (default: disabled for safety). **Warning:** Enabling this can interrupt active requests
- **LRU Eviction Max Retries**: Maximum number of retries when waiting for busy models to become idle before eviction (default: 30)
- **LRU Eviction Retry Interval**: Interval between retries when waiting for busy models (default: `1s`)
> **Note:** The "Single Backend" setting is deprecated. Use "Max Active Backends" set to `1` for single-backend behavior.
#### LRU Eviction Behavior
By default, LocalAI will skip evicting models that have active API calls to prevent interrupting ongoing requests. When all models are busy and eviction is needed:
1. The system will wait for models to become idle
2. It will retry eviction up to the configured maximum number of retries
3. The retry interval determines how long to wait between attempts
4. If all retries are exhausted, the system will proceed (which may cause out-of-memory errors if resources are truly exhausted)
You can configure these settings via the web UI or through environment variables. See [VRAM Management]({{%relref "advanced/vram-management" %}}) for more details.
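For example, a minimal sketch using the corresponding environment variables (see the CLI reference), allowing up to 50 retries of 2 seconds each:

```bash
LOCALAI_LRU_EVICTION_MAX_RETRIES=50 \
LOCALAI_LRU_EVICTION_RETRY_INTERVAL=2s \
./local-ai
```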
### Performance Settings
- **Threads**: Number of threads used for parallel computation (recommended: number of physical cores)
@@ -94,6 +108,9 @@ The `runtime_settings.json` file follows this structure:
"watchdog_busy_timeout": "5m",
"max_active_backends": 0,
"parallel_backend_requests": true,
"force_eviction_when_busy": false,
"lru_eviction_max_retries": 30,
"lru_eviction_retry_interval": "1s",
"threads": 8,
"context_size": 2048,
"f16": false,


@@ -122,6 +122,48 @@ curl --request POST \
Future versions of LocalAI will expose additional control over audio generation beyond the text prompt.
### VibeVoice
[VibeVoice-Realtime](https://github.com/microsoft/VibeVoice) is a real-time text-to-speech model that generates natural-sounding speech with voice cloning capabilities.
#### Setup
Install the `vibevoice` model from the Model gallery or run `local-ai models install vibevoice`.
#### Usage
Use the tts endpoint by specifying the vibevoice backend:
```
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model": "vibevoice",
"input":"Hello!"
}' | aplay
```
#### Voice cloning
VibeVoice supports voice cloning through voice preset files. You can configure a model with a specific voice:
```yaml
name: vibevoice
backend: vibevoice
parameters:
model: microsoft/VibeVoice-Realtime-0.5B
tts:
voice: "Frank" # or use audio_path to specify a .pt file path
# Available English voices: Carter, Davis, Emma, Frank, Grace, Mike
```
Then you can use the model:
```
curl http://localhost:8080/tts -H "Content-Type: application/json" -d '{
"model": "vibevoice",
"input":"Hello!"
}' | aplay
```
### Vall-E-X
[VALL-E-X](https://github.com/Plachtaa/VALL-E-X) is an open source implementation of Microsoft's VALL-E X zero-shot TTS model.


@@ -70,6 +70,16 @@ Standard container images do not have pre-installed models. Use these if you wan
{{% /tab %}}
{{% tab title="GPU Images CUDA 13" %}}
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-gpu-nvidia-cuda-13` | `localai/localai:master-gpu-nvidia-cuda-13` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-gpu-nvidia-cuda-13` | `localai/localai:latest-gpu-nvidia-cuda-13` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-gpu-nvidia-cuda-13` | `localai/localai:{{< version >}}-gpu-nvidia-cuda-13` |
{{% /tab %}}
{{% tab title="Intel GPU" %}}
| Description | Quay | Docker Hub |
@@ -98,9 +108,9 @@ Standard container images do not have pre-installed models. Use these if you wan
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-vulkan` | `localai/localai:{{< version >}}-vulkan` |
{{% /tab %}}
{{% tab title="Nvidia Linux for tegra" %}}
{{% tab title="Nvidia Linux for tegra (CUDA 12)" %}}
These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. For more information, see the [Nvidia L4T guide]({{%relref "reference/nvidia-l4t" %}}).
These images are compatible with Nvidia ARM64 devices with CUDA 12, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Orin. For more information, see the [Nvidia L4T guide]({{%relref "reference/nvidia-l4t" %}}).
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
@@ -110,6 +120,18 @@ These images are compatible with Nvidia ARM64 devices, such as the Jetson Nano,
{{% /tab %}}
{{% tab title="Nvidia Linux for tegra (CUDA 13)" %}}
These images are compatible with Nvidia ARM64 devices with CUDA 13, such as the Nvidia DGX Spark. For more information, see the [Nvidia L4T guide]({{%relref "reference/nvidia-l4t" %}}).
| Description | Quay | Docker Hub |
| --- | --- |-------------------------------------------------------------|
| Latest images from the branch (development) | `quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13` | `localai/localai:master-nvidia-l4t-arm64-cuda-13` |
| Latest tag | `quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-cuda-13` | `localai/localai:latest-nvidia-l4t-arm64-cuda-13` |
| Versioned image | `quay.io/go-skynet/local-ai:{{< version >}}-nvidia-l4t-arm64-cuda-13` | `localai/localai:{{< version >}}-nvidia-l4t-arm64-cuda-13` |
{{% /tab %}}
{{< /tabs >}}
## All-in-one images
@@ -147,11 +169,13 @@ services:
image: localai/localai:latest-aio-cpu
# For a specific version:
# image: localai/localai:{{< version >}}-aio-cpu
# For Nvidia GPUs decomment one of the following (cuda11 or cuda12):
# For Nvidia GPUs uncomment one of the following (cuda11, cuda12, or cuda13):
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-11
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-12
# image: localai/localai:{{< version >}}-aio-gpu-nvidia-cuda-13
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
# image: localai/localai:latest-aio-gpu-nvidia-cuda-13
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/readyz"]
interval: 1m
@@ -203,6 +227,7 @@ docker run -p 8080:8080 --name local-ai -ti -v localai-models:/models localai/lo
| Versioned image (e.g. for CPU) | `quay.io/go-skynet/local-ai:{{< version >}}-aio-cpu` | `localai/localai:{{< version >}}-aio-cpu` |
| Latest images for Nvidia GPU (CUDA11) | `quay.io/go-skynet/local-ai:latest-aio-gpu-nvidia-cuda-11` | `localai/localai:latest-aio-gpu-nvidia-cuda-11` |
| Latest images for Nvidia GPU (CUDA12) | `quay.io/go-skynet/local-ai:latest-aio-gpu-nvidia-cuda-12` | `localai/localai:latest-aio-gpu-nvidia-cuda-12` |
| Latest images for Nvidia GPU (CUDA13) | `quay.io/go-skynet/local-ai:latest-aio-gpu-nvidia-cuda-13` | `localai/localai:latest-aio-gpu-nvidia-cuda-13` |
| Latest images for AMD GPU | `quay.io/go-skynet/local-ai:latest-aio-gpu-hipblas` | `localai/localai:latest-aio-gpu-hipblas` |
| Latest images for Intel GPU | `quay.io/go-skynet/local-ai:latest-aio-gpu-intel` | `localai/localai:latest-aio-gpu-intel` |


@@ -58,6 +58,11 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest
#### GPU Images
**NVIDIA CUDA 13:**
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-13
```
**NVIDIA CUDA 12:**
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-gpu-nvidia-cuda-12
@@ -84,10 +89,17 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-gpu-vulkan
```
**NVIDIA Jetson (L4T ARM64):**
CUDA 12 (for Nvidia AGX Orin and similar platforms):
```bash
docker run -ti --name local-ai -p 8080:8080 --runtime nvidia --gpus all localai/localai:latest-nvidia-l4t-arm64
```
CUDA 13 (for Nvidia DGX Spark):
```bash
docker run -ti --name local-ai -p 8080:8080 --runtime nvidia --gpus all localai/localai:latest-nvidia-l4t-arm64-cuda-13
```
### All-in-One (AIO) Images
**Recommended for beginners** - These images come pre-configured with models and backends, ready to use immediately.
@@ -100,6 +112,11 @@ docker run -ti --name local-ai -p 8080:8080 localai/localai:latest-aio-cpu
#### GPU Images
**NVIDIA CUDA 13:**
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-13
```
**NVIDIA CUDA 12:**
```bash
docker run -ti --name local-ai -p 8080:8080 --gpus all localai/localai:latest-aio-gpu-nvidia-cuda-12
@@ -130,6 +147,7 @@ services:
api:
image: localai/localai:latest-aio-cpu
# For GPU support, use one of:
# image: localai/localai:latest-aio-gpu-nvidia-cuda-13
# image: localai/localai:latest-aio-gpu-nvidia-cuda-12
# image: localai/localai:latest-aio-gpu-nvidia-cuda-11
# image: localai/localai:latest-aio-gpu-hipblas


@@ -46,6 +46,9 @@ Complete reference for all LocalAI command-line interface (CLI) parameters and e
| `--watchdog-idle-timeout` | `15m` | Threshold beyond which an idle backend should be stopped | `$LOCALAI_WATCHDOG_IDLE_TIMEOUT`, `$WATCHDOG_IDLE_TIMEOUT` |
| `--enable-watchdog-busy` | `false` | Enable watchdog for stopping backends that are busy longer than the watchdog-busy-timeout | `$LOCALAI_WATCHDOG_BUSY`, `$WATCHDOG_BUSY` |
| `--watchdog-busy-timeout` | `5m` | Threshold beyond which a busy backend should be stopped | `$LOCALAI_WATCHDOG_BUSY_TIMEOUT`, `$WATCHDOG_BUSY_TIMEOUT` |
| `--force-eviction-when-busy` | `false` | Force eviction even when models have active API calls (default: false for safety). **Warning:** Enabling this can interrupt active requests | `$LOCALAI_FORCE_EVICTION_WHEN_BUSY`, `$FORCE_EVICTION_WHEN_BUSY` |
| `--lru-eviction-max-retries` | `30` | Maximum number of retries when waiting for busy models to become idle before eviction | `$LOCALAI_LRU_EVICTION_MAX_RETRIES`, `$LRU_EVICTION_MAX_RETRIES` |
| `--lru-eviction-retry-interval` | `1s` | Interval between retries when waiting for busy models to become idle (e.g., `1s`, `2s`) | `$LOCALAI_LRU_EVICTION_RETRY_INTERVAL`, `$LRU_EVICTION_RETRY_INTERVAL` |
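As a quick sketch, the two eviction flags are typically combined with `--max-active-backends`:

```bash
./local-ai \
  --max-active-backends=2 \
  --lru-eviction-max-retries=50 \
  --lru-eviction-retry-interval=2s
```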
For more information on VRAM management, see [VRAM and Memory Management]({{%relref "advanced/vram-management" %}}).


@@ -18,10 +18,10 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration |
|----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
| [llama.cpp]({{%relref "features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA 11/12, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| [vLLM](https://github.com/vllm-project/vllm) | Various GPTs and quantization formats | yes | GPT | no | no | CUDA 12, ROCm, Intel |
| [transformers](https://github.com/huggingface/transformers) | Various GPTs and quantization formats | yes | GPT, embeddings, Audio generation | yes | yes* | CUDA 11/12, ROCm, Intel, CPU |
| [exllama2](https://github.com/turboderp-org/exllamav2) | GPTQ | yes | GPT only | no | no | CUDA 12 |
| [llama.cpp]({{%relref "features/text-generation#llama.cpp" %}}) | LLama, Mamba, RWKV, Falcon, Starcoder, GPT-2, [and many others](https://github.com/ggerganov/llama.cpp?tab=readme-ov-file#description) | yes | GPT and Functions | yes | yes | CUDA 11/12/13, ROCm, Intel SYCL, Vulkan, Metal, CPU |
| [vLLM](https://github.com/vllm-project/vllm) | Various GPTs and quantization formats | yes | GPT | no | no | CUDA 12/13, ROCm, Intel |
| [transformers](https://github.com/huggingface/transformers) | Various GPTs and quantization formats | yes | GPT, embeddings, Audio generation | yes | yes* | CUDA 11/12/13, ROCm, Intel, CPU |
| [exllama2](https://github.com/turboderp-org/exllamav2) | GPTQ | yes | GPT only | no | no | CUDA 12/13 |
| [MLX](https://github.com/ml-explore/mlx-lm) | Various LLMs | yes | GPT | no | no | Metal (Apple Silicon) |
| [MLX-VLM](https://github.com/Blaizzy/mlx-vlm) | Vision-Language Models | yes | Multimodal GPT | no | no | Metal (Apple Silicon) |
| [langchain-huggingface](https://github.com/tmc/langchaingo) | Any text generators available on HuggingFace through API | yes | GPT | no | no | N/A |
@@ -30,47 +30,49 @@ LocalAI will attempt to automatically load models which are not explicitly confi
| Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration |
|----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
| [whisper.cpp](https://github.com/ggml-org/whisper.cpp) | whisper | no | Audio transcription | no | no | CUDA 12, ROCm, Intel SYCL, Vulkan, CPU |
| [faster-whisper](https://github.com/SYSTRAN/faster-whisper) | whisper | no | Audio transcription | no | no | CUDA 12, ROCm, Intel, CPU |
| [whisper.cpp](https://github.com/ggml-org/whisper.cpp) | whisper | no | Audio transcription | no | no | CUDA 12/13, ROCm, Intel SYCL, Vulkan, CPU |
| [faster-whisper](https://github.com/SYSTRAN/faster-whisper) | whisper | no | Audio transcription | no | no | CUDA 12/13, ROCm, Intel, CPU |
| [piper](https://github.com/rhasspy/piper) ([binding](https://github.com/mudler/go-piper)) | Any piper onnx model | no | Text to voice | no | no | CPU |
| [bark](https://github.com/suno-ai/bark) | bark | no | Audio generation | no | no | CUDA 12, ROCm, Intel |
| [bark](https://github.com/suno-ai/bark) | bark | no | Audio generation | no | no | CUDA 12/13, ROCm, Intel |
| [bark-cpp](https://github.com/PABannier/bark.cpp) | bark | no | Audio-Only | no | no | CUDA, Metal, CPU |
| [coqui](https://github.com/idiap/coqui-ai-TTS) | Coqui TTS | no | Audio generation and Voice cloning | no | no | CUDA 12, ROCm, Intel, CPU |
| [kokoro](https://github.com/hexgrad/kokoro) | Kokoro TTS | no | Text-to-speech | no | no | CUDA 12, ROCm, Intel, CPU |
| [chatterbox](https://github.com/resemble-ai/chatterbox) | Chatterbox TTS | no | Text-to-speech | no | no | CUDA 11/12, CPU |
| [coqui](https://github.com/idiap/coqui-ai-TTS) | Coqui TTS | no | Audio generation and Voice cloning | no | no | CUDA 12/13, ROCm, Intel, CPU |
| [kokoro](https://github.com/hexgrad/kokoro) | Kokoro TTS | no | Text-to-speech | no | no | CUDA 12/13, ROCm, Intel, CPU |
| [chatterbox](https://github.com/resemble-ai/chatterbox) | Chatterbox TTS | no | Text-to-speech | no | no | CUDA 11/12/13, CPU |
| [kitten-tts](https://github.com/KittenML/KittenTTS) | Kitten TTS | no | Text-to-speech | no | no | CPU |
| [silero-vad](https://github.com/snakers4/silero-vad) with [Golang bindings](https://github.com/streamer45/silero-vad-go) | Silero VAD | no | Voice Activity Detection | no | no | CPU |
| [neutts](https://github.com/neuphonic/neuttsair) | NeuTTSAir | no | Text-to-speech with voice cloning | no | no | CUDA 12, ROCm, CPU |
| [neutts](https://github.com/neuphonic/neuttsair) | NeuTTSAir | no | Text-to-speech with voice cloning | no | no | CUDA 12/13, ROCm, CPU |
| [vibevoice](https://github.com/microsoft/VibeVoice) | VibeVoice-Realtime | no | Real-time text-to-speech with voice cloning | no | no | CUDA 12/13, ROCm, Intel, CPU |
| [mlx-audio](https://github.com/Blaizzy/mlx-audio) | MLX | no | Text-to-speech | no | no | Metal (Apple Silicon) |
## Image & Video Generation
| Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration |
|----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
| [stablediffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker | no | Image | no | no | CUDA 12, Intel SYCL, Vulkan, CPU |
| [diffusers](https://github.com/huggingface/diffusers) | SD, various diffusion models,... | no | Image/Video generation | no | no | CUDA 11/12, ROCm, Intel, Metal, CPU |
| [stablediffusion.cpp](https://github.com/leejet/stable-diffusion.cpp) | stablediffusion-1, stablediffusion-2, stablediffusion-3, flux, PhotoMaker | no | Image | no | no | CUDA 12/13, Intel SYCL, Vulkan, CPU |
| [diffusers](https://github.com/huggingface/diffusers) | SD, various diffusion models,... | no | Image/Video generation | no | no | CUDA 11/12/13, ROCm, Intel, Metal, CPU |
| [transformers-musicgen](https://github.com/huggingface/transformers) | MusicGen | no | Audio generation | no | no | CUDA, CPU |
## Specialized AI Tasks
| Backend and Bindings | Compatible models | Completion/Chat endpoint | Capability | Embeddings support | Token stream support | Acceleration |
|----------------------------------------------------------------------------------|-----------------------|--------------------------|---------------------------|-----------------------------------|----------------------|--------------|
| [rfdetr](https://github.com/roboflow/rf-detr) | RF-DETR | no | Object Detection | no | no | CUDA 12, Intel, CPU |
| [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API | no | Reranking | no | no | CUDA 11/12, ROCm, Intel, CPU |
| [rfdetr](https://github.com/roboflow/rf-detr) | RF-DETR | no | Object Detection | no | no | CUDA 12/13, Intel, CPU |
| [rerankers](https://github.com/AnswerDotAI/rerankers) | Reranking API | no | Reranking | no | no | CUDA 11/12/13, ROCm, Intel, CPU |
| [local-store](https://github.com/mudler/LocalAI) | Vector database | no | Vector storage | yes | no | CPU |
| [huggingface](https://huggingface.co/docs/hub/en/api) | HuggingFace API models | yes | Various AI tasks | yes | yes | API-based |
## Acceleration Support Summary
### GPU Acceleration
- **NVIDIA CUDA**: CUDA 11.7, CUDA 12.0 support across most backends
- **NVIDIA CUDA**: CUDA 11.7, CUDA 12.0, CUDA 13.0 support across most backends
- **AMD ROCm**: HIP-based acceleration for AMD GPUs
- **Intel oneAPI**: SYCL-based acceleration for Intel GPUs (F16/F32 precision)
- **Vulkan**: Cross-platform GPU acceleration
- **Metal**: Apple Silicon GPU acceleration (M1/M2/M3+)
### Specialized Hardware
- **NVIDIA Jetson (L4T)**: ARM64 support for embedded AI
- **NVIDIA Jetson (L4T CUDA 12)**: ARM64 support for embedded AI (AGX Orin, Jetson Nano, Jetson Xavier NX, Jetson AGX Xavier)
- **NVIDIA Jetson (L4T CUDA 13)**: ARM64 support for embedded AI (DGX Spark)
- **Apple Silicon**: Native Metal acceleration for Mac M1/M2/M3+
- **Darwin x86**: Intel Mac support


@@ -5,16 +5,43 @@ title = "Running on Nvidia ARM64"
weight = 27
+++
LocalAI can be run on Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, and Jetson AGX Xavier. The following instructions will guide you through building the LocalAI container for Nvidia ARM64 devices.
LocalAI can be run on Nvidia ARM64 devices, such as the Jetson Nano, Jetson Xavier NX, Jetson AGX Orin, and Nvidia DGX Spark. The following instructions will guide you through building and using the LocalAI container for Nvidia ARM64 devices.
## Platform Compatibility
- **CUDA 12 L4T images**: Compatible with Nvidia AGX Orin and similar platforms (Jetson Nano, Jetson Xavier NX, Jetson AGX Xavier)
- **CUDA 13 L4T images**: Compatible with Nvidia DGX Spark
## Prerequisites
- Docker engine installed (https://docs.docker.com/engine/install/ubuntu/)
- Nvidia container toolkit installed (https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html#installing-with-ap)
## Pre-built Images
Pre-built images are available on quay.io and dockerhub:
### CUDA 12 (for AGX Orin and similar platforms)
```bash
docker pull quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64
# or
docker pull localai/localai:latest-nvidia-l4t-arm64
```
### CUDA 13 (for DGX Spark)
```bash
docker pull quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-cuda-13
# or
docker pull localai/localai:latest-nvidia-l4t-arm64-cuda-13
```
## Build the container
Build the LocalAI container for Nvidia ARM64 devices using the following command:
If you need to build the container yourself, use the following commands:
### CUDA 12 (for AGX Orin and similar platforms)
```bash
git clone https://github.com/mudler/LocalAI
@@ -24,18 +51,30 @@ cd LocalAI
docker build --build-arg SKIP_DRIVERS=true --build-arg BUILD_TYPE=cublas --build-arg BASE_IMAGE=nvcr.io/nvidia/l4t-jetpack:r36.4.0 --build-arg IMAGE_TYPE=core -t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core .
```
Otherwise images are available on quay.io and dockerhub:
### CUDA 13 (for DGX Spark)
```bash
docker pull quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
git clone https://github.com/mudler/LocalAI
cd LocalAI
docker build --build-arg SKIP_DRIVERS=false --build-arg BUILD_TYPE=cublas --build-arg CUDA_MAJOR_VERSION=13 --build-arg CUDA_MINOR_VERSION=0 --build-arg BASE_IMAGE=ubuntu:24.04 --build-arg IMAGE_TYPE=core -t quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-cuda-13-core .
```
## Usage
Run the LocalAI container on Nvidia ARM64 devices using the following command, where `/data/models` is the directory containing the models:
Run the LocalAI container on Nvidia ARM64 devices using the following commands, where `/data/models` is the directory containing the models:
### CUDA 12 (for AGX Orin and similar platforms)
```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:master-nvidia-l4t-arm64-core
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64
```
### CUDA 13 (for DGX Spark)
```bash
docker run -e DEBUG=true -p 8080:8080 -v /data/models:/models -ti --restart=always --name local-ai --runtime nvidia --gpus all quay.io/go-skynet/local-ai:latest-nvidia-l4t-arm64-cuda-13
```
Note: replace `/data/models` with the directory containing your models.


@@ -1,3 +1,3 @@
{
"version": "v3.8.0"
"version": "v3.9.0"
}


@@ -1,4 +1,170 @@
---
- name: "iquest-coder-v1-40b-instruct-i1"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/IQuest-Coder-V1-40B-Instruct-i1-GGUF
description: |
The **IQuest-Coder-V1-40B-Instruct-i1-GGUF** is a quantized version of the original **IQuestLab/IQuest-Coder-V1-40B-Instruct** model, designed for efficient deployment. It is an **instruction-following large language model** with 40 billion parameters, optimized for tasks like code generation and reasoning.
**Key Features:**
- **Size:** 40B parameters (quantized for efficiency).
- **Purpose:** Instruction-based coding and reasoning.
- **Format:** GGUF (supports multi-part files).
- **Quantization:** Uses advanced techniques (e.g., IQ3_M, Q4_K_M) for balance between performance and quality.
**Available Quantizations:**
- Optimized for speed and size: **i1-Q4_K_M** (recommended).
- Lower-quality options for trade-off between size/quality.
**Note:** This is a **quantized version** of the original model, but the base model (IQuestLab/IQuest-Coder-V1-40B-Instruct) is the official source. For full functionality, use the unquantized version or verify compatibility with your deployment tools.
overrides:
parameters:
model: llama-cpp/models/IQuest-Coder-V1-40B-Instruct.i1-Q4_K_M.gguf
name: IQuest-Coder-V1-40B-Instruct-i1-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/mradermacher/IQuest-Coder-V1-40B-Instruct-i1-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/IQuest-Coder-V1-40B-Instruct.i1-Q4_K_M.gguf
sha256: 0090b84ea8e5a862352cbb44498bd6b4cd38564834182813c35ed84209050b51
uri: https://huggingface.co/mradermacher/IQuest-Coder-V1-40B-Instruct-i1-GGUF/resolve/main/IQuest-Coder-V1-40B-Instruct.i1-Q4_K_M.gguf
- name: "onerec-8b"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/OneRec-8B-GGUF
description: |
The model `mradermacher/OneRec-8B-GGUF` is a quantized version of the base model `OpenOneRec/OneRec-8B`, a large language model designed for tasks like recommendations or content generation. It is optimized for efficiency with various quantization schemes (e.g., Q2_K, Q4_K, Q8_0) and available in multiple sizes (3.5 GB to 9.0 GB). The model uses the GGUF format and is licensed under Apache-2.0. Key features include:
- **Base Model**: `OpenOneRec/OneRec-8B` (a pre-trained language model for recommendations).
- **Quantization**: Supports multiple quantized variants (Q2_K, Q3_K, Q4_K, etc.), with the best quality for `Q4_K_S` and `Q8_0`.
- **Sizes**: Available in sizes ranging from 3.5 GB (Q2_K) to 9.0 GB (Q8_0), with faster speeds for lower-bit quantized versions.
- **Usage**: Compatible with GGUF files, suitable for deployment in applications requiring efficient model inference.
- **Licence**: Apache-2.0, available at [https://huggingface.co/OpenOneRec/OneRec-8B/blob/main/LICENSE](https://huggingface.co/OpenOneRec/OneRec-8B/blob/main/LICENSE).
For detailed specifications, refer to the [model page](https://hf.tst.eu/model#OneRec-8B-GGUF).
overrides:
parameters:
model: llama-cpp/models/OneRec-8B.Q4_K_M.gguf
name: OneRec-8B-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/mradermacher/OneRec-8B-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/OneRec-8B.Q4_K_M.gguf
sha256: f19217971ee5a7a909c9217a79d09fb573380f5018e25dcb32693139e59b434f
uri: https://huggingface.co/mradermacher/OneRec-8B-GGUF/resolve/main/OneRec-8B.Q4_K_M.gguf
- name: "minimax-m2.1-i1"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/MiniMax-M2.1-i1-GGUF
description: |
The model **MiniMax-M2.1** (base model: *MiniMaxAI/MiniMax-M2.1*) is a large language model quantized for efficient deployment. It is optimized for speed and memory usage, with quantized versions available in various formats (e.g., GGUF) for different performance trade-offs. The quantization is done by the user, and the model is licensed under the *modified-mit* license.
Key features:
- **Quantized versions**: Includes low-precision (IQ1, IQ2, Q2_K, etc.) and high-precision (Q4_K_M, Q6_K) options.
- **Usage**: Requires GGUF files; see [TheBloke's documentation](https://huggingface.co/TheBloke/KafkaLM-70B-German-V0.1-GGUF) for details on integration.
- **License**: Modified MIT (see [license link](https://github.com/MiniMax-AI/MiniMax-M2.1/blob/main/LICENSE)).
For gallery use, emphasize its quantized variants, performance trade-offs, and licensing.
overrides:
parameters:
model: llama-cpp/models/MiniMax-M2.1.i1-Q4_K_M.gguf
name: MiniMax-M2.1-i1-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/mradermacher/MiniMax-M2.1-i1-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/MiniMax-M2.1.i1-Q4_K_M.gguf
sha256: dba387e17ddd9b4559fb6f14459fcece7f00c66bbe4062d7ceea7fb9568e3282
uri: https://huggingface.co/mradermacher/MiniMax-M2.1-i1-GGUF/resolve/main/MiniMax-M2.1.i1-Q4_K_M.gguf
- name: "tildeopen-30b-instruct-lv-i1"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/mradermacher/TildeOpen-30B-Instruct-LV-i1-GGUF
description: |
The **TildeOpen-30B-Instruct-LV-i1-GGUF** is a quantized version of the base model **pazars/TildeOpen-30B-Instruct-LV**, optimized for deployment. It is an instruct-based language model trained on diverse datasets, supporting multiple languages (en, de, fr, pl, ru, it, pt, cs, nl, es, fi, tr, hu, bg, uk, bs, hr, da, et, lt, ro, sk, sl, sv, no, lv, sr, sq, mk, is, mt, ga). Licensed under CC-BY-4.0, it uses the Transformers library and is designed for efficient inference. The quantized version (with imatrix format) is tailored for deployment on devices with limited resources, while the base model remains the original, high-quality version.
overrides:
parameters:
model: llama-cpp/models/TildeOpen-30B-Instruct-LV.i1-Q4_K_M.gguf
name: TildeOpen-30B-Instruct-LV-i1-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/mradermacher/TildeOpen-30B-Instruct-LV-i1-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/TildeOpen-30B-Instruct-LV.i1-Q4_K_M.gguf
sha256: 48ed550e9ce7278ac456a43634c2a5804ba273522021434dfa0aa85dda3167b3
uri: https://huggingface.co/mradermacher/TildeOpen-30B-Instruct-LV-i1-GGUF/resolve/main/TildeOpen-30B-Instruct-LV.i1-Q4_K_M.gguf
- name: "allenai_olmo-3.1-32b-think"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
- https://huggingface.co/bartowski/allenai_Olmo-3.1-32B-Think-GGUF
description: |
The **Olmo-3.1-32B-Think** model is a large language model (LLM) optimized for efficient inference using quantized versions. It is a quantized version of the original **allenai/Olmo-3.1-32B-Think** model, developed by **bartowski** using the **imatrix** quantization method.
### Key Features:
- **Base Model**: `allenai/Olmo-3.1-32B-Think` (unquantized version).
- **Quantized Versions**: Available in multiple formats (e.g., `Q6_K_L`, `Q4_1`, `bf16`) with varying precision (e.g., Q8_0, Q6_K_L, Q5_K_M). These are derived from the original model using the **imatrix calibration dataset**.
- **Performance**: Optimized for low-memory usage and efficient inference on GPUs/CPUs. Recommended quantization types include `Q6_K_L` (near-perfect quality) or `Q4_K_M` (default, balanced performance).
- **Downloads**: Available via Hugging Face CLI. Split into multiple files if needed for large models.
- **License**: Apache-2.0.
### Recommended Quantization:
- Use `Q6_K_L` for highest quality (near-perfect performance).
- Use `Q4_K_M` for balanced performance and size.
- Avoid lower-quality options (e.g., `Q3_K_S`) unless specific hardware constraints apply.
This model is ideal for deploying on GPUs/CPUs with limited memory, leveraging efficient quantization for practical use cases.
overrides:
parameters:
model: llama-cpp/models/allenai_Olmo-3.1-32B-Think-Q4_K_M.gguf
name: allenai_Olmo-3.1-32B-Think-GGUF
backend: llama-cpp
template:
use_tokenizer_template: true
known_usecases:
- chat
function:
grammar:
disable: true
description: Imported from https://huggingface.co/bartowski/allenai_Olmo-3.1-32B-Think-GGUF
options:
- use_jinja:true
files:
- filename: llama-cpp/models/allenai_Olmo-3.1-32B-Think-Q4_K_M.gguf
sha256: 09ca87494efb75f6658a0c047414cccc5fb29d26a49c650a90af7c8f0412fdac
uri: https://huggingface.co/bartowski/allenai_Olmo-3.1-32B-Think-GGUF/resolve/main/allenai_Olmo-3.1-32B-Think-Q4_K_M.gguf
- name: "huihui-glm-4.6v-flash-abliterated"
url: "github:mudler/LocalAI/gallery/virtual.yaml@master"
urls:
@@ -219,9 +385,9 @@
files:
- filename: Qwen3-VL-30B-A3B-Instruct-Q4_K_M.gguf
uri: huggingface://unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF/Qwen3-VL-30B-A3B-Instruct-Q4_K_M.gguf
sha256: dfee58d4227981d04dd8558b7d8d50073b4d69a6d017a67ed42795d303b6f9ef
sha256: 7ea0a652b4bda1c1911a93a79a7cd98b92011dfea078e87328285294b2b4ab44
- filename: mmproj/mmproj-F16.gguf
sha256: 7e7cec67a3a887bddbf38099738d08570e85f08dd126578fa00a7acf4dacef01
sha256: 9f248089357599a08a23af40cb5ce0030de14a2e119b7ef57f66cb339bd20819
uri: huggingface://unsloth/Qwen3-VL-30B-A3B-Instruct-GGUF/mmproj-F16.gguf
- !!merge <<: *qwen3vl
name: "qwen3-vl-30b-a3b-thinking"
@@ -236,10 +402,10 @@
files:
- filename: Qwen3-VL-30B-A3B-Thinking-Q4_K_M.gguf
uri: huggingface://unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF/Qwen3-VL-30B-A3B-Thinking-Q4_K_M.gguf
sha256: 68aacddb0c5598150fcbdb38916606e070101439714726352b808e1efa075e53
sha256: b5622d28d2deb398558841fb29060f0ad241bd30f6afe79ed3fcf78d5fbf887b
- filename: mmproj/mmproj-F16.gguf
uri: huggingface://unsloth/Qwen3-VL-30B-A3B-Thinking-GGUF/mmproj-F16.gguf
sha256: 752f8f67171e1d3c752b638b1b210a4c75dd0731200595f496ef8b26040ce35d
sha256: 7c5d39a9dc4645fc49a39a1c5a96157825af4d1c6e0961bed5d667a65b4b9572
- !!merge <<: *qwen3vl
name: "qwen3-vl-4b-instruct"
urls:
@@ -269,11 +435,11 @@
model: Qwen3-VL-32B-Instruct-Q4_K_M.gguf
files:
- filename: Qwen3-VL-32B-Instruct-Q4_K_M.gguf
sha256: 17885d28e964b22b2faa981a7eaeeeb78da0972ee5f826ad5965f7583a610d9f
uri: huggingface://unsloth/Qwen3-VL-32B-Instruct-GGUF/Qwen3-VL-32B-Instruct-Q4_K_M.gguf
sha256: 92d605566f8661b296251c535ed028ecf81c32e14e06948a3d8bef829e96a804
- filename: mmproj/mmproj-Qwen3-VL-32B-Instruct-F16.gguf
sha256: 14b1d68befa75a5e646dd990c5bb429c912b7aa9b49b9ab18231ca5f750421c9
uri: huggingface://unsloth/Qwen3-VL-32B-Instruct-GGUF/mmproj-F16.gguf
sha256: dde7e407cf72e601455976c2d0daa960d16ee34ba3f0c78718c881d8cd8c1052
- !!merge <<: *qwen3vl
name: "qwen3-vl-4b-thinking"
urls:

go.mod

@@ -9,22 +9,17 @@ require (
fyne.io/fyne/v2 v2.7.1
github.com/Masterminds/sprig/v3 v3.3.0
github.com/alecthomas/kong v1.13.0
github.com/census-instrumentation/opencensus-proto v0.2.1
github.com/charmbracelet/glamour v0.10.0
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f
github.com/containerd/containerd v1.7.30
github.com/ebitengine/purego v0.9.1
github.com/emirpasic/gods/v2 v2.0.0-alpha
github.com/fsnotify/fsnotify v1.9.0
github.com/go-audio/wav v1.1.0
github.com/go-skynet/go-llama.cpp v0.0.0-20240314183750-6a8041ef6b46
github.com/gofiber/fiber/v2 v2.52.10
github.com/gofiber/websocket/v2 v2.2.1
github.com/gofrs/flock v0.13.0
github.com/golang/protobuf v1.5.4
github.com/google/go-containerregistry v0.20.7
github.com/google/uuid v1.6.0
github.com/gpustack/gguf-parser-go v0.22.1
github.com/grpc-ecosystem/grpc-gateway v1.16.0
github.com/hpcloud/tail v1.0.0
github.com/ipfs/go-log v1.0.5
github.com/jaypipes/ghw v0.21.2
@@ -35,7 +30,7 @@ require (
github.com/lithammer/fuzzysearch v1.1.8
github.com/mholt/archiver/v3 v3.5.1
github.com/microcosm-cc/bluemonday v1.0.27
github.com/modelcontextprotocol/go-sdk v1.1.0
github.com/modelcontextprotocol/go-sdk v1.2.0
github.com/mudler/cogito v0.7.2
github.com/mudler/edgevpn v0.31.1
github.com/mudler/go-processmanager v0.0.0-20240820160718-8b802d3ecf82
@@ -50,7 +45,7 @@ require (
github.com/robfig/cron/v3 v3.0.1
github.com/russross/blackfriday v1.6.0
github.com/sashabaranov/go-openai v1.41.2
github.com/schollz/progressbar/v3 v3.18.0
github.com/schollz/progressbar/v3 v3.19.0
github.com/shirou/gopsutil/v3 v3.24.5
github.com/streamer45/silero-vad-go v0.2.1
github.com/stretchr/testify v1.11.1
@@ -62,33 +57,18 @@ require (
go.opentelemetry.io/otel/exporters/prometheus v0.61.0
go.opentelemetry.io/otel/metric v1.39.0
go.opentelemetry.io/otel/sdk/metric v1.39.0
google.golang.org/api v0.218.0
google.golang.org/grpc v1.77.0
google.golang.org/protobuf v1.36.10
google.golang.org/grpc v1.78.0
gopkg.in/yaml.v2 v2.4.0
gopkg.in/yaml.v3 v3.0.1
oras.land/oras-go/v2 v2.6.0
)
require (
cel.dev/expr v0.24.0 // indirect
cloud.google.com/go/auth v0.14.0 // indirect
cloud.google.com/go/auth/oauth2adapt v0.2.7 // indirect
cloud.google.com/go/compute/metadata v0.9.0 // indirect
github.com/envoyproxy/protoc-gen-validate v1.2.1 // indirect
github.com/fasthttp/websocket v1.5.3 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/google/s2a-go v0.1.9 // indirect
github.com/googleapis/enterprise-certificate-proxy v0.3.4 // indirect
github.com/googleapis/gax-go/v2 v2.14.1 // indirect
github.com/labstack/gommon v0.4.2 // indirect
github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee // indirect
github.com/swaggo/files/v2 v2.0.2 // indirect
github.com/valyala/fasthttp v1.51.0 // indirect
github.com/valyala/fasttemplate v1.2.2 // indirect
github.com/valyala/tcplisten v1.0.0 // indirect
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 // indirect
google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 // indirect
google.golang.org/protobuf v1.36.10 // indirect
)
require (
@@ -347,7 +327,7 @@ require (
golang.zx2c4.com/wireguard v0.0.0-20250521234502-f333402bd9cb // indirect
golang.zx2c4.com/wireguard/windows v0.5.3 // indirect
gonum.org/v1/gonum v0.16.0 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 // indirect
google.golang.org/genproto/googleapis/rpc v0.0.0-20251029180050-ab9386a59fda // indirect
gopkg.in/fsnotify.v1 v1.4.7 // indirect
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 // indirect
howett.net/plist v1.0.2-0.20250314012144-ee69052608d9 // indirect

go.sum

@@ -1,15 +1,7 @@
cel.dev/expr v0.24.0 h1:56OvJKSH3hDGL0ml5uSxZmz3/3Pq4tJ+fb1unVLAFcY=
cel.dev/expr v0.24.0/go.mod h1:hLPLo1W4QUmuYdA72RBX06QTs6MXw941piREPl3Yfiw=
cloud.google.com/go v0.26.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.31.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.34.0/go.mod h1:aQUYkXzVsufM+DwF1aE+0xfcU+56JwCaLick0ClmMTw=
cloud.google.com/go v0.37.0/go.mod h1:TS1dMSSfndXH133OKGwekG838Om/cQT0BUHV3HcBgoo=
cloud.google.com/go/auth v0.14.0 h1:A5C4dKV/Spdvxcl0ggWwWEzzP7AZMJSEIgrkngwhGYM=
cloud.google.com/go/auth v0.14.0/go.mod h1:CYsoRL1PdiDuqeQpZE0bP2pnPrGqFcOkI0nldEQis+A=
cloud.google.com/go/auth/oauth2adapt v0.2.7 h1:/Lc7xODdqcEw8IrZ9SvwnlLX6j9FHQM74z6cBk9Rw6M=
cloud.google.com/go/auth/oauth2adapt v0.2.7/go.mod h1:NTbTTzfvPl1Y3V1nPpOgl2w6d/FjO7NNUQaWSox6ZMc=
cloud.google.com/go/compute/metadata v0.9.0 h1:pDUj4QMoPejqq20dK0Pg2N4yG9zIkYGdBtwLoEkH9Zs=
cloud.google.com/go/compute/metadata v0.9.0/go.mod h1:E0bWwX5wTnLPedCKqk3pJmVgCBSM6qQI1yTBdEb3C10=
dario.cat/mergo v1.0.2 h1:85+piFYR1tMbRrLcDwR18y4UKJ3aH1Tbzi24VRW1TK8=
dario.cat/mergo v1.0.2/go.mod h1:E/hbnu0NxMFBjpMIE34DRGLWqDy0g5FuKDhCb31ngxA=
dmitri.shuralyov.com/app/changes v0.0.0-20180602232624-0a106ad413e3/go.mod h1:Yl+fi1br7+Rr3LqpNJf1/uxUdtRUV+Tnj0o93V2B9MU=
@@ -52,7 +44,6 @@ github.com/andybalholm/brotli v1.0.1/go.mod h1:loMXtMfwqflxFJPmdbJO0a3KNoPuLBgiu
github.com/andybalholm/brotli v1.2.0 h1:ukwgCxwYrmACq68yiUqwIWnGY0cTPox/M94sVwToPjQ=
github.com/andybalholm/brotli v1.2.0/go.mod h1:rzTDkvFWvIrjDXZHkuS16NPggd91W3kUSvPlQ1pLaKY=
github.com/anmitsu/go-shlex v0.0.0-20161002113705-648efa622239/go.mod h1:2FmKhYUyUczH0OGQWaF5ceTx0UBShxjsH6f8oGKYe2c=
github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/aymanbagabas/go-udiff v0.2.0 h1:TK0fH4MteXUDspT88n8CKzvK0X9O2xu9yQjWpi6yML8=
@@ -70,7 +61,6 @@ github.com/c-robinson/iplib v1.0.8 h1:exDRViDyL9UBLcfmlxxkY5odWX5092nPsQIykHXhIn
github.com/c-robinson/iplib v1.0.8/go.mod h1:i3LuuFL1hRT5gFpBRnEydzw8R6yhGkF4szNDIbF8pgo=
github.com/cenkalti/backoff/v4 v4.3.0 h1:MyRJ/UdXutAwSAT+s3wNd7MfTIcy71VQueUuFK343L8=
github.com/cenkalti/backoff/v4 v4.3.0/go.mod h1:Y3VNntkOUPxTVeUxJ/G5vcM//AlwfmyYozVcomhLiZE=
github.com/census-instrumentation/opencensus-proto v0.2.1 h1:glEXhBS5PSLLv4IXzLA5yPRVX4bilULVyxxbrfOtDAk=
github.com/census-instrumentation/opencensus-proto v0.2.1/go.mod h1:f6KPmirojxKA12rnyqOA5BBL4O983OfeGPqjHWSTneU=
github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs=
github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs=
@@ -94,8 +84,6 @@ github.com/chengxilo/virtualterm v1.0.4 h1:Z6IpERbRVlfB8WkOmtbHiDbBANU7cimRIof7m
github.com/chengxilo/virtualterm v1.0.4/go.mod h1:DyxxBZz/x1iqJjFxTFcr6/x+jSpqN0iwWCOK1q10rlY=
github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDkc90ppPyw=
github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc=
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f h1:Y8xYupdHxryycyPlc9Y+bSQAYZnetRJ70VMVKm5CKI0=
github.com/cncf/xds/go v0.0.0-20251022180443-0feb69152e9f/go.mod h1:HlzOvOjVBOfTGSRXRyY0OiCS/3J1akRGQQpRO/7zyF4=
github.com/containerd/cgroups v1.1.0 h1:v8rEWFl6EoqHB+swVNjVoCJE8o3jX7e8nqBGPLaDFBM=
github.com/containerd/cgroups v1.1.0/go.mod h1:6ppBcbh/NOOUU+dMKrykgaBnK9lCIBxHqJDGwsa1mIw=
github.com/containerd/containerd v1.7.30 h1:/2vezDpLDVGGmkUXmlNPLCCNKHJ5BbC5tJB5JNzQhqE=
@@ -154,14 +142,12 @@ github.com/dsnet/golib v0.0.0-20171103203638-1ea166775780/go.mod h1:Lj+Z9rebOhdf
github.com/dustin/go-humanize v1.0.0/go.mod h1:HtrtbFcZ19U5GC7JDqmcUSB87Iq5E25KnS6fMYU6eOk=
github.com/ebitengine/purego v0.9.1 h1:a/k2f2HQU3Pi399RPW1MOaZyhKJL9w/xFpKAg4q1s0A=
github.com/ebitengine/purego v0.9.1/go.mod h1:iIjxzd6CiRiOG0UyXP+V1+jWqUXVjPKLAI0mRfJZTmQ=
github.com/emirpasic/gods/v2 v2.0.0-alpha h1:dwFlh8pBg1VMOXWGipNMRt8v96dKAIvBehtCt6OtunU=
github.com/emirpasic/gods/v2 v2.0.0-alpha/go.mod h1:W0y4M2dtBB9U5z3YlghmpuUhiaZT2h6yoeE+C1sCp6A=
github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4=
github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98=
github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c=
github.com/envoyproxy/protoc-gen-validate v1.2.1 h1:DEo3O99U8j4hBFwbJfrz9VtgcDfUKS7KJ7spH3d86P8=
github.com/envoyproxy/protoc-gen-validate v1.2.1/go.mod h1:d/C80l/jxXLdfEIhX1W2TmLfsJ31lvEjwamM4DxlWXU=
github.com/fasthttp/websocket v1.5.3 h1:TPpQuLwJYfd4LJPXvHDYPMFWbLjsT91n3GpWtCQtdek=
github.com/fasthttp/websocket v1.5.3/go.mod h1:46gg/UBmTU1kUaTcwQXpUxtRwG2PvIZYeA8oL6vF3Fs=
github.com/felixge/fgprof v0.9.3 h1:VvyZxILNuCiUCSXtPtYmmtGvb65nqXh2QFWc0Wpf2/g=
github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw=
github.com/felixge/httpsnoop v1.0.4 h1:NFTV2Zj1bL4mc9sqWACXbQFVBBg2W3GPvqp8/ESS2Wg=
@@ -237,15 +223,13 @@ github.com/goccy/go-yaml v1.18.0 h1:8W7wMFS12Pcas7KU+VVkaiCng+kG8QiFeFwzFb+rwuw=
github.com/goccy/go-yaml v1.18.0/go.mod h1:XBurs7gK8ATbW4ZPGKgcbrY1Br56PdM69F7LkFRi1kA=
github.com/godbus/dbus/v5 v5.1.0 h1:4KLkAxT3aOY8Li4FRJe/KvhoNFFxo0m6fNuFUO8QJUk=
github.com/godbus/dbus/v5 v5.1.0/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA=
github.com/gofiber/fiber/v2 v2.52.10 h1:jRHROi2BuNti6NYXmZ6gbNSfT3zj/8c0xy94GOU5elY=
github.com/gofiber/fiber/v2 v2.52.10/go.mod h1:YEcBbO/FB+5M1IZNBP9FO3J9281zgPAreiI1oqg8nDw=
github.com/gofiber/websocket/v2 v2.2.1 h1:C9cjxvloojayOp9AovmpQrk8VqvVnT8Oao3+IUygH7w=
github.com/gofiber/websocket/v2 v2.2.1/go.mod h1:Ao/+nyNnX5u/hIFPuHl28a+NIkrqK7PRimyKaj4JxVU=
github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw=
github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0=
github.com/gogo/protobuf v1.1.1/go.mod h1:r8qH/GZQm5c6nD/R0oafs1akxWv10x8SbQlK7atdtwQ=
github.com/gogo/protobuf v1.3.2 h1:Ov1cvc58UF3b5XjBnZv7+opcTcQFZebYjWzi34vdm4Q=
github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q=
github.com/golang-jwt/jwt/v5 v5.3.0 h1:pv4AsKCKKZuqlgs5sUmn4x8UlGa0kEVt/puTpKx9vvo=
github.com/golang-jwt/jwt/v5 v5.3.0/go.mod h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE=
github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q=
github.com/golang/groupcache v0.0.0-20200121045136-8c9f03a8e57e/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc=
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da h1:oI5xCqsCo564l8iNU+DwB5epxmsaqB+rhGL0m5jtYqE=
@@ -256,7 +240,6 @@ github.com/golang/mock v1.2.0/go.mod h1:oTYuIxOrZwtPieC+H1uAHpcLFnEyAGVDL/k47Jfb
github.com/golang/protobuf v1.2.0/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.1/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.2/go.mod h1:6lQm79b+lXiMfvg/cZm0SGofjICqVBUtrP5yJMmIC1U=
github.com/golang/protobuf v1.3.3/go.mod h1:vzj43D7+SQXF/4pzW/hwtAqwc6iTitCiVSaWz5lYuqw=
github.com/golang/protobuf v1.4.0-rc.1/go.mod h1:ceaxUfeHdC40wWswd/P6IGgMaK3YpKi5j83Wpe3EHw8=
github.com/golang/protobuf v1.4.0-rc.1.0.20200221234624-67d41d38c208/go.mod h1:xKAWHe0F5eneWXFV3EuXVDTCmh+JuBKY0li0aMyXATA=
github.com/golang/protobuf v1.4.0-rc.2/go.mod h1:LlEzMj4AhA7rCAGe4KMBDvJI+AwstrUpVNzEA03Pprs=
@@ -295,17 +278,11 @@ github.com/google/pprof v0.0.0-20181206194817-3ea8567a2e57/go.mod h1:zfwlbNMJ+OI
github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5 h1:xhMrHhTJ6zxu3gA4enFM9MLn9AY7613teCdFnlUVbSQ=
github.com/google/pprof v0.0.0-20250630185457-6e76a2b096b5/go.mod h1:5hDyRhoBCxViHszMt12TnOpEI4VVi+U8Gm9iphldiMA=
github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI=
github.com/google/s2a-go v0.1.9 h1:LGD7gtMgezd8a/Xak7mEWL0PjoTQFvpRudN895yqKW0=
github.com/google/s2a-go v0.1.9/go.mod h1:YA0Ei2ZQL3acow2O62kdp9UlnvMmU7kA6Eutn0dXayM=
github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0=
github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo=
github.com/googleapis/enterprise-certificate-proxy v0.3.4 h1:XYIDZApgAnrN1c855gTgghdIA6Stxb52D5RnLI1SLyw=
github.com/googleapis/enterprise-certificate-proxy v0.3.4/go.mod h1:YKe7cfqYXjKGpGvmSg28/fFvhNzinZQm8DGnaburhGA=
github.com/googleapis/gax-go v2.0.0+incompatible/go.mod h1:SFVmujtThgffbyetf+mdk2eWhX2bMyUtNHzFKcPA9HY=
github.com/googleapis/gax-go/v2 v2.0.3/go.mod h1:LLvjysVCY1JZeum8Z6l8qUty8fiNwE08qbEPm1M08qg=
github.com/googleapis/gax-go/v2 v2.14.1 h1:hb0FFeiPaQskmvakKu5EbCbpntQn48jyHuvrkurSS/Q=
github.com/googleapis/gax-go/v2 v2.14.1/go.mod h1:Hb/NubMaVM88SrNkvl8X/o8XWwDJEPqouaLeN2IUxoA=
github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c h1:7lF+Vz0LqiRidnzC1Oq86fpX1q/iEv2KJdrCtttYjT4=
github.com/gopherjs/gopherjs v0.0.0-20190430165422-3e4dfb77656c/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY=
@@ -318,7 +295,6 @@ github.com/gpustack/gguf-parser-go v0.22.1/go.mod h1:y4TwTtDqFWTK+xvprOjRUh+dowg
github.com/gregjones/httpcache v0.0.0-20180305231024-9cad4c3443a7/go.mod h1:FecbI9+v66THATjSRHfNgh1IVFe/9kFxbXtjV0ctIMA=
github.com/grpc-ecosystem/grpc-gateway v1.5.0/go.mod h1:RSKVYQBd5MCa4OVpNdGskqpgL2+G+NZTnrVHpWWfpdw=
github.com/grpc-ecosystem/grpc-gateway v1.16.0 h1:gmcG1KaJ57LophUzW0Hy8NmPhnMZb4M0+kPpLofRdBo=
github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0 h1:asbCHRVmodnJTuQ3qamDwqVOIjwqUPTYmYuemVOx+Ys=
github.com/grpc-ecosystem/grpc-gateway/v2 v2.22.0/go.mod h1:ggCgvZ2r7uOoQjOyu2Y1NhHmEPPzzuhWgcza5M1Ji1I=
github.com/hack-pad/go-indexeddb v0.3.2 h1:DTqeJJYc1usa45Q5r52t01KhvlSN02+Oq+tQbSBI91A=
@@ -516,8 +492,8 @@ github.com/moby/sys/userns v0.1.0 h1:tVLXkFOxVu9A64/yh59slHVv9ahO9UIev4JZusOLG/g
github.com/moby/sys/userns v0.1.0/go.mod h1:IHUYgu/kao6N8YZlp9Cf444ySSvCmDlmzUcYfDHOl28=
github.com/moby/term v0.5.2 h1:6qk3FJAFDs6i/q3W/pQ97SX192qKfZgGjCQqfCJkgzQ=
github.com/moby/term v0.5.2/go.mod h1:d3djjFCrjnB+fl8NJux+EJzu0msscUP+f8it8hPkFLc=
github.com/modelcontextprotocol/go-sdk v1.1.0 h1:Qjayg53dnKC4UZ+792W21e4BpwEZBzwgRW6LrjLWSwA=
github.com/modelcontextprotocol/go-sdk v1.1.0/go.mod h1:6fM3LCm3yV7pAs8isnKLn07oKtB0MP9LHd3DfAcKw10=
github.com/modelcontextprotocol/go-sdk v1.2.0 h1:Y23co09300CEk8iZ/tMxIX1dVmKZkzoSBZOpJwUnc/s=
github.com/modelcontextprotocol/go-sdk v1.2.0/go.mod h1:6fM3LCm3yV7pAs8isnKLn07oKtB0MP9LHd3DfAcKw10=
github.com/modern-go/concurrent v0.0.0-20180228061459-e0a39a4cb421/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd h1:TRLaZ9cD/w8PVh93nsPXa1VrQ6jlwL5oN8l14QlcNfg=
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd/go.mod h1:6dJC0mAP4ikYIbvyc7fijjWJddQyLn8Ig3JB5CqoB9Q=
@@ -691,7 +667,6 @@ github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/robfig/cron/v3 v3.0.1 h1:WdRxkvbJztn8LMz/QEvLN5sBU+xKpSqwwUO1Pjr4qDs=
github.com/robfig/cron/v3 v3.0.1/go.mod h1:eQICP3HwyT7UooqI/z+Ov+PtYAWygg1TEWWzGIFLtro=
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.3.0/go.mod h1:M8bDsm7K2OlrFYOpmOWEs/qY81heoFRclV5y23lUDJ4=
github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ=
github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc=
@@ -703,10 +678,8 @@ github.com/rymdport/portal v0.4.2 h1:7jKRSemwlTyVHHrTGgQg7gmNPJs88xkbKcIL3NlcmSU
github.com/rymdport/portal v0.4.2/go.mod h1:kFF4jslnJ8pD5uCi17brj/ODlfIidOxlgUDTO5ncnC4=
github.com/sashabaranov/go-openai v1.41.2 h1:vfPRBZNMpnqu8ELsclWcAvF19lDNgh1t6TVfFFOPiSM=
github.com/sashabaranov/go-openai v1.41.2/go.mod h1:lj5b/K+zjTSFxVLijLSTDZuP7adOgerWeFyZLUhAKRg=
github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee h1:8Iv5m6xEo1NR1AvpV+7XmhI4r39LGNzwUL4YpMuL5vk=
github.com/savsgio/gotils v0.0.0-20230208104028-c358bd845dee/go.mod h1:qwtSXrKuJh/zsFQ12yEE89xfCrGKK63Rr7ctU/uCo4g=
github.com/schollz/progressbar/v3 v3.18.0 h1:uXdoHABRFmNIjUfte/Ex7WtuyVslrw2wVPQmCN62HpA=
github.com/schollz/progressbar/v3 v3.18.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
github.com/schollz/progressbar/v3 v3.19.0 h1:Ea18xuIRQXLAUidVDox3AbwfUhD0/1IvohyTutOIFoc=
github.com/schollz/progressbar/v3 v3.19.0/go.mod h1:IsO3lpbaGuzh8zIMzgY3+J8l4C8GjO0Y9S69eFvNsec=
github.com/sergi/go-diff v1.0.0/go.mod h1:0CfEIISq7TuYL3j771MWULgwwjU+GofnZX9QAmXWZgo=
github.com/shirou/gopsutil/v3 v3.24.5 h1:i0t8kL+kQTvpAYToeuiVk3TgDeKOFioZO3Ztz/iZ9pI=
github.com/shirou/gopsutil/v3 v3.24.5/go.mod h1:bsoOS1aStSs9ErQ1WWfxllSeS1K5D+U30r2NfcubMVk=
@@ -810,12 +783,8 @@ github.com/ulikunitz/xz v0.5.14/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0o
github.com/urfave/cli v1.22.10/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
github.com/valyala/bytebufferpool v1.0.0 h1:GqA5TC/0021Y/b9FG4Oi9Mr3q7XYx6KllzawFIhcdPw=
github.com/valyala/bytebufferpool v1.0.0/go.mod h1:6bBcMArwyJ5K/AmCkWv1jt77kVWyCJ6HpOuEn7z0Csc=
github.com/valyala/fasthttp v1.51.0 h1:8b30A5JlZ6C7AS81RsWjYMQmrZG6feChmgAolCl1SqA=
github.com/valyala/fasthttp v1.51.0/go.mod h1:oI2XroL+lI7vdXyYoQk03bXBThfFl2cVdIA3Xl7cH8g=
github.com/valyala/fasttemplate v1.2.2 h1:lxLXG0uE3Qnshl9QyaK6XJxMXlQZELvChBOCmQD0Loo=
github.com/valyala/fasttemplate v1.2.2/go.mod h1:KHLXt3tVN2HBp8eijSv/kGJopbvo7S+qRAEEKiv+SiQ=
github.com/valyala/tcplisten v1.0.0 h1:rBHj/Xf+E1tRGZyWIWwJDiRY0zc1Js+CV5DqwacVSA8=
github.com/valyala/tcplisten v1.0.0/go.mod h1:T0xQ8SeCZGxckz9qRXTfG43PvQ/mcWh7FwZEA7Ioqkc=
github.com/vbatts/tar-split v0.12.2 h1:w/Y6tjxpeiFMR47yzZPlPj/FcPLpXbTUi/9H7d3CPa4=
github.com/vbatts/tar-split v0.12.2/go.mod h1:eF6B6i6ftWQcDqEn3/iGFRFRo8cBIMSJVOpnNdfTMFA=
github.com/viant/assertly v0.4.8/go.mod h1:aGifi++jvCrUaklKEKT0BU95igDNaqkvz+49uaYMPRU=
@@ -945,7 +914,6 @@ golang.org/x/net v0.0.0-20190313220215-9f648a60d977/go.mod h1:t9HGtf8HONx5eT2rtn
golang.org/x/net v0.0.0-20190404232315-eb5bcb51f2a3/go.mod h1:t9HGtf8HONx5eT2rtn7q6eTqICYqUVnKs3thJo3Qplg=
golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200226121028-0de0cce0169b/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s=
golang.org/x/net v0.0.0-20200822124328-c89045814202/go.mod h1:/O7V0waA8r7cgGh81Ro3o1hOxt32SMVPicZroKQ2sZA=
golang.org/x/net v0.0.0-20201021035429-f5854403a974/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20201110031124-69a78807bb2b/go.mod h1:sp8m0HH+o8qH0wwXwYZr8TS3Oi6o0r6Gce1SSxlDquU=
golang.org/x/net v0.0.0-20210119194325-5f4716e94777/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
@@ -963,7 +931,6 @@ golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAG
golang.org/x/oauth2 v0.0.0-20181017192945-9dcd33a902f4/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20181203162652-d668ce993890/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U=
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw=
golang.org/x/oauth2 v0.33.0 h1:4Q+qn+E5z8gPRJfmRy7C2gGG3T4jIprK6aSYgTXGRpo=
golang.org/x/oauth2 v0.33.0/go.mod h1:lzm5WQJQwKZ3nwavOZ3IS5Aulzxi68dUSgRHujetwEA=
golang.org/x/perf v0.0.0-20180704124530-6e6d33e29852/go.mod h1:JLpeXjPJfIyPr5TlbXLkXWLhP8nz10XfvxElABhCtcw=
@@ -986,7 +953,6 @@ golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5h
golang.org/x/sys v0.0.0-20190316082340-a2f829d7f35f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190412213103-97732733099d/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20190916202348-b4ddaad3f8a3/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200323222414-85ca7c5b95cd/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200602225109-6fdc65e7d980/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20200930185726-fdedc70b468f/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
@@ -1070,8 +1036,6 @@ gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E
google.golang.org/api v0.0.0-20180910000450-7ca32eb868bf/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0=
google.golang.org/api v0.0.0-20181030000543-1d582fd0359e/go.mod h1:4mhQ8q/RsB7i+udVvVy5NUi08OU8ZlA0gRVgrF7VFY0=
google.golang.org/api v0.1.0/go.mod h1:UGEZY7KEX120AnNLIHFMKIo4obdJhkp2tPbaPlQx13Y=
google.golang.org/api v0.218.0 h1:x6JCjEWeZ9PFCRe9z0FBrNwj7pB7DOAqT35N+IPnAUA=
google.golang.org/api v0.218.0/go.mod h1:5VGHBAkxrA/8EFjLVEYmMUJ8/8+gWWQ3s4cFH0FxG2M=
google.golang.org/appengine v1.1.0/go.mod h1:EbEs0AVv82hx2wNQdGPgUI5lhzA/G0D9YwlJXL52JkM=
google.golang.org/appengine v1.2.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
google.golang.org/appengine v1.3.0/go.mod h1:xpcJRLb0r/rnEns0DIKYYv+WjYCduHsrkT7/EB5XEv4=
@@ -1082,14 +1046,12 @@ google.golang.org/genproto v0.0.0-20181029155118-b69ba1387ce2/go.mod h1:JiN7NxoA
google.golang.org/genproto v0.0.0-20181202183823-bd91e49a0898/go.mod h1:7Ep/1NZk928CDR8SjdVbjWNpdIf6nzjE3BTgJDr2Atg=
google.golang.org/genproto v0.0.0-20190306203927-b5d61aea6440/go.mod h1:VzzqZJRnGkLBvHegQrXjBqPurQTc5/KpmUdxsrq26oE=
google.golang.org/genproto v0.0.0-20190819201941-24fa4b261c55/go.mod h1:DMBHOl98Agz4BDEuKkezgsaosCRResVns1a3J2ZsMNc=
google.golang.org/genproto v0.0.0-20200513103714-09dca8ec2884/go.mod h1:55QSHmfGQM9UVYDPBsyGGes0y52j32PQ3BqQfXhyH3c=
google.golang.org/genproto v0.0.0-20200526211855-cb27e3aa2013/go.mod h1:NbSheEEYHJ7i3ixzK3sjbqSGDJWnxyFXZblF3eUsNvo=
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697 h1:ToEetK57OidYuqD4Q5w+vfEnPvPpuTwedCNVohYJfNk=
google.golang.org/genproto v0.0.0-20241118233622-e639e219e697/go.mod h1:JJrvXBWRZaFMxBufik1a4RpFw4HhgVtBBWQeQgUj2cc=
google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8 h1:mepRgnBZa07I4TRuomDE4sTIYieg/osKmzIf4USdWS4=
google.golang.org/genproto/googleapis/api v0.0.0-20251022142026-3a174f9686a8/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8 h1:M1rk8KBnUsBDg1oPGHNCxG4vc1f49epmTO7xscSajMk=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251022142026-3a174f9686a8/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda h1:+2XxjfsAu6vqFxwGBRcHiMaDCuZiqXGDUDVWVtrFAnE=
google.golang.org/genproto/googleapis/api v0.0.0-20251029180050-ab9386a59fda/go.mod h1:fDMmzKV90WSg1NbozdqrE64fkuTv6mlq2zxo9ad+3yo=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251029180050-ab9386a59fda h1:i/Q+bfisr7gq6feoJnS/DlpdwEL4ihp41fvRiM3Ork0=
google.golang.org/genproto/googleapis/rpc v0.0.0-20251029180050-ab9386a59fda/go.mod h1:7i2o+ce6H/6BluujYR+kqX3GKH+dChPTQU19wjRPiGk=
google.golang.org/grpc v1.14.0/go.mod h1:yo6s7OP7yaDglbqo1J04qKzAhqBH6lvTonzMVmEdcZw=
google.golang.org/grpc v1.16.0/go.mod h1:0JHn/cJsOMiMfNA9+DeHDlAU7KAAB5GDlYFpa9MZMio=
google.golang.org/grpc v1.17.0/go.mod h1:6QZJwpn2B+Zp71q/5VxRsJ6NXXVCE5NRUHRo+f3cWCs=
@@ -1097,10 +1059,9 @@ google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZi
google.golang.org/grpc v1.23.0/go.mod h1:Y5yQAOtifL1yxbo5wqy6BxZv8vAUGQwXBOALyacEbxg=
google.golang.org/grpc v1.25.1/go.mod h1:c3i+UQWmh7LiEpx4sFZnkU36qjEYZ0imhYfXVyQciAY=
google.golang.org/grpc v1.27.0/go.mod h1:qbnxyOmOxrQa7FizSgH+ReBfzJrCY1pSN7KXBS8abTk=
google.golang.org/grpc v1.33.1/go.mod h1:fr5YgcSWrqhRRxogOsw7RzIpsmvOZ6IcH4kBYTpR3n0=
google.golang.org/grpc v1.33.2/go.mod h1:JMHMWHQWaTccqQQlmk3MJZS+GWXOdAesneDmEnv2fbc=
google.golang.org/grpc v1.77.0 h1:wVVY6/8cGA6vvffn+wWK5ToddbgdU3d8MNENr4evgXM=
google.golang.org/grpc v1.77.0/go.mod h1:z0BY1iVj0q8E1uSQCjL9cppRj+gnZjzDnzV0dHhrNig=
google.golang.org/grpc v1.78.0 h1:K1XZG/yGDJnzMdd/uZHAkVqJE+xIDOcmdSFZkBUicNc=
google.golang.org/grpc v1.78.0/go.mod h1:I47qjTo4OKbMkjA/aOOwxDIiPSBofUtQUI5EfpWvW7U=
google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
google.golang.org/protobuf v0.0.0-20200228230310-ab0ca4ff8a60/go.mod h1:cfTl7dwQJ+fmap5saPgwCLgHXTUD7jkjRqWcaiX5VyM=
@@ -1124,7 +1085,6 @@ gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7 h1:uRGJdciOHaEIrze2W8Q3AKkep
gopkg.in/tomb.v1 v1.0.0-20141024135613-dd632973f1e7/go.mod h1:dt/ZhP58zS4L8KSrWDmTeBkI65Dw0HsyUHuEVlX15mw=
gopkg.in/yaml.v2 v2.2.1/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI=
gopkg.in/yaml.v2 v2.4.0 h1:D8xgwECY7CYvx+Y2n4sBz93Jn9JRvxdiyyo8CTfuKaY=
gopkg.in/yaml.v2 v2.4.0/go.mod h1:RDklbk79AGWmwhnvt/jBztapEOGDOx6ZbXqjP6csGnQ=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=

View File

@@ -9,8 +9,8 @@ import (
"time"
grpc "github.com/mudler/LocalAI/pkg/grpc"
"github.com/phayes/freeport"
"github.com/mudler/xlog"
"github.com/phayes/freeport"
)
const (
@@ -173,7 +173,7 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
model, err := ml.LoadModel(o.modelID, o.model, ml.grpcModel(backend, o))
if err != nil {
if stopErr := ml.StopGRPC(only(o.modelID));stopErr != nil {
if stopErr := ml.StopGRPC(only(o.modelID)); stopErr != nil {
xlog.Error("error stopping model", "error", stopErr, "model", o.modelID)
}
xlog.Error("Failed to load model", "modelID", o.modelID, "error", err, "backend", o.backendString)
@@ -186,13 +186,47 @@ func (ml *ModelLoader) backendLoader(opts ...Option) (client grpc.Backend, err e
// enforceLRULimit enforces the LRU limit before loading a new model.
// This is called before loading a model to ensure we don't exceed the limit.
// It accounts for models that are currently being loaded by other goroutines.
// If models are busy and can't be evicted, it will wait and retry until space is available.
func (ml *ModelLoader) enforceLRULimit() {
if ml.wd == nil {
return
}
// Get the count of models currently being loaded to account for concurrent requests
pendingLoads := ml.GetLoadingCount()
ml.wd.EnforceLRULimit(pendingLoads)
// Get retry settings from ModelLoader
ml.mu.Lock()
maxRetries := ml.lruEvictionMaxRetries
retryInterval := ml.lruEvictionRetryInterval
ml.mu.Unlock()
for attempt := 0; attempt < maxRetries; attempt++ {
result := ml.wd.EnforceLRULimit(pendingLoads)
if !result.NeedMore {
// Successfully evicted enough models (or no eviction needed)
if result.EvictedCount > 0 {
xlog.Info("[ModelLoader] LRU enforcement complete", "evicted", result.EvictedCount)
}
return
}
// Need more evictions but models are busy - wait and retry
if attempt < maxRetries-1 {
xlog.Info("[ModelLoader] Waiting for busy models to become idle before eviction",
"evicted", result.EvictedCount,
"attempt", attempt+1,
"maxRetries", maxRetries,
"retryIn", retryInterval)
time.Sleep(retryInterval)
} else {
// Last attempt - log warning but proceed (might fail to load, but at least we tried)
xlog.Warn("[ModelLoader] LRU enforcement incomplete after max retries",
"evicted", result.EvictedCount,
"reason", "models are still busy with active API calls")
}
}
}
// updateModelLastUsed updates the last used time for a model (for LRU tracking)
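For illustration only, not part of the diff: a minimal sketch of how a caller might tune the wait-and-retry behavior of enforceLRULimit via the SetLRUEvictionRetrySettings setter introduced below (defaults are 30 retries at 1-second intervals). The import path and helper name are assumptions; only the method name and argument types come from this change.

package example

import (
	"time"

	model "github.com/mudler/LocalAI/pkg/model" // assumed import path for the ModelLoader package
)

// tuneEviction is a hypothetical helper: it widens the wait window to
// 50 retries at 2-second intervals before the loader gives up on eviction.
func tuneEviction(ml *model.ModelLoader) {
	ml.SetLRUEvictionRetrySettings(50, 2*time.Second)
}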

View File

@@ -20,22 +20,26 @@ import (
// TODO: Split ModelLoader and TemplateLoader? Just to keep things more organized. Left together to share a mutex until I look into that. Would split if we separate directories for .bin/.yaml and .tmpl
type ModelLoader struct {
ModelPath string
mu sync.Mutex
models map[string]*Model
loading map[string]chan struct{} // tracks models currently being loaded
wd *WatchDog
externalBackends map[string]string
ModelPath string
mu sync.Mutex
models map[string]*Model
loading map[string]chan struct{} // tracks models currently being loaded
wd *WatchDog
externalBackends map[string]string
lruEvictionMaxRetries int // Maximum number of retries when waiting for busy models
lruEvictionRetryInterval time.Duration // Interval between retries when waiting for busy models
}
// NewModelLoader creates a new ModelLoader instance.
// LRU eviction is now managed through the WatchDog component.
func NewModelLoader(system *system.SystemState) *ModelLoader {
nml := &ModelLoader{
ModelPath: system.Model.ModelsPath,
models: make(map[string]*Model),
loading: make(map[string]chan struct{}),
externalBackends: make(map[string]string),
ModelPath: system.Model.ModelsPath,
models: make(map[string]*Model),
loading: make(map[string]chan struct{}),
externalBackends: make(map[string]string),
lruEvictionMaxRetries: 30, // Default: 30 retries
lruEvictionRetryInterval: 1 * time.Second, // Default: 1 second
}
return nml
@@ -56,6 +60,14 @@ func (ml *ModelLoader) GetWatchDog() *WatchDog {
return ml.wd
}
// SetLRUEvictionRetrySettings updates the LRU eviction retry settings
func (ml *ModelLoader) SetLRUEvictionRetrySettings(maxRetries int, retryInterval time.Duration) {
ml.mu.Lock()
defer ml.mu.Unlock()
ml.lruEvictionMaxRetries = maxRetries
ml.lruEvictionRetryInterval = retryInterval
}
func (ml *ModelLoader) ExistsInModelPath(s string) bool {
return utils.ExistsInPath(ml.ModelPath, s)
}

View File

@@ -262,4 +262,13 @@ var _ = Describe("ModelLoader", func() {
Expect(modelLoader.GetLoadingCount()).To(Equal(0))
})
})
Context("LRU Eviction Retry Settings", func() {
It("should allow updating retry settings", func() {
modelLoader.SetLRUEvictionRetrySettings(50, 2*time.Second)
// Settings are updated - we can verify through behavior if needed
// For now, just verify the call doesn't panic
Expect(modelLoader).ToNot(BeNil())
})
})
})

View File

@@ -41,6 +41,9 @@ type WatchDog struct {
memoryReclaimerEnabled bool // Enable memory threshold monitoring
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
watchdogInterval time.Duration
// Eviction settings
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
}
type ProcessManager interface {
@@ -78,6 +81,7 @@ func NewWatchDog(opts ...WatchDogOption) *WatchDog {
memoryReclaimerEnabled: o.memoryReclaimerEnabled,
memoryReclaimerThreshold: o.memoryReclaimerThreshold,
watchdogInterval: o.watchdogInterval,
forceEvictionWhenBusy: o.forceEvictionWhenBusy,
}
}
@@ -110,6 +114,13 @@ func (wd *WatchDog) GetMemoryReclaimerSettings() (enabled bool, threshold float6
return wd.memoryReclaimerEnabled, wd.memoryReclaimerThreshold
}
// SetForceEvictionWhenBusy updates the force eviction when busy setting dynamically
func (wd *WatchDog) SetForceEvictionWhenBusy(force bool) {
wd.Lock()
defer wd.Unlock()
wd.forceEvictionWhenBusy = force
}
func (wd *WatchDog) Shutdown() {
wd.Lock()
defer wd.Unlock()
@@ -169,13 +180,19 @@ type modelUsageInfo struct {
lastUsed time.Time
}
// EnforceLRULimitResult contains the result of LRU enforcement
type EnforceLRULimitResult struct {
EvictedCount int // Number of models successfully evicted
NeedMore bool // True if more evictions are needed but couldn't be done (e.g., all models are busy)
}
// EnforceLRULimit ensures we're under the LRU limit by evicting least recently used models.
// This should be called before loading a new model.
// pendingLoads is the number of models currently being loaded (to account for concurrent loads).
// Returns the number of models evicted.
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// Returns the result containing evicted count and whether more evictions are needed.
func (wd *WatchDog) EnforceLRULimit(pendingLoads int) EnforceLRULimitResult {
if wd.lruLimit <= 0 {
return 0 // LRU disabled
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false} // LRU disabled
}
wd.Lock()
@@ -186,9 +203,10 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// We need: currentCount + pendingLoads + 1 <= lruLimit
// So evict: currentCount + pendingLoads + 1 - lruLimit = currentCount - lruLimit + pendingLoads + 1
modelsToEvict := currentCount - wd.lruLimit + pendingLoads + 1
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
if modelsToEvict <= 0 {
wd.Unlock()
return 0
return EnforceLRULimitResult{EvictedCount: 0, NeedMore: false}
}
xlog.Debug("[WatchDog] LRU enforcement triggered", "current", currentCount, "pendingLoads", pendingLoads, "limit", wd.lruLimit, "toEvict", modelsToEvict)
@@ -215,13 +233,25 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
// Collect models to evict (the oldest ones)
var modelsToShutdown []string
for i := 0; i < modelsToEvict && i < len(models); i++ {
evictedCount := 0
skippedBusyCount := 0
for i := 0; evictedCount < modelsToEvict && i < len(models); i++ {
m := models[i]
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed)
// Check if model is busy
_, isBusy := wd.busyTime[m.address]
if isBusy && !forceEvictionWhenBusy {
// Skip eviction for busy models when forceEvictionWhenBusy is false
xlog.Warn("[WatchDog] Skipping LRU eviction for busy model", "model", m.model, "reason", "model has active API calls")
skippedBusyCount++
continue
}
xlog.Info("[WatchDog] LRU evicting model", "model", m.model, "lastUsed", m.lastUsed, "busy", isBusy)
modelsToShutdown = append(modelsToShutdown, m.model)
// Clean up the maps while we have the lock
wd.untrack(m.address)
evictedCount++
}
needMore := evictedCount < modelsToEvict && skippedBusyCount > 0
wd.Unlock()
// Now shutdown models without holding the watchdog lock to prevent deadlock
@@ -232,7 +262,14 @@ func (wd *WatchDog) EnforceLRULimit(pendingLoads int) int {
xlog.Debug("[WatchDog] LRU eviction complete", "model", model)
}
return len(modelsToShutdown)
if needMore {
xlog.Warn("[WatchDog] LRU eviction incomplete", "evicted", evictedCount, "needed", modelsToEvict, "skippedBusy", skippedBusyCount, "reason", "some models are busy with active API calls")
}
return EnforceLRULimitResult{
EvictedCount: len(modelsToShutdown),
NeedMore: needMore,
}
}
func (wd *WatchDog) Run() {
@@ -376,6 +413,8 @@ func (wd *WatchDog) evictLRUModel() {
return
}
forceEvictionWhenBusy := wd.forceEvictionWhenBusy
// Build a list of models sorted by last used time (oldest first)
var models []modelUsageInfo
for address, model := range wd.addressModelMap {
@@ -400,8 +439,27 @@ func (wd *WatchDog) evictLRUModel() {
return models[i].lastUsed.Before(models[j].lastUsed)
})
// Get the LRU model
lruModel := models[0]
// Find the first non-busy model (or first model if forceEvictionWhenBusy is true)
var lruModel *modelUsageInfo
for i := 0; i < len(models); i++ {
m := models[i]
_, isBusy := wd.busyTime[m.address]
if isBusy && !forceEvictionWhenBusy {
// Skip busy models when forceEvictionWhenBusy is false
xlog.Warn("[WatchDog] Skipping memory reclaimer eviction for busy model", "model", m.model, "reason", "model has active API calls")
continue
}
lruModel = &m
break
}
if lruModel == nil {
// All models are busy and forceEvictionWhenBusy is false
wd.Unlock()
xlog.Warn("[WatchDog] Memory reclaimer cannot evict: all models are busy with active API calls")
return
}
xlog.Info("[WatchDog] Memory reclaimer evicting LRU model", "model", lruModel.model, "lastUsed", lruModel.lastUsed)
// Untrack the model
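As a rough sketch of the new contract (not taken from the diff): with lruLimit=2, two loaded models, and one pending load, modelsToEvict = 2 - 2 + 1 + 1 = 2, matching the test comments further down; callers now inspect the returned struct instead of a plain count. The import path and helper name are assumptions.

package example

import (
	model "github.com/mudler/LocalAI/pkg/model" // assumed import path
)

// makeRoom is a hypothetical caller of the new API: it asks the watchdog to
// free space for the pending loads and reports whether enough idle models
// could be evicted.
func makeRoom(wd *model.WatchDog, pendingLoads int) (evicted int, ok bool) {
	res := wd.EnforceLRULimit(pendingLoads)
	// NeedMore means some evictions were skipped because models were busy
	// with active API calls; the ModelLoader's retry loop handles that case.
	return res.EvictedCount, !res.NeedMore
}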

View File

@@ -28,6 +28,9 @@ type WatchDogOptions struct {
// Memory reclaimer settings (works with GPU if available, otherwise RAM)
memoryReclaimerEnabled bool // Enable memory threshold monitoring
memoryReclaimerThreshold float64 // Threshold 0.0-1.0 (e.g., 0.95 = 95%)
// Eviction settings
forceEvictionWhenBusy bool // Force eviction even when models have active API calls (default: false for safety)
}
// WatchDogOption is a function that configures WatchDogOptions
@@ -105,6 +108,14 @@ func WithMemoryReclaimerThreshold(threshold float64) WatchDogOption {
}
}
// WithForceEvictionWhenBusy sets whether to force eviction even when models have active API calls
// Default: false (skip eviction when busy for safety)
func WithForceEvictionWhenBusy(force bool) WatchDogOption {
return func(o *WatchDogOptions) {
o.forceEvictionWhenBusy = force
}
}
// DefaultWatchDogOptions returns default options for the watchdog
func DefaultWatchDogOptions() *WatchDogOptions {
return &WatchDogOptions{
@@ -116,6 +127,7 @@ func DefaultWatchDogOptions() *WatchDogOptions {
lruLimit: 0,
memoryReclaimerEnabled: false,
memoryReclaimerThreshold: DefaultMemoryReclaimerThreshold,
forceEvictionWhenBusy: false, // Default: skip eviction when busy for safety
}
}
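A minimal construction sketch mirroring the options exercised in the tests below; the process-manager argument and the import path are placeholders, the option names come from this file.

package example

import (
	"time"

	model "github.com/mudler/LocalAI/pkg/model" // assumed import path
)

func newWatchDog(pm model.ProcessManager) *model.WatchDog {
	return model.NewWatchDog(
		model.WithProcessManager(pm),
		model.WithBusyTimeout(5*time.Minute),
		model.WithIdleTimeout(15*time.Minute),
		model.WithLRULimit(2),
		// Opt in to evicting models that still have in-flight API calls;
		// the default stays false, which skips busy models.
		model.WithForceEvictionWhenBusy(true),
	)
}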

View File

@@ -170,15 +170,18 @@ var _ = Describe("WatchDog", func() {
model.WithBusyTimeout(5*time.Minute),
model.WithIdleTimeout(15*time.Minute),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests to match old behavior
)
})
It("should not evict when under limit", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle (not busy)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(0))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
@@ -186,14 +189,17 @@ var _ = Describe("WatchDog", func() {
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
// Enforce LRU with limit of 2 (need to make room for 1 new model)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1")) // oldest should be evicted
})
@@ -201,19 +207,23 @@ var _ = Describe("WatchDog", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Unmark to make it idle
// Set limit to 1, should evict 2 oldest + 1 for new = 3 evictions
wd.SetLRULimit(1)
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(3))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(3))
Expect(result.NeedMore).To(BeFalse())
shutdowns := pm.getShutdownCalls()
Expect(shutdowns).To(ContainElement("model1"))
Expect(shutdowns).To(ContainElement("model2"))
@@ -224,15 +234,18 @@ var _ = Describe("WatchDog", func() {
// Add two models (at limit)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
// With 1 pending load, we need to evict 2 (current=2, pending=1, new=1, limit=2)
// total after = 2 + 1 + 1 = 4, need to evict 4 - 2 = 2
evicted := wd.EnforceLRULimit(1)
Expect(evicted).To(Equal(2))
result := wd.EnforceLRULimit(1)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse())
})
It("should not evict when LRU is disabled", func() {
@@ -242,8 +255,9 @@ var _ = Describe("WatchDog", func() {
wd.AddAddressModelMap("addr2", "model2")
wd.AddAddressModelMap("addr3", "model3")
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(0))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
@@ -253,10 +267,12 @@ var _ = Describe("WatchDog", func() {
// Add models with different lastUsed times
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
time.Sleep(20 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Unmark to make it idle
time.Sleep(20 * time.Millisecond)
// Touch model1 again to make it more recent
@@ -265,10 +281,12 @@ var _ = Describe("WatchDog", func() {
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Unmark to make it idle
// Now model2 is the oldest, should be evicted first
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(BeNumerically(">=", 1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(BeNumerically(">=", 1))
Expect(result.NeedMore).To(BeFalse())
shutdowns := pm.getShutdownCalls()
// model2 should be evicted first (it's the oldest)
@@ -285,16 +303,19 @@ var _ = Describe("WatchDog", func() {
model.WithBusyTimeout(5*time.Minute),
model.WithIdleTimeout(15*time.Minute),
model.WithLRULimit(1),
model.WithForceEvictionWhenBusy(true), // Enable force eviction for these tests
)
})
It("should evict existing model when loading new one", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Unmark to make it idle
// With limit=1, loading a new model should evict the existing one
evicted := wd.EnforceLRULimit(0)
Expect(evicted).To(Equal(1))
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
@@ -302,6 +323,7 @@ var _ = Describe("WatchDog", func() {
for i := 0; i < 5; i++ {
wd.AddAddressModelMap("addr", "model")
wd.Mark("addr")
wd.UnMark("addr") // Unmark to make it idle
wd.EnforceLRULimit(0)
}
// All previous models should have been evicted
@@ -309,6 +331,233 @@ var _ = Describe("WatchDog", func() {
})
})
Context("Force Eviction When Busy", func() {
BeforeEach(func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(false), // Default: skip eviction when busy
)
})
It("should skip eviction for busy models when forceEvictionWhenBusy is false", func() {
// Add two models (at limit of 2, need to evict 1 for new model)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
// Keep model1 as busy (simulating active API call)
// model1 is already marked as busy from the first Mark call
// Try to enforce LRU - should skip busy model1, evict model2
result := wd.EnforceLRULimit(0)
// Should evict model2 (not busy) but skip model1 (busy)
// Since we evicted 1 (which is what we needed), NeedMore should be false
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough, even though we skipped model1
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
Expect(pm.getShutdownCalls()).ToNot(ContainElement("model1"))
})
It("should evict busy models when forceEvictionWhenBusy is true", func() {
wd.SetForceEvictionWhenBusy(true)
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 as busy (already marked from first Mark call)
// Try to enforce LRU - should evict model1 even though busy
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeFalse())
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
It("should set NeedMore when all models are busy and forceEvictionWhenBusy is false", func() {
// Add two models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Mark both as busy
wd.Mark("addr1")
wd.Mark("addr2")
// Try to enforce LRU - should skip both busy models
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(0))
Expect(result.NeedMore).To(BeTrue())
Expect(pm.getShutdownCalls()).To(BeEmpty())
})
It("should allow updating forceEvictionWhenBusy dynamically", func() {
// Start with false
Expect(wd).ToNot(BeNil())
// Add models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
// Keep model1 busy (already marked)
// With forceEvictionWhenBusy=false, should skip busy model1, evict model2
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (1 model)
Expect(result.EvictedCount).To(Equal(1)) // Should evict model2 (not busy)
// Now enable force eviction
wd.SetForceEvictionWhenBusy(true)
// Add models again
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 busy (already marked)
// With forceEvictionWhenBusy=true, should evict busy model1
result = wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
})
It("should continue to next LRU model when busy model is skipped", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Make model3 idle
// Keep model1 as busy (oldest, already marked)
// Need to evict 2 models (limit=2, current=3, need room for 1 new)
// Should skip model1 (busy), evict model2 and model3 (not busy)
result := wd.EnforceLRULimit(0)
// Should evict model2 and model3 (2 models, which is what we needed)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
Expect(pm.getShutdownCalls()).To(ContainElement("model2"))
Expect(pm.getShutdownCalls()).To(ContainElement("model3"))
})
})
Context("EnforceLRULimitResult", func() {
BeforeEach(func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(false),
)
})
It("should return NeedMore=false when eviction is successful", func() {
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
wd.UnMark("addr1") // Make idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make idle
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
})
It("should return NeedMore=true when not enough models can be evicted", func() {
// Add two models (at limit of 2, need to evict 1 for new model)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Mark both as busy (keep them busy)
// Both are already marked as busy from the Mark calls above
// Need to evict 1, but both are busy
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeTrue())
Expect(result.EvictedCount).To(Equal(0))
})
It("should return NeedMore=true when need to evict multiple but some are busy", func() {
// Set limit to 1, add 3 models (need to evict 2 for new model)
wd.SetLRULimit(1)
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
// Keep model1 and model3 busy
// Need to evict 2 models, but model1 and model3 are busy, only model2 is idle
// Should evict model2 (1 model), but NeedMore=true because we needed 2
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(1))
Expect(result.NeedMore).To(BeTrue())
})
It("should return correct EvictedCount when some models are evicted", func() {
// Add three models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.UnMark("addr2") // Make model2 idle
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr3", "model3")
wd.Mark("addr3")
wd.UnMark("addr3") // Make model3 idle
// Keep model1 as busy (already marked)
// Need to evict 2 models, but model1 is busy
// Should evict model2 and model3 (2 models, which is what we needed)
result := wd.EnforceLRULimit(0)
Expect(result.EvictedCount).To(Equal(2))
Expect(result.NeedMore).To(BeFalse()) // We evicted enough (2 models)
})
})
Context("Functional Options", func() {
It("should use default options when none provided", func() {
wd = model.NewWatchDog(
@@ -331,6 +580,7 @@ var _ = Describe("WatchDog", func() {
model.WithLRULimit(5),
model.WithMemoryReclaimerEnabled(true),
model.WithMemoryReclaimerThreshold(0.80),
model.WithForceEvictionWhenBusy(true),
)
Expect(wd.GetLRULimit()).To(Equal(5))
@@ -339,5 +589,48 @@ var _ = Describe("WatchDog", func() {
Expect(enabled).To(BeTrue())
Expect(threshold).To(Equal(0.80))
})
It("should use default forceEvictionWhenBusy (false) when not specified", func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
)
// Default should be false - we can test this by checking behavior
// Add a busy model and verify it's skipped
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
wd.Mark("addr1") // Keep model1 busy
wd.SetLRULimit(1)
result := wd.EnforceLRULimit(0)
// Should skip busy model1, evict model2, but NeedMore=true
Expect(result.NeedMore).To(BeTrue())
})
It("should allow setting forceEvictionWhenBusy via option", func() {
wd = model.NewWatchDog(
model.WithProcessManager(pm),
model.WithLRULimit(2),
model.WithForceEvictionWhenBusy(true),
)
// Add models
wd.AddAddressModelMap("addr1", "model1")
wd.Mark("addr1")
time.Sleep(10 * time.Millisecond)
wd.AddAddressModelMap("addr2", "model2")
wd.Mark("addr2")
// Keep model1 busy (already marked from first Mark call)
// Should evict busy model1
result := wd.EnforceLRULimit(0)
Expect(result.NeedMore).To(BeFalse())
Expect(result.EvictedCount).To(Equal(1))
Expect(pm.getShutdownCalls()).To(ContainElement("model1"))
})
})
})

View File

@@ -354,8 +354,8 @@ func getAMDGPUMemory() []GPUMemoryInfo {
}
// Parse memory values (in bytes or MB depending on rocm-smi version)
usedBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[1]), 10, 64)
totalBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[2]), 10, 64)
usedBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[2]), 10, 64)
totalBytes, _ := strconv.ParseUint(strings.TrimSpace(parts[1]), 10, 64)
// If values seem like MB, convert to bytes
if totalBytes < 1000000 {
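For context, a standalone sketch of the corrected parsing order. It assumes a CSV row shaped like device,total,used, which is what swapping parts[1] and parts[2] implies; it is not taken verbatim from the rocm-smi output format.

package example

import (
	"strconv"
	"strings"
)

// parseAMDMemoryRow parses one rocm-smi CSV row, assuming the column order
// device,total,used that the corrected indices imply.
func parseAMDMemoryRow(row string) (usedBytes, totalBytes uint64) {
	parts := strings.Split(row, ",")
	if len(parts) < 3 {
		return 0, 0
	}
	totalBytes, _ = strconv.ParseUint(strings.TrimSpace(parts[1]), 10, 64)
	usedBytes, _ = strconv.ParseUint(strings.TrimSpace(parts[2]), 10, 64)
	// Values that look too small to be byte counts are treated as MB by the
	// caller and converted to bytes.
	return usedBytes, totalBytes
}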

View File

@@ -1889,6 +1889,17 @@ const docTemplate = `{
}
}
},
"schema.InputTokensDetails": {
"type": "object",
"properties": {
"image_tokens": {
"type": "integer"
},
"text_tokens": {
"type": "integer"
}
}
},
"schema.Item": {
"type": "object",
"properties": {
@@ -2435,10 +2446,6 @@ const docTemplate = `{
"type": "string"
}
},
"mode": {
"description": "Image (not supported by OpenAI)",
"type": "integer"
},
"model": {
"type": "string"
},
@@ -2465,6 +2472,7 @@ const docTemplate = `{
"description": "Prompt is read only by completion/image API calls"
},
"quality": {
"description": "Image (not supported by OpenAI)",
"type": "string"
},
"reasoning_effort": {
@@ -2581,6 +2589,16 @@ const docTemplate = `{
"completion_tokens": {
"type": "integer"
},
"input_tokens": {
"description": "Fields for image generation API compatibility",
"type": "integer"
},
"input_tokens_details": {
"$ref": "#/definitions/schema.InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"prompt_tokens": {
"type": "integer"
},
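The generated definitions above correspond to schema structs along these lines, a sketch inferred from the swagger output; the Go field and type names (other than InputTokensDetails) and the JSON tags' exact placement are assumptions, only the JSON keys come from the document.

package schema

// InputTokensDetails mirrors the schema.InputTokensDetails definition above.
type InputTokensDetails struct {
	ImageTokens int `json:"image_tokens"`
	TextTokens  int `json:"text_tokens"`
}

// usageExtras groups the usage fields added for image generation API
// compatibility; the struct name is illustrative only.
type usageExtras struct {
	InputTokens        int                `json:"input_tokens"`
	InputTokensDetails InputTokensDetails `json:"input_tokens_details"`
	OutputTokens       int                `json:"output_tokens"`
}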

View File

@@ -1882,6 +1882,17 @@
}
}
},
"schema.InputTokensDetails": {
"type": "object",
"properties": {
"image_tokens": {
"type": "integer"
},
"text_tokens": {
"type": "integer"
}
}
},
"schema.Item": {
"type": "object",
"properties": {
@@ -2428,10 +2439,6 @@
"type": "string"
}
},
"mode": {
"description": "Image (not supported by OpenAI)",
"type": "integer"
},
"model": {
"type": "string"
},
@@ -2458,6 +2465,7 @@
"description": "Prompt is read only by completion/image API calls"
},
"quality": {
"description": "Image (not supported by OpenAI)",
"type": "string"
},
"reasoning_effort": {
@@ -2574,6 +2582,16 @@
"completion_tokens": {
"type": "integer"
},
"input_tokens": {
"description": "Fields for image generation API compatibility",
"type": "integer"
},
"input_tokens_details": {
"$ref": "#/definitions/schema.InputTokensDetails"
},
"output_tokens": {
"type": "integer"
},
"prompt_tokens": {
"type": "integer"
},

View File

@@ -320,6 +320,13 @@ definitions:
uuid:
type: string
type: object
schema.InputTokensDetails:
properties:
image_tokens:
type: integer
text_tokens:
type: integer
type: object
schema.Item:
properties:
b64_json:
@@ -698,9 +705,6 @@ definitions:
additionalProperties:
type: string
type: object
mode:
description: Image (not supported by OpenAI)
type: integer
model:
type: string
model_base_name:
@@ -720,6 +724,7 @@ definitions:
prompt:
description: Prompt is read only by completion/image API calls
quality:
description: Image (not supported by OpenAI)
type: string
reasoning_effort:
type: string
@@ -802,6 +807,13 @@ definitions:
properties:
completion_tokens:
type: integer
input_tokens:
description: Fields for image generation API compatibility
type: integer
input_tokens_details:
$ref: '#/definitions/schema.InputTokensDetails'
output_tokens:
type: integer
prompt_tokens:
type: integer
timing_prompt_processing: