feat(gallery): add locate-anything-3b model to the gallery index

Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
test(backend): locate-anything-cpp Load+Detect wire test
2026-06-12 02:38:19 -04:00 · 2026-06-11 23:04:43 +00:00 · 2026-06-11 23:04:43 +00:00 · 2026-06-11 23:04:43 +00:00 · 2026-06-11 23:04:43 +00:00 · 2026-06-11 23:04:43 +00:00
59 changed files with 3275 additions and 93 deletions
--- a/.github/backend-matrix.yml
+++ b/.github/backend-matrix.yml
@@ -703,6 +703,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "8"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-12-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "12"
    cuda-minor-version: "8"
@@ -1543,6 +1556,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-nvidia-cuda-13-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -1569,6 +1595,19 @@ include:
    backend: "rfdetr-cpp"
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
+  - build-type: 'cublas'
+    cuda-major-version: "13"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-cuda-13-arm64-locate-anything-cpp'
+    base-image: "ubuntu:24.04"
+    ubuntu-version: '2404'
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
  - build-type: 'cublas'
    cuda-major-version: "13"
    cuda-minor-version: "0"
@@ -2806,6 +2845,74 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2404'
+  # locate-anything-cpp
+  - build-type: ''
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-cpu-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f32'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f32-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'sycl_f16'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-intel-sycl-f16-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "intel/oneapi-basekit:2025.3.0-0-devel-ubuntu24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/amd64'
+    platform-tag: 'amd64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-locate-anything-cpp'
+    runs-on: 'ubuntu-latest'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
+  - build-type: 'vulkan'
+    cuda-major-version: ""
+    cuda-minor-version: ""
+    platforms: 'linux/arm64'
+    platform-tag: 'arm64'
+    tag-latest: 'auto'
+    tag-suffix: '-gpu-vulkan-locate-anything-cpp'
+    runs-on: 'ubuntu-24.04-arm'
+    base-image: "ubuntu:24.04"
+    skip-drivers: 'false'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2404'
  - build-type: 'sycl_f32'
    cuda-major-version: ""
    cuda-minor-version: ""
@@ -2899,6 +3006,19 @@ include:
    dockerfile: "./backend/Dockerfile.golang"
    context: "./"
    ubuntu-version: '2204'
+  - build-type: 'cublas'
+    cuda-major-version: "12"
+    cuda-minor-version: "0"
+    platforms: 'linux/arm64'
+    skip-drivers: 'false'
+    tag-latest: 'auto'
+    tag-suffix: '-nvidia-l4t-arm64-locate-anything-cpp'
+    base-image: "nvcr.io/nvidia/l4t-jetpack:r36.4.0"
+    runs-on: 'ubuntu-24.04-arm'
+    backend: "locate-anything-cpp"
+    dockerfile: "./backend/Dockerfile.golang"
+    context: "./"
+    ubuntu-version: '2204'
  # whisper
  - build-type: ''
    cuda-major-version: ""
--- a/Makefile
+++ b/Makefile
@@ -180,7 +180,7 @@ osx-signed: build

 ## Run
 run: ## run local-ai
-	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./
+	CGO_LDFLAGS="$(CGO_LDFLAGS)" $(GOCMD) run ./cmd/local-ai

 prepare-test: protogen-go build-mock-backend

@@ -566,6 +566,7 @@ prepare-test-extra: protogen-python
 	$(MAKE) -C backend/python/speaker-recognition
 	$(MAKE) -C backend/rust/kokoros kokoros-grpc
 	$(MAKE) -C backend/go/rfdetr-cpp
+	$(MAKE) -C backend/go/locate-anything-cpp

 test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/transformers test
@@ -593,6 +594,7 @@ test-extra: prepare-test-extra
 	$(MAKE) -C backend/python/speaker-recognition test
 	$(MAKE) -C backend/rust/kokoros test
 	$(MAKE) -C backend/go/rfdetr-cpp test
+	$(MAKE) -C backend/go/locate-anything-cpp test

 ##
 ## End-to-end gRPC tests that exercise a built backend container image.
--- a/README.md
+++ b/README.md
@@ -149,6 +149,16 @@ local-ai run https://gist.githubusercontent.com/.../phi-2.yaml
 local-ai run oci://localai/phi-2:latest
 ```

+To test a running LocalAI server from the terminal, open an interactive chat session from another shell. Inside the prompt, `/models` lists installed models and `/model <name>` switches between them.
+
+```bash
+# Terminal 1
+local-ai run llama-3.2-1b-instruct:q4_k_m
+
+# Terminal 2
+local-ai chat --model llama-3.2-1b-instruct:q4_k_m
+```
+
 > **Automatic Backend Detection**: LocalAI automatically detects your GPU capabilities and downloads the appropriate backend. For advanced options, see [GPU Acceleration](https://localai.io/features/gpu-acceleration/).

 For more details, see the [Getting Started guide](https://localai.io/basics/getting_started/).
--- a/backend/cpp/ds4/Makefile
+++ b/backend/cpp/ds4/Makefile
@@ -1,10 +1,10 @@
 # ds4 backend Makefile.
 #
-# Upstream pin lives below as DS4_VERSION?=c463029c205c2ec8d7ab6c0df4a3f52979091286
+# Upstream pin lives below as DS4_VERSION?=91bafb5acd5a6cf00b1e55ef68bf40ddd207bee7
 # (.github/bump_deps.sh) can find and update it - matches the
 # llama-cpp / ik-llama-cpp / turboquant convention.

-DS4_VERSION?=c463029c205c2ec8d7ab6c0df4a3f52979091286
+DS4_VERSION?=91bafb5acd5a6cf00b1e55ef68bf40ddd207bee7
 DS4_REPO?=https://github.com/antirez/ds4

 CURRENT_MAKEFILE_DIR := $(dir $(abspath $(lastword $(MAKEFILE_LIST))))
--- a/backend/cpp/ik-llama-cpp/Makefile
+++ b/backend/cpp/ik-llama-cpp/Makefile
@@ -1,5 +1,5 @@

-IK_LLAMA_VERSION?=6b9de3dbaa21ae95ea80638e5ee836795cc48c93
+IK_LLAMA_VERSION?=e6f8112f3ba126eed3ff5b30cdd08085414a7516
 LLAMA_REPO?=https://github.com/ikawrakow/ik_llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/Makefile
+++ b/backend/cpp/llama-cpp/Makefile
@@ -1,5 +1,5 @@

-LLAMA_VERSION?=9e3b928fd8c9d14dbf15a8768b9fdd7e5c721d66
+LLAMA_VERSION?=039e20a2db9e87b2477c76cc04905f3e1acad77f
 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp

 CMAKE_ARGS?=
--- a/backend/cpp/llama-cpp/grpc-server.cpp
+++ b/backend/cpp/llama-cpp/grpc-server.cpp
@@ -381,6 +381,15 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const
            });
    }

+    // for each video in the request, add the video data
+    for (int i = 0; i < predict->videos_size(); i++) {
+        data["video_data"].push_back(json
+            {
+                {"id", i},
+                {"data",    predict->videos(i)},
+            });
+    }
+
    data["stop"] = predict->stopprompts();
    // data["n_probs"] = predict->nprobs();
    //TODO: images,
@@ -1503,7 +1512,7 @@ public:
                    msg_json["role"] = msg.role();

                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);

                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
@@ -1554,6 +1563,16 @@ public:
                                    content_array.push_back(audio_chunk);
                                }
                            }
+                            if (request->videos_size() > 0) {
+                                for (int j = 0; j < request->videos_size(); j++) {
+                                    json video_chunk;
+                                    video_chunk["type"] = "input_video";
+                                    json input_video;
+                                    input_video["data"] = request->videos(j);
+                                    video_chunk["input_video"] = input_video;
+                                    content_array.push_back(video_chunk);
+                                }
+                            }
                            msg_json["content"] = content_array;
                        } else {
                            // Use content as-is (already array or not last user message)
@@ -1588,6 +1607,16 @@ public:
                                content_array.push_back(audio_chunk);
                            }
                        }
+                        if (request->videos_size() > 0) {
+                            for (int j = 0; j < request->videos_size(); j++) {
+                                json video_chunk;
+                                video_chunk["type"] = "input_video";
+                                json input_video;
+                                input_video["data"] = request->videos(j);
+                                video_chunk["input_video"] = input_video;
+                                content_array.push_back(video_chunk);
+                            }
+                        }
                        msg_json["content"] = content_array;
                    } else if (msg.role() == "tool") {
                        // Tool role messages must have content field set, even if empty
@@ -2039,6 +2068,16 @@ public:
                        files.push_back(decoded_data);
                    }
                }
+
+                const auto &video_data = data.find("video_data");
+                if (video_data != data.end() && video_data->is_array())
+                {
+                    for (const auto &video : *video_data)
+                    {
+                        auto decoded_data = base64_decode(video["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
+                }
            }

            const bool has_mtmd = ctx_server.impl->mctx != nullptr;
@@ -2291,7 +2330,7 @@ public:
                    }

                    bool is_last_user_msg = (i == last_user_msg_idx);
-                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0);
+                    bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);

                    // Handle content - can be string, null, or array
                    // For multimodal content, we'll embed images/audio from separate fields
@@ -2344,6 +2383,16 @@ public:
                                    content_array.push_back(audio_chunk);
                                }
                            }
+                            if (request->videos_size() > 0) {
+                                for (int j = 0; j < request->videos_size(); j++) {
+                                    json video_chunk;
+                                    video_chunk["type"] = "input_video";
+                                    json input_video;
+                                    input_video["data"] = request->videos(j);
+                                    video_chunk["input_video"] = input_video;
+                                    content_array.push_back(video_chunk);
+                                }
+                            }
                            msg_json["content"] = content_array;
                        } else {
                            // Use content as-is (already array or not last user message)
@@ -2383,6 +2432,16 @@ public:
                                content_array.push_back(audio_chunk);
                            }
                        }
+                        if (request->videos_size() > 0) {
+                            for (int j = 0; j < request->videos_size(); j++) {
+                                json video_chunk;
+                                video_chunk["type"] = "input_video";
+                                json input_video;
+                                input_video["data"] = request->videos(j);
+                                video_chunk["input_video"] = input_video;
+                                content_array.push_back(video_chunk);
+                            }
+                        }
                        msg_json["content"] = content_array;
                        SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
                    } else if (!msg.tool_calls().empty()) {
@@ -2845,6 +2904,16 @@ public:
                        files.push_back(decoded_data);
                    }
                }
+
+                const auto &video_data = data.find("video_data");
+                if (video_data != data.end() && video_data->is_array())
+                {
+                    for (const auto &video : *video_data)
+                    {
+                        auto decoded_data = base64_decode(video["data"].get<std::string>());
+                        files.push_back(decoded_data);
+                    }
+                }
            }

            // process files
--- a/backend/go/crispasr/Makefile
+++ b/backend/go/crispasr/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # CrispASR version (release tag)
 CRISPASR_REPO?=https://github.com/CrispStrobe/CrispASR
-CRISPASR_VERSION?=f7838a306687f22c281d29c250f879a4ab3df2d7
+CRISPASR_VERSION?=c29f6653a516a3001d923944dad8892072cc7334
 SO_TARGET?=libgocrispasr.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/go/locate-anything-cpp/.gitignore
+++ b/backend/go/locate-anything-cpp/.gitignore
@@ -0,0 +1,7 @@
+sources/
+build*/
+package/
+liblocateanythingcpp*.so
+locate-anything-cpp
+test-models/
+test-data/
--- a/backend/go/locate-anything-cpp/CMakeLists.txt
+++ b/backend/go/locate-anything-cpp/CMakeLists.txt
@@ -0,0 +1,57 @@
+cmake_minimum_required(VERSION 3.18)
+project(liblocateanythingcpp LANGUAGES C CXX)
+
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+
+# Static-link ggml + locate_anything so the resulting .so has no runtime
+# dependency on extra ggml/locate_anything shared libraries — only on
+# libc/libstdc++/libgomp, which the LocalAI package step bundles into the
+# docker image.
+set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build static libraries" FORCE)
+
+# locate-anything.cpp build switches: skip CLI/tests, keep static lib.
+set(LA_BUILD_CLI OFF CACHE BOOL "Disable locate-anything CLI" FORCE)
+set(LA_BUILD_TESTS OFF CACHE BOOL "Disable locate-anything tests" FORCE)
+set(LA_SHARED OFF CACHE BOOL "Build locate_anything as static lib" FORCE)
+
+# Unlike rt-detr.cpp, locate-anything.cpp ships no in-tree ggml patches, so
+# there is no apply_ggml_patches.sh hook to shim here.
+add_subdirectory(./sources/locate-anything.cpp)
+
+# locate-anything.cpp's top-level CMakeLists points its own target's include
+# dirs at ${CMAKE_SOURCE_DIR}/{include,src,third_party,...}. CMAKE_SOURCE_DIR
+# is the *top-level* source dir of the whole CMake tree, so when we pull it in
+# via add_subdirectory it resolves to OUR directory, not theirs, and the
+# locate_anything target fails to find its own headers (la_capi.h, stb_image.h,
+# la_gguf_keys.h). Re-add the correct, subdir-relative include paths to the
+# already-defined target so it compiles regardless of where it's nested.
+set(LA_SRC ${CMAKE_CURRENT_SOURCE_DIR}/sources/locate-anything.cpp)
+target_include_directories(locate_anything PRIVATE
+    ${LA_SRC}/include
+    ${LA_SRC}/src
+    ${LA_SRC}/third_party
+    ${LA_SRC}/third_party/stb)
+
+# locate-anything.cpp's C-API symbols already live inside liblocate_anything
+# (src/la_capi.cpp is compiled into the lib). We re-export them via a MODULE
+# library that links locate_anything so the symbols are visible at dlopen time.
+add_library(locateanythingcpp MODULE
+    sources/locate-anything.cpp/src/la_capi.cpp)
+
+target_include_directories(locateanythingcpp PRIVATE
+    sources/locate-anything.cpp/include
+    sources/locate-anything.cpp/src
+    sources/locate-anything.cpp/third_party
+    sources/locate-anything.cpp/third_party/stb
+)
+
+target_link_libraries(locateanythingcpp PRIVATE locate_anything ggml)
+
+if(CMAKE_CXX_COMPILER_ID MATCHES "GNU" AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0)
+    target_link_libraries(locateanythingcpp PRIVATE stdc++fs)
+endif()
+
+set_property(TARGET locateanythingcpp PROPERTY CXX_STANDARD 17)
+set_target_properties(locateanythingcpp PROPERTIES LIBRARY_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR})
--- a/backend/go/locate-anything-cpp/Makefile
+++ b/backend/go/locate-anything-cpp/Makefile
@@ -0,0 +1,134 @@
+CMAKE_ARGS?=
+BUILD_TYPE?=
+NATIVE?=false
+
+GOCMD?=go
+GO_TAGS?=
+JOBS?=$(shell nproc --ignore=1)
+
+# locate-anything.cpp. Pin to a specific commit for a stable build; leaving
+# this on `master` always picks up the latest C-API surface (incl. the
+# per-detection accessor functions used by golocateanythingcpp.go).
+LOCATEANYTHING_REPO?=https://github.com/mudler/locate-anything.cpp.git
+LOCATEANYTHING_VERSION?=60e450945476d5e97e0754a8c0e71a9ea81690e0
+
+ifeq ($(NATIVE),false)
+	CMAKE_ARGS+=-DGGML_NATIVE=OFF
+endif
+
+# Forward LocalAI's BUILD_TYPE to the matching ggml backend switch.
+ifeq ($(BUILD_TYPE),cublas)
+	CMAKE_ARGS+=-DGGML_CUDA=ON -DLA_GGML_CUDA=ON
+else ifeq ($(BUILD_TYPE),openblas)
+	CMAKE_ARGS+=-DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS
+else ifeq ($(BUILD_TYPE),clblas)
+	CMAKE_ARGS+=-DGGML_CLBLAST=ON
+else ifeq ($(BUILD_TYPE),hipblas)
+	ROCM_HOME ?= /opt/rocm
+	ROCM_PATH ?= /opt/rocm
+	export CXX=$(ROCM_HOME)/llvm/bin/clang++
+	export CC=$(ROCM_HOME)/llvm/bin/clang
+	AMDGPU_TARGETS?=gfx908,gfx90a,gfx942,gfx950,gfx1030,gfx1100,gfx1101,gfx1102,gfx1200,gfx1201
+	CMAKE_ARGS+=-DGGML_HIPBLAS=ON -DAMDGPU_TARGETS=$(AMDGPU_TARGETS)
+else ifeq ($(BUILD_TYPE),vulkan)
+	CMAKE_ARGS+=-DGGML_VULKAN=ON -DLA_GGML_VULKAN=ON
+else ifeq ($(OS),Darwin)
+	ifneq ($(BUILD_TYPE),metal)
+		CMAKE_ARGS+=-DGGML_METAL=OFF
+	else
+		CMAKE_ARGS+=-DGGML_METAL=ON
+		CMAKE_ARGS+=-DGGML_METAL_EMBED_LIBRARY=ON
+		CMAKE_ARGS+=-DLA_GGML_METAL=ON
+	endif
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f16)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx \
+		-DGGML_SYCL_F16=ON
+endif
+
+ifeq ($(BUILD_TYPE),sycl_f32)
+	CMAKE_ARGS+=-DGGML_SYCL=ON \
+		-DCMAKE_C_COMPILER=icx \
+		-DCMAKE_CXX_COMPILER=icpx
+endif
+
+sources/locate-anything.cpp:
+	mkdir -p sources && \
+	git clone --recursive $(LOCATEANYTHING_REPO) sources/locate-anything.cpp && \
+	cd sources/locate-anything.cpp && \
+	git checkout $(LOCATEANYTHING_VERSION) && \
+	git submodule update --init --recursive --depth 1 --single-branch
+
+# Detect OS
+UNAME_S := $(shell uname -s)
+
+# Only build CPU variants on Linux
+ifeq ($(UNAME_S),Linux)
+	VARIANT_TARGETS = liblocateanythingcpp-avx.so liblocateanythingcpp-avx2.so liblocateanythingcpp-avx512.so liblocateanythingcpp-fallback.so
+else
+	# On non-Linux (e.g., Darwin), build only fallback variant
+	VARIANT_TARGETS = liblocateanythingcpp-fallback.so
+endif
+
+locate-anything-cpp: main.go golocateanythingcpp.go $(VARIANT_TARGETS)
+	CGO_ENABLED=0 $(GOCMD) build -tags "$(GO_TAGS)" -o locate-anything-cpp ./
+
+# Command-style targets. `package` in particular shares its name with an
+# output path that `clean` removes, so without .PHONY make could skip it.
+.PHONY: package build clean purge all test liblocateanythingcpp-custom
+package: locate-anything-cpp
+	bash package.sh
+build: package
+clean: purge
+	rm -rf liblocateanythingcpp*.so locate-anything-cpp package sources
+purge:
+	rm -rf build*
+
+# Build all variants (Linux only)
+ifeq ($(UNAME_S),Linux)
+liblocateanythingcpp-avx.so: sources/locate-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I locate-anything-cpp build info:avx${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
+	rm -rfv build-$@
+
+liblocateanythingcpp-avx2.so: sources/locate-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I locate-anything-cpp build info:avx2${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=off -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) liblocateanythingcpp-custom
+	rm -rfv build-$@
+
+liblocateanythingcpp-avx512.so: sources/locate-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I locate-anything-cpp build info:avx512${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=on -DGGML_AVX2=on -DGGML_AVX512=on -DGGML_FMA=on -DGGML_F16C=on -DGGML_BMI2=on" $(MAKE) liblocateanythingcpp-custom
+	rm -rfv build-$@
+endif
+
+# Build fallback variant (all platforms)
+liblocateanythingcpp-fallback.so: sources/locate-anything.cpp
+	rm -rfv build-$@
+	$(info ${GREEN}I locate-anything-cpp build info:fallback${RESET})
+	SO_TARGET=$@ CMAKE_ARGS="$(CMAKE_ARGS) -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_BMI2=off" $(MAKE) liblocateanythingcpp-custom
+	rm -rfv build-$@
+
+liblocateanythingcpp-custom: CMakeLists.txt
+	mkdir -p build-$(SO_TARGET) && \
+	cd build-$(SO_TARGET) && \
+	cmake .. $(CMAKE_ARGS) && \
+	cmake --build . --config Release -j$(JOBS) && \
+	cd .. && \
+	mv build-$(SO_TARGET)/liblocateanythingcpp.so ./$(SO_TARGET)
+
+all: locate-anything-cpp package
+
+# `test` is invoked by the top-level Makefile's `test-extra` target. It builds
+# the backend binary + the fallback shared library (needed for dlopen at
+# runtime), then runs test.sh which downloads the q8_0 GGUF + COCO image and
+# exercises the gRPC Load/Detect wire path via the Go smoke test in
+# main_test.go.
+test: locate-anything-cpp liblocateanythingcpp-fallback.so
+	bash test.sh
--- a/backend/go/locate-anything-cpp/golocateanythingcpp.go
+++ b/backend/go/locate-anything-cpp/golocateanythingcpp.go
@@ -0,0 +1,174 @@
+package main
+
+// golocateanythingcpp.go - gRPC handlers (Load, Detect) for the
+// locate-anything-cpp backend.
+//
+// Embeds base.SingleThread to default unimplemented RPCs to "not supported"
+// while we only implement open-vocabulary object detection (Detect).
+
+import (
+	"encoding/base64"
+	"fmt"
+	"os"
+	"path/filepath"
+	"unsafe"
+
+	"github.com/mudler/LocalAI/pkg/grpc/base"
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+)
+
+// la_ctx* is an opaque handle. la_capi_load returns it directly (0 == failure),
+// unlike rfdetr's out-parameter convention.
+var (
+	// la_capi_load(const char* gguf_path, int n_threads) -> la_ctx* (0 = fail)
+	CapiLoad func(gguf string, nThreads int32) uintptr
+	// la_capi_free(la_ctx* ctx)
+	CapiFree func(handle uintptr)
+	// la_capi_locate_path(ctx, image_path, prompt, mode) -> char* json (0 = err)
+	CapiLocatePath func(handle uintptr, imagePath string, prompt string, mode int32) uintptr
+	// la_capi_locate_buffer(ctx, bytes, len, prompt, mode) -> char* json (0 = err)
+	CapiLocateBuffer func(handle uintptr, bytes uintptr, length uintptr, prompt string, mode int32) uintptr
+	// la_capi_get_n_detections(ctx) -> int
+	CapiGetNDetections func(handle uintptr) int32
+	// la_capi_get_detection_box(ctx, i, out_xyxy[4]) -> int (0 on success)
+	CapiGetDetectionBox func(handle uintptr, i int32, outXYXY uintptr) int32
+	// la_capi_get_detection_label(ctx, i, buf, buf_size) -> int (required size incl NUL; two-call sizing)
+	CapiGetDetectionLabel func(handle uintptr, i int32, buf uintptr, bufSize int32) int32
+	// la_capi_free_string(char* s)
+	CapiFreeString func(s uintptr)
+	// la_capi_last_error(ctx) -> const char* (owned by ctx, "" if none / null ctx).
+	// purego marshals the returned C string into a Go string (a copy), so we
+	// never free it and avoid raw pointer arithmetic.
+	CapiLastError func(handle uintptr) string
+)
+
+type LocateAnythingCpp struct {
+	base.SingleThread
+	handle uintptr // la_ctx* returned by la_capi_load; 0 means no model is loaded
+}
+
+// Load resolves the GGUF model path from opts (ModelFile, falling back to Model;
+// relative paths are joined onto opts.ModelPath), frees any model loaded by a
+// previous call, and keeps the new la_ctx handle for later Detect calls.
+func (r *LocateAnythingCpp) Load(opts *pb.ModelOptions) error {
+	modelFile := opts.ModelFile
+	if modelFile == "" {
+		modelFile = opts.Model
+	}
+	if modelFile == "" {
+		return fmt.Errorf("locate-anything-cpp: ModelFile is empty")
+	}
+
+	modelPath := modelFile
+	if !filepath.IsAbs(modelPath) {
+		modelPath = filepath.Join(opts.ModelPath, modelPath)
+	}
+	if _, statErr := os.Stat(modelPath); statErr != nil {
+		return fmt.Errorf("locate-anything-cpp: model file not found: %s: %w", modelPath, statErr)
+	}
+
+	// Non-positive thread counts fall back to 4.
+	nThreads := opts.Threads
+	if nThreads <= 0 {
+		nThreads = 4
+	}
+
+	// A repeated Load drops the previously loaded model first.
+	if r.handle != 0 {
+		CapiFree(r.handle)
+		r.handle = 0
+	}
+
+	// r.handle is 0 at this point, so storing the result directly leaves it 0
+	// when the load fails.
+	r.handle = CapiLoad(modelPath, nThreads)
+	if r.handle != 0 {
+		return nil
+	}
+
+	// la_capi_last_error wants a ctx and there is none after a failed load, so
+	// the message is best-effort; include it only when non-empty.
+	if msg := CapiLastError(0); msg != "" {
+		return fmt.Errorf("locate-anything-cpp: la_capi_load failed for %s: %s", modelPath, msg)
+	}
+	return fmt.Errorf("locate-anything-cpp: la_capi_load failed for %s", modelPath)
+}
+
+// Detect runs open-vocabulary detection on the base64-encoded image in opts.Src
+// using the required text prompt in opts.Prompt, returning one pb.Detection per
+// located object with its predicted label as ClassName. It fails when no model
+// is loaded, the prompt is empty, the image is unusable, or the locate call errs.
+func (r *LocateAnythingCpp) Detect(opts *pb.DetectOptions) (pb.DetectResponse, error) {
+	if r.handle == 0 {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: model not loaded")
+	}
+
+	// Open-vocabulary detection is prompt-driven; without a prompt there is
+	// nothing to locate.
+	prompt := opts.Prompt
+	if prompt == "" {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: a text prompt is required (open-vocabulary detection)")
+	}
+
+	// Decode base64 image and write to temp file.
+	imgData, err := base64.StdEncoding.DecodeString(opts.Src)
+	if err != nil {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to decode base64 image: %w", err)
+	}
+
+	tmpFile, err := os.CreateTemp("", "locate-anything-*.img")
+	if err != nil {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to create temp file: %w", err)
+	}
+	defer func() { _ = os.Remove(tmpFile.Name()) }()
+
+	if _, err := tmpFile.Write(imgData); err != nil {
+		_ = tmpFile.Close()
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to write temp file: %w", err)
+	}
+	if err := tmpFile.Close(); err != nil {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: failed to close temp file: %w", err)
+	}
+
+	// mode 0 = hybrid (Parallel Box Decoding). A null result means the call
+	// failed: report it instead of reading detections the accessors may still
+	// hold. The JSON text is unused, but a non-null string must be freed.
+	jsonPtr := CapiLocatePath(r.handle, tmpFile.Name(), prompt, 0)
+	if jsonPtr == 0 {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: la_capi_locate_path failed: %s", CapiLastError(r.handle))
+	}
+	CapiFreeString(jsonPtr)
+
+	n := CapiGetNDetections(r.handle)
+	if n < 0 {
+		return pb.DetectResponse{}, fmt.Errorf("locate-anything-cpp: invalid n_detections=%d", n)
+	}
+
+	detections := make([]*pb.Detection, 0, n)
+	for i := int32(0); i < n; i++ {
+		var xyxy [4]float32 // x1, y1, x2, y2
+		if CapiGetDetectionBox(r.handle, i, uintptr(unsafe.Pointer(&xyxy[0]))) != 0 {
+			continue // non-zero = box lookup failed; drop this detection
+		}
+
+		// Two-call sizing: the first call returns the size incl. NUL, the second fills buf.
+		label := ""
+		need := CapiGetDetectionLabel(r.handle, i, 0, 0)
+		if need > 0 {
+			buf := make([]byte, need)
+			CapiGetDetectionLabel(r.handle, i, uintptr(unsafe.Pointer(&buf[0])), need)
+			label = string(buf[:need-1]) // strip the trailing NUL
+		}
+
+		detections = append(detections, &pb.Detection{
+			X:          xyxy[0],
+			Y:          xyxy[1],
+			Width:      xyxy[2] - xyxy[0],
+			Height:     xyxy[3] - xyxy[1],
+			Confidence: 1.0, // fixed value: no score accessor is bound in this file
+			ClassName:  label,
+		})
+	}
+
+	return pb.DetectResponse{Detections: detections}, nil
+}
--- a/backend/go/locate-anything-cpp/main.go
+++ b/backend/go/locate-anything-cpp/main.go
@@ -0,0 +1,59 @@
+package main
+
+// main.go - entry point for the locate-anything-cpp gRPC backend.
+//
+// Dlopens liblocateanythingcpp-<variant>.so via purego at the path in
+// LOCATEANYTHING_LIBRARY (set by run.sh based on /proc/cpuinfo), registers
+// the la_capi_* C ABI symbols, then starts the gRPC server.
+
+import (
+	"flag"
+	"os"
+
+	"github.com/ebitengine/purego"
+	grpc "github.com/mudler/LocalAI/pkg/grpc"
+)
+
+var (
+	addr = flag.String("addr", "localhost:50051", "the address to connect to")
+)
+
+type LibFuncs struct {
+	FuncPtr any    // pointer to the Go func variable to bind, e.g. &CapiLoad
+	Name    string // C symbol name passed to purego.RegisterLibFunc
+}
+
+func main() {
+	// LOCATEANYTHING_LIBRARY selects the shared library to dlopen; when it is
+	// unset or empty, the fallback variant in the working directory is used.
+	libName := "./liblocateanythingcpp-fallback.so"
+	if fromEnv := os.Getenv("LOCATEANYTHING_LIBRARY"); fromEnv != "" {
+		libName = fromEnv
+	}
+
+	lib, err := purego.Dlopen(libName, purego.RTLD_NOW|purego.RTLD_GLOBAL)
+	if err != nil {
+		panic(err)
+	}
+
+	// Bind every Capi* function variable to its la_capi_* symbol in the library.
+	for _, binding := range []LibFuncs{
+		{FuncPtr: &CapiLoad, Name: "la_capi_load"},
+		{FuncPtr: &CapiFree, Name: "la_capi_free"},
+		{FuncPtr: &CapiLocatePath, Name: "la_capi_locate_path"},
+		{FuncPtr: &CapiLocateBuffer, Name: "la_capi_locate_buffer"},
+		{FuncPtr: &CapiGetNDetections, Name: "la_capi_get_n_detections"},
+		{FuncPtr: &CapiGetDetectionBox, Name: "la_capi_get_detection_box"},
+		{FuncPtr: &CapiGetDetectionLabel, Name: "la_capi_get_detection_label"},
+		{FuncPtr: &CapiFreeString, Name: "la_capi_free_string"},
+		{FuncPtr: &CapiLastError, Name: "la_capi_last_error"},
+	} {
+		purego.RegisterLibFunc(binding.FuncPtr, lib, binding.Name)
+	}
+
+	flag.Parse()
+
+	if err := grpc.StartServer(*addr, &LocateAnythingCpp{}); err != nil {
+		panic(err)
+	}
+}
--- a/backend/go/locate-anything-cpp/main_test.go
+++ b/backend/go/locate-anything-cpp/main_test.go
@@ -0,0 +1,176 @@
+package main
+
+// main_test.go - end-to-end smoke test for the locate-anything-cpp gRPC backend.
+//
+// Spawns the compiled locate-anything-cpp binary on a free local port, dials it
+// via gRPC, and exercises LoadModel + Detect against the test fixtures
+// downloaded by test.sh: the q8_0 GGUF of nvidia/LocateAnything-3B and a real
+// COCO image with people + cars. Asserts that open-vocabulary detection driven
+// by a text prompt returns at least one detection, each carrying a non-empty
+// class name and a bounding box of non-zero size.
+//
+// The spec Skip()s cleanly if its fixtures (the ~6.3 GB model, the test image,
+// the built binary, or the fallback .so) are missing, so the test target stays
+// usable on a fresh checkout / on CI runners where the large model hasn't been
+// downloaded.
+
+import (
+	"context"
+	"encoding/base64"
+	"fmt"
+	"net"
+	"os"
+	"os/exec"
+	"path/filepath"
+	"testing"
+	"time"
+
+	pb "github.com/mudler/LocalAI/pkg/grpc/proto"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+)
+
+func TestDetect(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "locate-anything-cpp backend smoke suite")
+}
+
+// freePort asks the kernel for an ephemeral TCP port on loopback and closes
+// the listener again so the spawned backend can bind that port. Another
+// process could claim it in between; that race is tolerated for a smoke test.
+func freePort() int {
+	listener, err := net.Listen("tcp", "127.0.0.1:0")
+	Expect(err).ToNot(HaveOccurred(), "freePort listen")
+	tcpAddr := listener.Addr().(*net.TCPAddr)
+	Expect(listener.Close()).To(Succeed())
+	return tcpAddr.Port
+}
+
+// startBackend spawns the locate-anything-cpp binary on the given port and
+// waits until it accepts TCP connections (up to 10s). It mirrors how main.go
+// resolves the purego library: the LOCATEANYTHING_LIBRARY env var points the
+// dlopen at the freshly built fallback .so, and the la_capi_* symbols are
+// registered there. The returned cleanup func kills the process and reaps it.
+func startBackend(port int) func() {
+	binary, err := filepath.Abs("./locate-anything-cpp")
+	Expect(err).ToNot(HaveOccurred())
+	if _, err := os.Stat(binary); err != nil {
+		Skip(fmt.Sprintf("backend binary not built: %s (run `make locate-anything-cpp` first)", binary))
+	}
+
+	libPath, err := filepath.Abs("./liblocateanythingcpp-fallback.so")
+	Expect(err).ToNot(HaveOccurred())
+	if _, err := os.Stat(libPath); err != nil {
+		Skip(fmt.Sprintf("fallback library not built: %s (run `make liblocateanythingcpp-fallback.so` first)", libPath))
+	}
+
+	addr := fmt.Sprintf("127.0.0.1:%d", port)
+	cmd := exec.Command(binary, "--addr", addr)
+	cmd.Env = append(os.Environ(), "LOCATEANYTHING_LIBRARY="+libPath)
+	cmd.Stdout = os.Stderr
+	cmd.Stderr = os.Stderr
+	Expect(cmd.Start()).To(Succeed())
+
+	cleanup := func() {
+		if cmd.Process != nil {
+			_ = cmd.Process.Kill()
+			_, _ = cmd.Process.Wait()
+		}
+	}
+
+	deadline := time.Now().Add(10 * time.Second)
+	for time.Now().Before(deadline) {
+		c, err := net.DialTimeout("tcp", addr, 200*time.Millisecond)
+		if err == nil {
+			_ = c.Close()
+			return cleanup
+		}
+		time.Sleep(200 * time.Millisecond)
+	}
+
+	cleanup()
+	Fail(fmt.Sprintf("backend did not become ready on %s within 10s", addr))
+	return func() {}
+}
+
+// loadTestImage returns test-data/test.jpg encoded with standard base64, and
+// skips the current spec when the file cannot be read.
+func loadTestImage() string {
+	imgPath, err := filepath.Abs("test-data/test.jpg")
+	Expect(err).ToNot(HaveOccurred())
+	raw, readErr := os.ReadFile(imgPath)
+	if readErr != nil {
+		Skip(fmt.Sprintf("test image not present: %s (run test.sh first)", imgPath))
+	}
+	return base64.StdEncoding.EncodeToString(raw)
+}
+
+// dialBackend returns a plaintext gRPC client for the backend plus its closer.
+func dialBackend(port int) (pb.BackendClient, func()) {
+	creds := grpc.WithTransportCredentials(insecure.NewCredentials())
+	conn, err := grpc.NewClient(fmt.Sprintf("127.0.0.1:%d", port), creds)
+	Expect(err).ToNot(HaveOccurred())
+	return pb.NewBackendClient(conn), func() { _ = conn.Close() }
+}
+
+// modelPathOrSkip returns the absolute path of name inside ./test-models/.
+// When os.Stat fails for that path the current spec is skipped.
+// The ~6.3 GB GGUF is only there after test.sh has downloaded it.
+func modelPathOrSkip(name string) string {
+	candidate, err := filepath.Abs(filepath.Join("test-models", name))
+	Expect(err).ToNot(HaveOccurred())
+	_, statErr := os.Stat(candidate)
+	if statErr != nil {
+		Skip(fmt.Sprintf("model not present: %s (run test.sh first)", candidate))
+	}
+	return candidate
+}
+
+var _ = Describe("locate-anything-cpp backend", func() {
+	It("runs open-vocabulary detection against a known-good COCO image", func() {
+		modelPath := modelPathOrSkip("locate-anything-q8_0.gguf")
+		imgB64 := loadTestImage()
+
+		port := freePort()
+		cleanup := startBackend(port)
+		defer cleanup()
+
+		client, closeConn := dialBackend(port)
+		defer closeConn()
+
+		// The q8_0 model is ~6.3 GB and hybrid Parallel Box Decoding on CPU is
+		// not cheap, so give LoadModel + Detect a generous deadline.
+		ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
+		defer cancel()
+
+		loadResp, err := client.LoadModel(ctx, &pb.ModelOptions{
+			Model:     "locate-anything-q8_0.gguf",
+			ModelFile: modelPath,
+			Threads:   4,
+		})
+		Expect(err).ToNot(HaveOccurred(), "LoadModel")
+		Expect(loadResp.GetSuccess()).To(BeTrue(), "LoadModel reported failure: %s", loadResp.GetMessage())
+
+		// Open-vocabulary detection is prompt-driven; the prompt names the
+		// classes to locate (people + cars), separated by the </c> control token.
+		detResp, err := client.Detect(ctx, &pb.DetectOptions{
+			Src:    imgB64,
+			Prompt: "Locate all the instances that matches the following description: person</c>car.",
+		})
+		Expect(err).ToNot(HaveOccurred(), "Detect")
+		Expect(detResp.GetDetections()).ToNot(BeEmpty(), "no detections returned on a known-good COCO image")
+
+		_, _ = fmt.Fprintf(GinkgoWriter, "detection OK: %d detections\n", len(detResp.GetDetections()))
+		for i, d := range detResp.GetDetections() {
+			Expect(d.GetClassName()).ToNot(BeEmpty(), "detection %d has empty class_name", i)
+			Expect(d.GetWidth()).To(BeNumerically(">", float32(0)),
+				"detection %d has non-positive width", i)
+			Expect(d.GetHeight()).To(BeNumerically(">", float32(0)),
+				"detection %d has non-positive height", i)
+			_, _ = fmt.Fprintf(GinkgoWriter, "  [%d] %s box=(%.1f,%.1f,%.1fx%.1f)\n",
+				i, d.GetClassName(), d.GetX(), d.GetY(), d.GetWidth(), d.GetHeight())
+		}
+	})
+})
--- a/backend/go/locate-anything-cpp/package.sh
+++ b/backend/go/locate-anything-cpp/package.sh
@@ -0,0 +1,59 @@
+#!/bin/bash
+
+# Copy the backend binary, its shared libraries and the per-architecture runtime libs into package/
+
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+REPO_ROOT="${CURDIR}/../../.."
+
+# Create lib directory (paths are quoted so a checkout path with spaces still works)
+mkdir -p "$CURDIR/package/lib"
+
+cp -avf "$CURDIR"/liblocateanythingcpp-*.so "$CURDIR/package/"
+cp -avf "$CURDIR/locate-anything-cpp" "$CURDIR/package/"
+cp -fv "$CURDIR/run.sh" "$CURDIR/package/"
+
+# Detect architecture and copy appropriate libraries
+if [ -f "/lib64/ld-linux-x86-64.so.2" ]; then
+    # x86_64 architecture
+    echo "Detected x86_64 architecture, copying x86_64 libraries..."
+    cp -arfLv /lib64/ld-linux-x86-64.so.2 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/x86_64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/x86_64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/x86_64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/x86_64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ -f "/lib/ld-linux-aarch64.so.1" ]; then
+    # ARM64 architecture
+    echo "Detected ARM64 architecture, copying ARM64 libraries..."
+    cp -arfLv /lib/ld-linux-aarch64.so.1 "$CURDIR/package/lib/ld.so"
+    cp -arfLv /lib/aarch64-linux-gnu/libc.so.6 "$CURDIR/package/lib/libc.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgcc_s.so.1 "$CURDIR/package/lib/libgcc_s.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libstdc++.so.6 "$CURDIR/package/lib/libstdc++.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libm.so.6 "$CURDIR/package/lib/libm.so.6"
+    cp -arfLv /lib/aarch64-linux-gnu/libgomp.so.1 "$CURDIR/package/lib/libgomp.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libdl.so.2 "$CURDIR/package/lib/libdl.so.2"
+    cp -arfLv /lib/aarch64-linux-gnu/librt.so.1 "$CURDIR/package/lib/librt.so.1"
+    cp -arfLv /lib/aarch64-linux-gnu/libpthread.so.0 "$CURDIR/package/lib/libpthread.so.0"
+elif [ "$(uname -s)" = "Darwin" ]; then
+    echo "Detected Darwin"
+else
+    echo "Error: Could not detect architecture"
+    exit 1
+fi
+
+# Package GPU libraries based on BUILD_TYPE
+GPU_LIB_SCRIPT="${REPO_ROOT}/scripts/build/package-gpu-libs.sh"
+if [ -f "$GPU_LIB_SCRIPT" ]; then
+    echo "Packaging GPU libraries for BUILD_TYPE=${BUILD_TYPE:-cpu}..."
+    source "$GPU_LIB_SCRIPT" "$CURDIR/package/lib"
+    package_gpu_libs
+fi
+
+echo "Packaging completed successfully"
+ls -liah "$CURDIR/package/"
+ls -liah "$CURDIR/package/lib/"
--- a/backend/go/locate-anything-cpp/run.sh
+++ b/backend/go/locate-anything-cpp/run.sh
@@ -0,0 +1,52 @@
+#!/bin/bash
+set -ex
+
+# Get the absolute current dir where the script is located
+CURDIR=$(dirname "$(realpath "$0")")
+
+cd /
+
+echo "CPU info:"
+if [ "$(uname)" != "Darwin" ]; then
+	grep -e "model\sname" /proc/cpuinfo | head -1
+	grep -e "flags" /proc/cpuinfo | head -1
+fi
+
+LIBRARY="$CURDIR/liblocateanythingcpp-fallback.so"
+
+if [ "$(uname)" != "Darwin" ]; then
+	if grep -q -e "\savx\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX    found OK"
+		if [ -e "$CURDIR/liblocateanythingcpp-avx.so" ]; then
+			LIBRARY="$CURDIR/liblocateanythingcpp-avx.so"
+		fi
+	fi
+
+	if grep -q -e "\savx2\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX2   found OK"
+		if [ -e "$CURDIR/liblocateanythingcpp-avx2.so" ]; then
+			LIBRARY="$CURDIR/liblocateanythingcpp-avx2.so"
+		fi
+	fi
+
+	# Check avx 512
+	if grep -q -e "\savx512f\s" /proc/cpuinfo ; then
+		echo "CPU:    AVX512F found OK"
+		if [ -e "$CURDIR/liblocateanythingcpp-avx512.so" ]; then
+			LIBRARY="$CURDIR/liblocateanythingcpp-avx512.so"
+		fi
+	fi
+fi
+# Prepend the bundled libs; omit the ':' when LD_LIBRARY_PATH is empty (an empty entry means the cwd)
+export LD_LIBRARY_PATH="$CURDIR/lib${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
+export LOCATEANYTHING_LIBRARY="$LIBRARY"
+
+# If there is a lib/ld.so, use it
+if [ -f "$CURDIR/lib/ld.so" ]; then
+	echo "Using lib/ld.so"
+	echo "Using library: $LIBRARY"
+	exec "$CURDIR/lib/ld.so" "$CURDIR/locate-anything-cpp" "$@"
+fi
+
+echo "Using library: $LIBRARY"
+exec "$CURDIR/locate-anything-cpp" "$@"
--- a/backend/go/locate-anything-cpp/test.sh
+++ b/backend/go/locate-anything-cpp/test.sh
@@ -0,0 +1,47 @@
+#!/bin/bash
+set -e
+
+CURDIR=$(dirname "$(realpath "$0")")
+
+echo "Running locate-anything-cpp backend tests..."
+
+# Test model from the mudler/locate-anything.cpp-gguf HuggingFace repo. This is
+# the q8_0 quantization of nvidia/LocateAnything-3B (~6.3 GB), so the download
+# is the slow step. It goes to a .part file that `curl -C -` resumes and that is
+# renamed only once complete; the download is skipped if the final file exists.
+LOCATEANYTHING_MODEL_DIR="${LOCATEANYTHING_MODEL_DIR:-$CURDIR/test-models}"
+LOCATEANYTHING_MODEL_FILE="${LOCATEANYTHING_MODEL_FILE:-locate-anything-q8_0.gguf}"
+LOCATEANYTHING_MODEL_URL="${LOCATEANYTHING_MODEL_URL:-https://huggingface.co/mudler/locate-anything.cpp-gguf/resolve/main/locate-anything-q8_0.gguf}"
+
+mkdir -p "$LOCATEANYTHING_MODEL_DIR"
+
+if [ ! -f "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE" ]; then
+    echo "Downloading locate-anything q8_0 model (~6.3 GB, this is slow)..."
+    # -C - resumes the .part file; --fail stops an HTTP error page from being saved as the model.
+    curl --fail -L -C - -o "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE.part" "$LOCATEANYTHING_MODEL_URL" --progress-bar
+    mv "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE.part" "$LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE"
+fi
+
+# Use a real COCO test image (people + cars) from the upstream rf-detr.cpp repo
+# (~46 KB). Open-vocabulary detection needs real content to locate, so a
+# synthetic image would trivially yield zero detections.
+TEST_IMAGE_DIR="$CURDIR/test-data"
+TEST_IMAGE_FILE="$TEST_IMAGE_DIR/test.jpg"
+TEST_IMAGE_URL="${TEST_IMAGE_URL:-https://raw.githubusercontent.com/mudler/rf-detr.cpp/main/tests/fixtures/ci/test_image.jpg}"
+
+mkdir -p "$TEST_IMAGE_DIR"
+if [ ! -f "$TEST_IMAGE_FILE" ]; then
+    echo "Downloading COCO test image..."
+    curl --fail -L -o "$TEST_IMAGE_FILE" "$TEST_IMAGE_URL" --progress-bar
+fi
+
+echo "locate-anything-cpp test setup complete."
+echo "  model:      $LOCATEANYTHING_MODEL_DIR/$LOCATEANYTHING_MODEL_FILE"
+echo "  test image: $TEST_IMAGE_FILE"
+
+# Run the Go smoke test: spawns the backend binary on a free port, calls
+# LoadModel + Detect via gRPC against the downloaded GGUF + COCO image.
+echo ""
+echo "Running Go smoke test..."
+cd "$CURDIR"
+go test -v -timeout 30m ./...
--- a/backend/go/stablediffusion-ggml/Makefile
+++ b/backend/go/stablediffusion-ggml/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # stablediffusion.cpp (ggml)
 STABLEDIFFUSION_GGML_REPO?=https://github.com/leejet/stable-diffusion.cpp
-STABLEDIFFUSION_GGML_VERSION?=b3d56d0ba1bd437886079e339118e8e75bb79ee7
+STABLEDIFFUSION_GGML_VERSION?=19bdfe22d255d5b4dff39d449318b9bc5ea2317f

 CMAKE_ARGS+=-DGGML_MAX_NAME=128

--- a/backend/go/whisper/Makefile
+++ b/backend/go/whisper/Makefile
@@ -8,7 +8,7 @@ JOBS?=$(shell nproc --ignore=1)

 # whisper.cpp version
 WHISPER_REPO?=https://github.com/ggml-org/whisper.cpp
-WHISPER_CPP_VERSION?=a8ec021f2750a473ff4a8f3883bc9fdf5feafa84
+WHISPER_CPP_VERSION?=df7638d8229a243af8a4b5a8ae557e0d74e0a0ae
 SO_TARGET?=libgowhisper.so

 CMAKE_ARGS+=-DBUILD_SHARED_LIBS=OFF
--- a/backend/index.yaml
+++ b/backend/index.yaml
@@ -337,6 +337,35 @@
    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-rfdetr-cpp"
    intel: "intel-sycl-f32-rfdetr-cpp"
    vulkan: "vulkan-rfdetr-cpp"
+- &locateanything
+  name: "locate-anything"
+  alias: "locate-anything"
+  license: apache-2.0
+  description: |
+    Open-vocabulary object detection and visual grounding (NVIDIA
+    LocateAnything-3B) in C/C++ using GGML. Loads pre-built GGUF weights
+    and, given an image and a free-form text prompt, returns bounding
+    boxes, class labels, and confidence scores for the referred objects.
+  urls:
+    - https://github.com/mudler/locate-anything.cpp
+    - https://huggingface.co/nvidia/LocateAnything-3B
+  tags:
+    - object-detection
+    - visual-grounding
+    - open-vocabulary
+    - locate-anything
+    - gpu
+    - cpu
+  capabilities:
+    default: "cpu-locate-anything-cpp"
+    nvidia: "cuda12-locate-anything-cpp"
+    nvidia-cuda-12: "cuda12-locate-anything-cpp"
+    nvidia-cuda-13: "cuda13-locate-anything-cpp"
+    nvidia-l4t: "nvidia-l4t-arm64-locate-anything-cpp"
+    nvidia-l4t-cuda-12: "nvidia-l4t-arm64-locate-anything-cpp"
+    nvidia-l4t-cuda-13: "cuda13-nvidia-l4t-arm64-locate-anything-cpp"
+    intel: "intel-sycl-f32-locate-anything-cpp"
+    vulkan: "vulkan-locate-anything-cpp"
 - &vllm
  name: "vllm"
  license: apache-2.0
--- a/core/cli/chat/chat.go
+++ b/core/cli/chat/chat.go
@@ -0,0 +1,30 @@
+package chat
+
+import (
+	"context"
+	"io"
+	"strings"
+)
+
+type Options struct {
+	Model   string    // requested model name, handed to newChatSession for resolution
+	BaseURL string    // base URL given to the chat client
+	APIKey  string    // API key given to the chat client
+	In      io.Reader // prompt source; Run substitutes an empty reader when nil
+	Out     io.Writer // output sink; Run substitutes io.Discard when nil
+}
+
+func Run(ctx context.Context, opts Options) error {
+	client := newLocalAIChatClient(opts.BaseURL, opts.APIKey)
+	session, err := newChatSession(ctx, client, opts.Model) // lists models, resolves opts.Model
+	if err != nil {
+		return err
+	}
+	if opts.In == nil { // no input stream: behave as immediate EOF
+		opts.In = strings.NewReader("")
+	}
+	if opts.Out == nil { // no output stream: drop everything written
+		opts.Out = io.Discard
+	}
+	return runTerminalChat(ctx, session, opts.In, opts.Out)
+}
--- a/core/cli/chat/chat_suite_test.go
+++ b/core/cli/chat/chat_suite_test.go
@@ -0,0 +1,13 @@
+package chat
+
+import (
+	"testing"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+func TestChat(t *testing.T) {
+	RegisterFailHandler(Fail)
+	RunSpecs(t, "Chat Suite")
+}
--- a/core/cli/chat/chat_test.go
+++ b/core/cli/chat/chat_test.go
@@ -0,0 +1,172 @@
+package chat
+
+import (
+	"bytes"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"net/http/httptest"
+	"strings"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Run chat", func() {
+	It("streams a single chat response", func() {
+		var capturedModel string
+		var capturedAuth string
+		server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+			if r.URL.Path == "/v1/models" {
+				w.Header().Set("Content-Type", "application/json")
+				writeResponse(w, `{"object":"list","data":[{"id":"test-model","object":"model"}]}`)
+				return
+			}
+
+			Expect(r.URL.Path).To(Equal("/v1/chat/completions"))
+			capturedAuth = r.Header.Get("Authorization")
+
+			var body struct {
+				Model    string `json:"model"`
+				Messages []struct {
+					Role    string `json:"role"`
+					Content string `json:"content"`
+				} `json:"messages"`
+			}
+			Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed())
+			capturedModel = body.Model
+			Expect(body.Messages).To(HaveLen(1))
+			Expect(body.Messages[0].Role).To(Equal("user"))
+			Expect(body.Messages[0].Content).To(Equal("hello"))
+
+			w.Header().Set("Content-Type", "text/event-stream")
+			writeResponse(w, "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"hi\"}}]}\n\n")
+			writeResponse(w, "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"!\"}}]}\n\n")
+			writeResponse(w, "data: [DONE]\n\n")
+		}))
+		defer server.Close()
+
+		var out bytes.Buffer
+		err := Run(GinkgoT().Context(), Options{
+			Model:   "test-model",
+			BaseURL: server.URL + "/v1",
+			APIKey:  "secret",
+			In:      strings.NewReader("hello\n/exit\n"),
+			Out:     &out,
+		})
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(capturedModel).To(Equal("test-model"))
+		Expect(capturedAuth).To(Equal("Bearer secret"))
+		Expect(out.String()).To(ContainSubstring("assistant: hi!"))
+		Expect(out.String()).To(ContainSubstring("bye"))
+	})
+
+	It("auto-selects the only available model", func() {
+		server := chatTestServer([]string{"solo"}, nil)
+		defer server.Close()
+
+		var out bytes.Buffer
+		err := Run(GinkgoT().Context(), Options{
+			BaseURL: server.URL + "/v1",
+			In:      strings.NewReader("/exit\n"),
+			Out:     &out,
+		})
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(out.String()).To(ContainSubstring("LocalAI chat (solo)"))
+	})
+
+	It("returns an actionable error when no models are installed", func() {
+		server := chatTestServer(nil, nil)
+		defer server.Close()
+
+		err := Run(GinkgoT().Context(), Options{
+			BaseURL: server.URL + "/v1",
+			In:      strings.NewReader(""),
+		})
+
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("no chat models are installed"))
+		Expect(err.Error()).To(ContainSubstring("local-ai models install <model>"))
+	})
+
+	It("returns an actionable error when multiple models are available without a selection", func() {
+		server := chatTestServer([]string{"alpha", "beta"}, nil)
+		defer server.Close()
+
+		err := Run(GinkgoT().Context(), Options{
+			BaseURL: server.URL + "/v1",
+			In:      strings.NewReader(""),
+		})
+
+		Expect(err).To(HaveOccurred())
+		Expect(err.Error()).To(ContainSubstring("multiple models are available"))
+		Expect(err.Error()).To(ContainSubstring("--model"))
+		Expect(err.Error()).To(ContainSubstring("alpha"))
+		Expect(err.Error()).To(ContainSubstring("beta"))
+	})
+
+	It("lists and switches models inside the chat", func() {
+		requestedModels := []string{}
+		server := chatTestServer([]string{"alpha", "beta"}, func(model string) {
+			requestedModels = append(requestedModels, model)
+		})
+		defer server.Close()
+
+		var out bytes.Buffer
+		err := Run(GinkgoT().Context(), Options{
+			Model:   "alpha",
+			BaseURL: server.URL + "/v1",
+			In:      strings.NewReader("/models\n/model beta\nhello\n/exit\n"),
+			Out:     &out,
+		})
+
+		Expect(err).ToNot(HaveOccurred())
+		Expect(out.String()).To(ContainSubstring("* alpha"))
+		Expect(out.String()).To(ContainSubstring("  beta"))
+		Expect(out.String()).To(ContainSubstring("switched to beta; conversation cleared"))
+		Expect(requestedModels).To(Equal([]string{"beta"}))
+	})
+})
+
+func chatTestServer(models []string, onChat func(model string)) *httptest.Server {
+	return httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		// The handler runs on a server goroutine; GinkgoRecover lets a failed Expect be reported by Ginkgo.
+		defer GinkgoRecover()
+		switch r.URL.Path {
+		case "/v1/models":
+			w.Header().Set("Content-Type", "application/json")
+			writeResponse(w, `{"object":"list","data":[`)
+			for i, model := range models {
+				if i > 0 {
+					writeResponse(w, ",")
+				}
+				writeResponsef(w, `{"id":%q,"object":"model"}`, model)
+			}
+			writeResponse(w, `]}`)
+		case "/v1/chat/completions":
+			var body struct{ Model string `json:"model"` }
+			Expect(json.NewDecoder(r.Body).Decode(&body)).To(Succeed())
+			if onChat != nil {
+				onChat(body.Model)
+			}
+			w.Header().Set("Content-Type", "text/event-stream")
+			writeResponse(w, "data: {\"choices\":[{\"index\":0,\"delta\":{\"content\":\"ok\"}}]}\n\n")
+			writeResponse(w, "data: [DONE]\n\n")
+		default:
+			w.WriteHeader(http.StatusNotFound)
+		}
+	}))
+}
+
+func writeResponse(w io.Writer, text string) {
+	_, err := fmt.Fprint(w, text)
+	Expect(err).ToNot(HaveOccurred())
+}
+
+func writeResponsef(w io.Writer, format string, args ...any) {
+	_, err := fmt.Fprintf(w, format, args...)
+	Expect(err).ToNot(HaveOccurred())
+}
--- a/core/cli/chat/client.go
+++ b/core/cli/chat/client.go
@@ -0,0 +1,114 @@
+package chat
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"sort"
+	"strings"
+
+	openai "github.com/sashabaranov/go-openai"
+)
+
+type chatClient interface {
+	ListModels(ctx context.Context) ([]string, error)
+	StreamChat(ctx context.Context, model string, messages []chatMessage, out io.Writer) (string, error)
+}
+
+type localAIChatClient struct {
+	client *openai.Client
+}
+
+func newLocalAIChatClient(baseURL string, apiKey string) *localAIChatClient {
+	cfg := openai.DefaultConfig(apiKey)
+	cfg.BaseURL = baseURL
+	return &localAIChatClient{client: openai.NewClientWithConfig(cfg)}
+}
+
+// ListModels returns the non-empty model IDs reported by the server, sorted.
+func (c *localAIChatClient) ListModels(ctx context.Context) ([]string, error) {
+	listing, err := c.client.ListModels(ctx)
+	if err != nil {
+		return nil, err
+	}
+	ids := make([]string, 0, len(listing.Models))
+	for i := range listing.Models {
+		if id := listing.Models[i].ID; id != "" {
+			ids = append(ids, id)
+		}
+	}
+	sort.Strings(ids)
+	return ids, nil
+}
+
+// StreamChat sends messages to model as a streaming chat completion, writes
+// each content delta to out as it arrives and returns the concatenated text.
+// If the stream or the writer fails, the text gathered so far is returned too.
+func (c *localAIChatClient) StreamChat(ctx context.Context, model string, messages []chatMessage, out io.Writer) (string, error) {
+	stream, err := c.client.CreateChatCompletionStream(ctx, openai.ChatCompletionRequest{
+		Model:    model,
+		Messages: openAIChatMessages(messages),
+	})
+	if err != nil {
+		return "", friendlyChatError(err, model)
+	}
+	defer func() {
+		_ = stream.Close()
+	}()
+
+	var answer strings.Builder
+	for {
+		chunk, recvErr := stream.Recv()
+		switch {
+		case errors.Is(recvErr, io.EOF):
+			// End of stream: the answer is complete.
+			return answer.String(), nil
+		case recvErr != nil:
+			return answer.String(), friendlyChatError(recvErr, model)
+		case len(chunk.Choices) == 0 || chunk.Choices[0].Delta.Content == "":
+			// Chunks without choices or without text contribute nothing.
+			continue
+		}
+
+		delta := chunk.Choices[0].Delta.Content
+		answer.WriteString(delta)
+		// Echo the delta right away so the reply appears incrementally.
+		if _, writeErr := fmt.Fprint(out, delta); writeErr != nil {
+			return answer.String(), writeErr
+		}
+	}
+}
+
+// openAIChatMessages converts session messages into the go-openai message
+// type, copying Role and Content one-to-one and keeping the input order.
+// An empty input yields an empty, non-nil slice.
+func openAIChatMessages(messages []chatMessage) []openai.ChatCompletionMessage {
+	out := make([]openai.ChatCompletionMessage, 0, len(messages))
+	for _, m := range messages {
+		out = append(out, openai.ChatCompletionMessage{Role: m.Role, Content: m.Content})
+	}
+	return out
+}
+
+// friendlyChatError turns API 404/403 and "model ... not found" errors into actionable hints; other errors pass through.
+func friendlyChatError(err error, model string) error {
+	notAvailable := fmt.Errorf("model %q is not available. Run `local-ai models list`, install a model with `local-ai models install <model>`, or switch with `/model <name>`", model)
+	var apiErr *openai.APIError
+	if errors.As(err, &apiErr) {
+		if apiErr.HTTPStatusCode == 404 {
+			return notAvailable
+		}
+		if apiErr.HTTPStatusCode == 403 {
+			return fmt.Errorf("model %q is disabled. Enable it from LocalAI settings or choose another model with `/model <name>`", model)
+		}
+		if apiErr.Message != "" {
+			return errors.New(apiErr.Message)
+		}
+	}
+
+	if text := err.Error(); strings.Contains(text, "model") && strings.Contains(text, "not found") {
+		return notAvailable
+	}
+	return err
+}
--- a/core/cli/chat/models.go
+++ b/core/cli/chat/models.go
@@ -0,0 +1,17 @@
+package chat
+
+import "strings"
+
+// formatChatModelList renders one model per line, prefixing the entry equal
+// to current with "* " and every other entry with two spaces.
+func formatChatModelList(models []string, current string) string {
+	var sb strings.Builder
+	for i := range models {
+		marker := "  "
+		if models[i] == current {
+			marker = "* "
+		}
+		sb.WriteString(marker + models[i] + "\n")
+	}
+	return sb.String()
+}
--- a/core/cli/chat/session.go
+++ b/core/cli/chat/session.go
@@ -0,0 +1,120 @@
+package chat
+
+import (
+	"context"
+	"errors"
+	"fmt"
+	"io"
+	"strings"
+)
+
+const (
+	chatRoleUser      = "user"
+	chatRoleAssistant = "assistant"
+)
+
+type chatMessage struct {
+	Role    string
+	Content string
+}
+
+type chatSession struct {
+	client   chatClient    // used to list models and stream completions
+	model    string        // currently selected model
+	models   []string      // model list captured when the session was created
+	messages []chatMessage // conversation history passed to the client on each Send
+}
+
+// newChatSession asks client for its model list, resolves requestedModel
+// against it and returns a session with an empty history. A listing failure
+// is wrapped with "list models"; a resolution failure is returned unchanged.
+func newChatSession(ctx context.Context, client chatClient, requestedModel string) (*chatSession, error) {
+	available, err := client.ListModels(ctx)
+	if err != nil {
+		return nil, fmt.Errorf("list models: %w", err)
+	}
+
+	selected, err := resolveChatModel(requestedModel, available)
+	if err != nil {
+		return nil, err
+	}
+
+	session := &chatSession{client: client, model: selected, models: available}
+	return session, nil
+}
+
+func (s *chatSession) CurrentModel() string {
+	return s.model
+}
+
+func (s *chatSession) Models() []string {
+	// Return a copy so callers cannot modify the session's own list;
+	// the result is non-nil even when the session has no models.
+	return append(make([]string, 0, len(s.models)), s.models...)
+}
+
+func (s *chatSession) Clear() {
+	s.messages = nil
+}
+
+func (s *chatSession) SwitchModel(model string) error {
+	// A successful switch also drops the history; an unknown name changes nothing.
+	if modelExists(s.models, model) {
+		s.model, s.messages = model, nil
+		return nil
+	}
+	return fmt.Errorf("model %q is not available. Use /models to see installed models", model)
+}
+
+// Send submits prompt, preceded by the recorded history, to the current model
+// and streams the reply to out. The user/assistant pair is added to the
+// history only when the call succeeds, so a failed turn leaves the session
+// as it was instead of recording a user message that never got a reply.
+func (s *chatSession) Send(ctx context.Context, prompt string, out io.Writer) error {
+	// s.messages is not reassigned until the reply has arrived, so an error
+	// below keeps the recorded conversation unchanged.
+	turn := append(s.messages, chatMessage{Role: chatRoleUser, Content: prompt})
+
+	answer, err := s.client.StreamChat(ctx, s.model, turn, out)
+	if err != nil {
+		return err
+	}
+
+	s.messages = append(turn, chatMessage{Role: chatRoleAssistant, Content: answer})
+	return nil
+}
+
+// resolveChatModel validates an explicit request, else picks the only model or explains why it cannot choose.
+func resolveChatModel(requested string, models []string) (string, error) {
+	if requested != "" {
+		if modelExists(models, requested) {
+			return requested, nil
+		}
+		return "", fmt.Errorf("model %q is not available. Use `local-ai models list` and `local-ai models install <model>`, or pass an installed model with --model", requested)
+	}
+	switch len(models) {
+	case 0:
+		return "", errors.New(`no chat models are installed.
+
+Install a model first, for example:
+  local-ai models list
+  local-ai models install <model>
+  local-ai run
+
+Then start a chat session:
+  local-ai chat --model <model>`)
+	case 1:
+		return models[0], nil
+	default:
+		return "", errors.New("multiple models are available; choose one with --model:\n" + formatChatModelList(models, ""))
+	}
+}
+
+// modelExists reports whether name equals one of the entries in models.
+func modelExists(models []string, name string) bool {
+	found := false
+	for i := 0; i < len(models) && !found; i++ {
+		found = models[i] == name
+	}
+	return found
+}
--- a/core/cli/chat/session_test.go
+++ b/core/cli/chat/session_test.go
@@ -0,0 +1,56 @@
+package chat
+
+import (
+	"context"
+	"io"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+var _ = Describe("Chat session", func() {
+	It("keeps model switching and message history out of the terminal adapter", func() {
+		client := &fakeChatClient{
+			models: []string{"alpha", "beta"},
+			answer: "pong",
+		}
+
+		session, err := newChatSession(context.Background(), client, "alpha")
+		Expect(err).ToNot(HaveOccurred())
+		Expect(session.CurrentModel()).To(Equal("alpha"))
+
+		Expect(session.SwitchModel("beta")).To(Succeed())
+		Expect(session.CurrentModel()).To(Equal("beta"))
+		Expect(session.Send(context.Background(), "ping", io.Discard)).To(Succeed())
+
+		Expect(client.requests).To(HaveLen(1))
+		Expect(client.requests[0].model).To(Equal("beta"))
+		Expect(client.requests[0].messages).To(HaveLen(1))
+		Expect(client.requests[0].messages[0].Content).To(Equal("ping"))
+	})
+})
+
+type fakeChatClient struct {
+	models   []string
+	answer   string
+	requests []fakeChatRequest
+}
+
+type fakeChatRequest struct {
+	model    string
+	messages []chatMessage
+}
+
+func (c *fakeChatClient) ListModels(context.Context) ([]string, error) {
+	return c.models, nil
+}
+
+func (c *fakeChatClient) StreamChat(_ context.Context, model string, messages []chatMessage, out io.Writer) (string, error) {
+	copied := make([]chatMessage, len(messages))
+	copy(copied, messages)
+	c.requests = append(c.requests, fakeChatRequest{model: model, messages: copied})
+	if _, err := io.WriteString(out, c.answer); err != nil {
+		return "", err
+	}
+	return c.answer, nil
+}
--- a/core/cli/chat/terminal.go
+++ b/core/cli/chat/terminal.go
@@ -0,0 +1,93 @@
+package chat
+
+import (
+	"bufio"
+	"context"
+	"fmt"
+	"io"
+	"strings"
+)
+
+// runTerminalChat reads one prompt per line from in until EOF or an exit
+// command, handling slash commands itself and sending anything else to session.
+func runTerminalChat(ctx context.Context, session *chatSession, in io.Reader, out io.Writer) error {
+	scanner := bufio.NewScanner(in)
+	scanner.Buffer(make([]byte, 0, 64*1024), 4*1024*1024)
+	if err := writeChat(out, "LocalAI chat (%s)\n", session.CurrentModel()); err != nil {
+		return err
+	}
+	if err := writeChat(out, "Type /exit to quit, /clear to reset the conversation, /models to list models.\n"); err != nil {
+		return err
+	}
+
+	for {
+		if err := writeChat(out, "\n> "); err != nil {
+			return err
+		}
+		if !scanner.Scan() {
+			break
+		}
+
+		prompt := strings.TrimSpace(scanner.Text())
+		switch prompt {
+		case "":
+			continue
+		case "/bye", "/exit", "/quit":
+			return writeChat(out, "bye\n")
+		case "/clear":
+			session.Clear()
+			if err := writeChat(out, "conversation cleared\n"); err != nil {
+				return err
+			}
+			continue
+		case "/models":
+			if err := printChatModels(out, session.Models(), session.CurrentModel()); err != nil {
+				return err
+			}
+			continue
+		case "/model":
+			// The prompt is trimmed, so a bare "/model" can never carry the "/model " prefix handled below.
+			if err := writeChat(out, "usage: /model <name>\n"); err != nil {
+				return err
+			}
+			continue
+		}
+
+		if nextModel, ok := strings.CutPrefix(prompt, "/model "); ok {
+			if err := session.SwitchModel(strings.TrimSpace(nextModel)); err != nil {
+				if writeErr := writeChat(out, "%s\n", err); writeErr != nil {
+					return writeErr
+				}
+				continue
+			}
+			if err := writeChat(out, "switched to %s; conversation cleared\n", session.CurrentModel()); err != nil {
+				return err
+			}
+			continue
+		}
+
+		if err := writeChat(out, "assistant: "); err != nil {
+			return err
+		}
+		if err := session.Send(ctx, prompt, out); err != nil {
+			return err
+		}
+		if err := writeChat(out, "\n"); err != nil {
+			return err
+		}
+	}
+
+	return scanner.Err()
+}
+
+func printChatModels(out io.Writer, models []string, current string) error {
+	if len(models) > 0 { // one line per model, the current one marked
+		return writeChat(out, "%s", formatChatModelList(models, current))
+	}
+	return writeChat(out, "no models installed\n")
+}
+
+func writeChat(out io.Writer, format string, args ...any) error {
+	_, err := fmt.Fprintf(out, format, args...) // only the error is propagated; the byte count is dropped
+	return err
+}
--- a/core/cli/chat_cmd.go
+++ b/core/cli/chat_cmd.go
@@ -0,0 +1,25 @@
+package cli
+
+import (
+	"context"
+	"os"
+
+	chatcli "github.com/mudler/LocalAI/core/cli/chat"
+	cliContext "github.com/mudler/LocalAI/core/cli/context"
+)
+
+// ChatCMD holds the flags of the chat subcommand, which opens an interactive
+// chat session against a running LocalAI server. The struct tags carry the
+// flag metadata: short name, environment variables, default and help text.
+// Endpoint defaults to the local server address; Model and APIKey are empty
+// unless supplied.
+type ChatCMD struct {
+	Model    string `short:"m" help:"Model name to use. Defaults to the only model returned by the server when exactly one is available"`
+	Endpoint string `env:"LOCALAI_CHAT_ENDPOINT" default:"http://127.0.0.1:8080" help:"LocalAI server endpoint. The /v1 path is added automatically when omitted"`
+	APIKey   string `env:"LOCALAI_API_KEY,API_KEY" help:"API key to use when the LocalAI server requires authentication"`
+}
+
+// Run starts the chat session through chatcli.Run, handing it os.Stdin and
+// os.Stdout as the session's input and output. The endpoint flag is first
+// normalized with chatAPIBaseURL. The session runs under
+// context.Background(); the ctx argument is not consulted.
+func (c *ChatCMD) Run(ctx *cliContext.Context) error {
+	opts := chatcli.Options{
+		In:      os.Stdin,
+		Out:     os.Stdout,
+		Model:   c.Model,
+		APIKey:  c.APIKey,
+		BaseURL: chatAPIBaseURL(c.Endpoint),
+	}
+	return chatcli.Run(context.Background(), opts)
+}
--- a/core/cli/chat_cmd_test.go
+++ b/core/cli/chat_cmd_test.go
@@ -0,0 +1,27 @@
+package cli
+
+import (
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for chatAPIBaseURL, the helper that turns the chat endpoint flag into
+// an API base URL ending in /v1.
+var _ = Describe("Chat command wiring", func() {
+	Describe("chatAPIBaseURL", func() {
+		It("adds /v1 to a root endpoint", func() {
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080")).To(Equal("http://127.0.0.1:8080/v1"))
+		})
+
+		It("treats a bare trailing slash as a root endpoint", func() {
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080/")).To(Equal("http://127.0.0.1:8080/v1"))
+		})
+
+		It("keeps endpoints that already include /v1", func() {
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080/v1")).To(Equal("http://127.0.0.1:8080/v1"))
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080/v1/")).To(Equal("http://127.0.0.1:8080/v1"))
+		})
+
+		It("adds a default http scheme", func() {
+			Expect(chatAPIBaseURL("127.0.0.1:8080")).To(Equal("http://127.0.0.1:8080/v1"))
+		})
+
+		It("preserves non-root paths before /v1", func() {
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080/localai")).To(Equal("http://127.0.0.1:8080/localai/v1"))
+		})
+
+		// chatAPIBaseURL clears RawQuery and Fragment; pin that so a pasted
+		// browser URL still yields a clean base URL.
+		It("drops query strings and fragments", func() {
+			Expect(chatAPIBaseURL("http://127.0.0.1:8080/v1?debug=1#top")).To(Equal("http://127.0.0.1:8080/v1"))
+		})
+	})
+})
--- a/core/cli/chat_endpoint.go
+++ b/core/cli/chat_endpoint.go
@@ -0,0 +1,29 @@
+package cli
+
+import (
+	"net/url"
+	"strings"
+)
+
+// chatAPIBaseURL normalizes a user-supplied server endpoint into the API base
+// URL, which always ends in /v1.
+//
+// A missing scheme defaults to http://, trailing slashes on the path are
+// dropped, /v1 is appended unless the path already ends with it, and any query
+// string or fragment is discarded. If the endpoint cannot be parsed as a URL
+// the function falls back to plain string handling: strip trailing slashes
+// and append /v1.
+func chatAPIBaseURL(endpoint string) string {
+	if !strings.Contains(endpoint, "://") {
+		endpoint = "http://" + endpoint
+	}
+
+	parsed, err := url.Parse(endpoint)
+	if err != nil {
+		return strings.TrimRight(endpoint, "/") + "/v1"
+	}
+
+	// "/v1" itself ends with "/v1", so one suffix check covers both the bare
+	// and the prefixed form; an empty trimmed path simply becomes "/v1".
+	trimmed := strings.TrimRight(parsed.Path, "/")
+	if !strings.HasSuffix(trimmed, "/v1") {
+		trimmed += "/v1"
+	}
+	parsed.Path = trimmed
+	parsed.RawQuery = ""
+	parsed.Fragment = ""
+	return parsed.String()
+}
--- a/core/cli/cli.go
+++ b/core/cli/cli.go
@@ -9,6 +9,7 @@ var CLI struct {
 	cliContext.Context `embed:""`

 	Run             RunCMD             `cmd:"" help:"Run LocalAI, this the default command if no other command is specified. Run 'local-ai run --help' for more information" default:"withargs"`
+	Chat            ChatCMD            `cmd:"" help:"Open an interactive chat session against a running LocalAI server"`
 	Federated       FederatedCLI       `cmd:"" help:"Run LocalAI in federated mode"`
 	Models          ModelsCMD          `cmd:"" help:"Manage LocalAI models and definitions"`
 	Backends        BackendsCMD        `cmd:"" help:"Manage LocalAI backends and definitions"`
--- a/core/cli/run.go
+++ b/core/cli/run.go
@@ -30,6 +30,8 @@ type RunCMD struct {
 	ModelArgs []string `arg:"" optional:"" name:"models" help:"Model configuration URLs to load"`

 	ExternalBackends             []string      `env:"LOCALAI_EXTERNAL_BACKENDS,EXTERNAL_BACKENDS" help:"A list of external backends to load from gallery on boot" group:"backends"`
+	WebRTCNAT1To1IPs             []string      `env:"LOCALAI_WEBRTC_NAT_1TO1_IPS,WEBRTC_NAT_1TO1_IPS" help:"IPs advertised as the host ICE candidates for /v1/realtime WebRTC instead of every local interface. Set to the reachable host/LAN IP when running under Docker host networking or NAT, where pion otherwise offers unreachable bridge addresses and the connection drops after ICE consent checks fail." group:"api"`
+	WebRTCICEInterfaces          []string      `env:"LOCALAI_WEBRTC_ICE_INTERFACES,WEBRTC_ICE_INTERFACES" help:"Restrict /v1/realtime WebRTC ICE candidate gathering to these network interfaces (e.g. eth0), filtering out docker0/veth noise." group:"api"`
 	BackendsPath                 string        `env:"LOCALAI_BACKENDS_PATH,BACKENDS_PATH" type:"path" default:"${basepath}/backends" help:"Path containing backends used for inferencing" group:"backends"`
 	BackendsSystemPath           string        `env:"LOCALAI_BACKENDS_SYSTEM_PATH,BACKEND_SYSTEM_PATH" type:"path" default:"/var/lib/local-ai/backends" help:"Path containing system backends used for inferencing" group:"backends"`
 	ModelsPath                   string        `env:"LOCALAI_MODELS_PATH,MODELS_PATH" type:"path" default:"${basepath}/models" help:"Path containing models used for inferencing" group:"storage"`
@@ -225,6 +227,8 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 		config.WithApiKeys(r.APIKeys),
 		config.WithModelsURL(append(r.Models, r.ModelArgs...)...),
 		config.WithExternalBackends(r.ExternalBackends...),
+		config.WithWebRTCNAT1To1IPs(r.WebRTCNAT1To1IPs...),
+		config.WithWebRTCICEInterfaces(r.WebRTCICEInterfaces...),
 		config.WithOpaqueErrors(r.OpaqueErrors),
 		config.WithEnforcedPredownloadScans(!r.DisablePredownloadScan),
 		config.WithSubtleKeyComparison(r.UseSubtleKeyComparison),
@@ -652,12 +656,12 @@ func (r *RunCMD) Run(ctx *cliContext.Context) error {
 // waitForServerReady polls the given address until the HTTP server is
 // accepting connections or the context is cancelled.
 func waitForServerReady(address string, ctx context.Context) {
-	// Ensure the address has a host component for dialing.
-	// Echo accepts ":8080" but net.Dial needs a resolvable host.
 	host, port, err := net.SplitHostPort(address)
 	if err == nil && host == "" {
 		address = "127.0.0.1:" + port
 	}
+	ticker := time.NewTicker(250 * time.Millisecond)
+	defer ticker.Stop()

 	for {
 		select {
@@ -665,11 +669,17 @@ func waitForServerReady(address string, ctx context.Context) {
 			return
 		default:
 		}
+
 		conn, err := net.DialTimeout("tcp", address, 500*time.Millisecond)
 		if err == nil {
 			conn.Close()
 			return
 		}
-		time.Sleep(250 * time.Millisecond)
+
+		select {
+		case <-ctx.Done():
+			return
+		case <-ticker.C:
+		}
 	}
 }
--- a/core/config/application_config.go
+++ b/core/config/application_config.go
@@ -12,10 +12,19 @@ import (
 )

 type ApplicationConfig struct {
-	Context                             context.Context
-	ConfigFile                          string
-	SystemState                         *system.SystemState
-	ExternalBackends                    []string
+	Context          context.Context
+	ConfigFile       string
+	SystemState      *system.SystemState
+	ExternalBackends []string
+
+	// WebRTCNAT1To1IPs, when set, are advertised as the host ICE candidates for
+	// /v1/realtime WebRTC instead of every local interface address. Needed when
+	// the routable address differs from what pion gathers — e.g. Docker host
+	// networking (where pion also offers unreachable bridge IPs) or NAT.
+	WebRTCNAT1To1IPs []string
+	// WebRTCICEInterfaces, when set, restricts ICE candidate gathering to these
+	// network interfaces (e.g. eth0), filtering out docker0/veth noise.
+	WebRTCICEInterfaces                 []string
 	UploadLimitMB, Threads, ContextSize int
 	F16                                 bool
 	Debug                               bool
@@ -81,7 +90,6 @@ type ApplicationConfig struct {
 	// file is mode 0600.
 	MITMCADir string

-
 	// PIIPatternOverrides applies persisted per-id deltas (action,
 	// disabled) to the live redactor at startup. Loaded from
 	// runtime_settings.json and applied right after pii.NewRedactor.
@@ -116,11 +124,11 @@ type ApplicationConfig struct {
 	// --require-backend-integrity / LOCALAI_REQUIRE_BACKEND_INTEGRITY.
 	RequireBackendIntegrity bool

-	SingleBackend           bool // Deprecated: use MaxActiveBackends = 1 instead
-	MaxActiveBackends       int  // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
-	WatchDogIdle bool
-	WatchDogBusy bool
-	WatchDog     bool
+	SingleBackend     bool // Deprecated: use MaxActiveBackends = 1 instead
+	MaxActiveBackends int  // Maximum number of active backends (0 = unlimited, 1 = single backend mode)
+	WatchDogIdle      bool
+	WatchDogBusy      bool
+	WatchDog          bool

 	// Memory Reclaimer settings (works with GPU if available, otherwise RAM)
 	MemoryReclaimerEnabled   bool    // Enable memory threshold monitoring
@@ -311,6 +319,18 @@ func WithExternalBackends(backends ...string) AppOption {
 	}
 }

+// WithWebRTCNAT1To1IPs returns an AppOption that stores ips in
+// ApplicationConfig.WebRTCNAT1To1IPs, replacing any previous value. The slice
+// is kept as passed, not copied.
+func WithWebRTCNAT1To1IPs(ips ...string) AppOption {
+	return func(cfg *ApplicationConfig) {
+		cfg.WebRTCNAT1To1IPs = ips
+	}
+}
+
+// WithWebRTCICEInterfaces returns an AppOption that stores interfaces in
+// ApplicationConfig.WebRTCICEInterfaces, replacing any previous value. The
+// slice is kept as passed, not copied.
+func WithWebRTCICEInterfaces(interfaces ...string) AppOption {
+	return func(cfg *ApplicationConfig) {
+		cfg.WebRTCICEInterfaces = interfaces
+	}
+}
+
 func WithMachineTag(tag string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.MachineTag = tag
@@ -702,7 +722,6 @@ func WithMITMCADir(dir string) AppOption {
 	}
 }

-
 func WithDynamicConfigDir(dynamicConfigsDir string) AppOption {
 	return func(o *ApplicationConfig) {
 		o.DynamicConfigsDir = dynamicConfigsDir
--- a/core/gallery/importers/importers.go
+++ b/core/gallery/importers/importers.go
@@ -158,6 +158,11 @@ var defaultImporters = []Importer{
 	// RFDetrImporter must run before TransformersImporter — RF-DETR
 	// checkpoints may carry tokenizer-adjacent artefacts.
 	&RFDetrImporter{},
+	// LocateAnythingImporter (NVIDIA LocateAnything open-vocab detection,
+	// native C++/ggml port) must run before LlamaCPPImporter so its GGUF
+	// bundles aren't claimed by the generic .gguf importer; kept next to
+	// RFDetrImporter as both are detection models.
+	&LocateAnythingImporter{},
 	// Existing
 	// DS4Importer must precede LlamaCPPImporter - ds4 weights are GGUFs and
 	// would otherwise be claimed by the generic .gguf-handling llama-cpp
--- a/core/gallery/importers/locate-anything.go
+++ b/core/gallery/importers/locate-anything.go
@@ -0,0 +1,137 @@
+package importers
+
+import (
+	"encoding/json"
+	"path/filepath"
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/LocalAI/core/gallery"
+	"github.com/mudler/LocalAI/core/schema"
+	"go.yaml.in/yaml/v2"
+)
+
+var _ Importer = &LocateAnythingImporter{}
+
+// LocateAnythingImporter routes NVIDIA LocateAnything open-vocabulary
+// object-detection / visual-grounding repositories to the
+// "locate-anything-cpp" backend (a native C++/ggml port). It must be
+// registered BEFORE the generic GGUF matchers (LlamaCPPImporter) so its
+// GGUF bundles aren't swallowed by the generic .gguf-handling importer,
+// and alongside RFDetrImporter since both are detection models that may
+// carry tokenizer-adjacent artefacts.
+//
+// Detection signals:
+//   - preferences.backend="locate-anything-cpp" (explicit override);
+//   - repo name contains "locate-anything" or "locateanything"
+//     (case-insensitive).
+type LocateAnythingImporter struct{}
+
+// Name returns the importer identifier, "locate-anything-cpp".
+func (i *LocateAnythingImporter) Name() string      { return "locate-anything-cpp" }
+// Modality returns "detection", the model category this importer reports.
+func (i *LocateAnythingImporter) Modality() string  { return "detection" }
+// AutoDetects always returns true — presumably opting this importer into
+// automatic matching; confirm against the Importer interface.
+func (i *LocateAnythingImporter) AutoDetects() bool { return true }
+
+// repoLooksLikeLocateAnything reports whether repo, compared
+// case-insensitively, contains "locate-anything" or "locateanything".
+// Variants such as "locate-anything.cpp" and "locate-anything-cpp" need no
+// separate check because they already contain "locate-anything".
+func repoLooksLikeLocateAnything(repo string) bool {
+	normalized := strings.ToLower(repo)
+	for _, marker := range []string{"locate-anything", "locateanything"} {
+		if strings.Contains(normalized, marker) {
+			return true
+		}
+	}
+	return false
+}
+
+// Match reports whether details should be handled by this importer.
+//
+// It returns true when preferences.backend is exactly "locate-anything-cpp",
+// or when the repository name looks like a LocateAnything repo. The name is
+// taken from the HuggingFace model ID (the part after the first "/") and,
+// if that does not match or is unavailable, from the URI via
+// HFOwnerRepoFromURI. Preferences that cannot be marshalled or decoded as a
+// JSON object make it return false.
+func (i *LocateAnythingImporter) Match(details Details) bool {
+	raw, err := details.Preferences.MarshalJSON()
+	if err != nil {
+		return false
+	}
+	prefs := make(map[string]any)
+	if len(raw) > 0 && json.Unmarshal(raw, &prefs) != nil {
+		return false
+	}
+
+	if backend, isString := prefs["backend"].(string); isString && backend == "locate-anything-cpp" {
+		return true
+	}
+
+	if hf := details.HuggingFace; hf != nil {
+		// Drop the "owner/" prefix so only the repo name is inspected.
+		name := hf.ModelID
+		if slash := strings.Index(name, "/"); slash >= 0 {
+			name = name[slash+1:]
+		}
+		if repoLooksLikeLocateAnything(name) {
+			return true
+		}
+	}
+
+	// Fallback: hfapi recursion bug may leave HuggingFace nil — decide
+	// from the URI owner/repo.
+	_, repo, ok := HFOwnerRepoFromURI(details.URI)
+	return ok && repoLooksLikeLocateAnything(repo)
+}
+
+// Import builds the gallery model configuration for a LocateAnything repo.
+//
+// preferences.name, preferences.description and preferences.backend override
+// the defaults, which are the last path element of the URI, "Imported from
+// <URI>" and "locate-anything-cpp" respectively. A blank name or backend
+// preference is treated as unset. The model field is the HuggingFace
+// "owner/repo" ID when available, otherwise the owner/repo parsed from the
+// URI, otherwise the URI itself.
+//
+// It returns an error when the preferences cannot be decoded or the model
+// config cannot be marshalled to YAML.
+func (i *LocateAnythingImporter) Import(details Details) (gallery.ModelConfig, error) {
+	preferences, err := details.Preferences.MarshalJSON()
+	if err != nil {
+		return gallery.ModelConfig{}, err
+	}
+	preferencesMap := make(map[string]any)
+	if len(preferences) > 0 {
+		if err := json.Unmarshal(preferences, &preferencesMap); err != nil {
+			return gallery.ModelConfig{}, err
+		}
+	}
+
+	// An empty name would produce a nameless model config, so a blank
+	// preference falls back to the URI-derived default just like a missing
+	// one (mirrors the b != "" guard on the backend preference below).
+	name, ok := preferencesMap["name"].(string)
+	if !ok || name == "" {
+		name = filepath.Base(details.URI)
+	}
+
+	description, ok := preferencesMap["description"].(string)
+	if !ok {
+		description = "Imported from " + details.URI
+	}
+
+	// Prefer the canonical HF "owner/repo" identifier so the emitted
+	// YAML mirrors gallery locate-anything entries.
+	model := details.URI
+	if details.HuggingFace != nil && details.HuggingFace.ModelID != "" {
+		model = details.HuggingFace.ModelID
+	} else if owner, repo, ok := HFOwnerRepoFromURI(details.URI); ok {
+		model = owner + "/" + repo
+	}
+
+	// Always the native C++/ggml backend; explicit preferences.backend
+	// overrides the default.
+	backend := "locate-anything-cpp"
+	if b, ok := preferencesMap["backend"].(string); ok && b != "" {
+		backend = b
+	}
+
+	modelConfig := config.ModelConfig{
+		Name:                name,
+		Description:         description,
+		Backend:             backend,
+		KnownUsecaseStrings: []string{"detection"},
+		PredictionOptions: schema.PredictionOptions{
+			BasicModelRequest: schema.BasicModelRequest{Model: model},
+		},
+	}
+
+	data, err := yaml.Marshal(modelConfig)
+	if err != nil {
+		return gallery.ModelConfig{}, err
+	}
+
+	return gallery.ModelConfig{
+		Name:        name,
+		Description: description,
+		ConfigFile:  string(data),
+	}, nil
+}
--- a/core/gallery/importers/locate-anything_test.go
+++ b/core/gallery/importers/locate-anything_test.go
@@ -0,0 +1,218 @@
+package importers_test
+
+import (
+	"encoding/json"
+	"fmt"
+
+	"github.com/mudler/LocalAI/core/gallery/importers"
+	hfapi "github.com/mudler/LocalAI/pkg/huggingface-api"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for LocateAnythingImporter: interface metadata, Match detection
+// signals, Import output, and backend routing. All cases build Details by
+// hand, so nothing here touches the network.
+var _ = Describe("LocateAnythingImporter", func() {
+	Context("Importer interface metadata", func() {
+		It("exposes name/modality/autodetect", func() {
+			imp := &importers.LocateAnythingImporter{}
+			Expect(imp.Name()).To(Equal("locate-anything-cpp"))
+			Expect(imp.Modality()).To(Equal("detection"))
+			Expect(imp.AutoDetects()).To(BeTrue())
+		})
+	})
+
+	Context("Match", func() {
+		It("matches when backend preference is locate-anything-cpp", func() {
+			imp := &importers.LocateAnythingImporter{}
+			preferences := json.RawMessage(`{"backend": "locate-anything-cpp"}`)
+			details := importers.Details{
+				URI:         "https://example.com/some-model",
+				Preferences: preferences,
+			}
+
+			Expect(imp.Match(details)).To(BeTrue())
+		})
+
+		It("matches when the repo name contains 'locate-anything' (case-insensitive)", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/mudler/locate-anything-cpp-3b",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "mudler/Locate-Anything-CPP-3B",
+					Author:  "mudler",
+				},
+			}
+
+			Expect(imp.Match(details)).To(BeTrue())
+		})
+
+		It("matches when the repo name contains 'locateanything' (case-insensitive)", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "nvidia/LocateAnything-3B",
+					Author:  "nvidia",
+				},
+			}
+
+			Expect(imp.Match(details)).To(BeTrue())
+		})
+
+		It("matches via URI fallback when HuggingFace details are missing", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
+			}
+
+			Expect(imp.Match(details)).To(BeTrue())
+		})
+
+		It("does not match unrelated repos without locate-anything signals", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/meta-llama/Llama-3-8B",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "meta-llama/Llama-3-8B",
+					Author:  "meta-llama",
+				},
+			}
+
+			Expect(imp.Match(details)).To(BeFalse())
+		})
+
+		It("does not match an rfdetr repo", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/mudler/rfdetr-cpp-nano",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "mudler/rfdetr-cpp-nano",
+					Author:  "mudler",
+				},
+			}
+
+			Expect(imp.Match(details)).To(BeFalse())
+		})
+
+		It("returns false for invalid preferences JSON", func() {
+			imp := &importers.LocateAnythingImporter{}
+			preferences := json.RawMessage(`not valid json`)
+			details := importers.Details{
+				URI:         "https://example.com/model",
+				Preferences: preferences,
+			}
+
+			Expect(imp.Match(details)).To(BeFalse())
+		})
+	})
+
+	Context("Import", func() {
+		It("produces a YAML with backend locate-anything-cpp and the repo as the model", func() {
+			imp := &importers.LocateAnythingImporter{}
+			details := importers.Details{
+				URI: "https://huggingface.co/nvidia/LocateAnything-3B",
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "nvidia/LocateAnything-3B",
+					Author:  "nvidia",
+				},
+			}
+
+			modelConfig, err := imp.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("backend: locate-anything-cpp"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("nvidia/LocateAnything-3B"), fmt.Sprintf("Model config: %+v", modelConfig))
+			Expect(modelConfig.ConfigFile).To(ContainSubstring("detection"), fmt.Sprintf("Model config: %+v", modelConfig))
+		})
+
+		It("respects custom name and description from preferences", func() {
+			imp := &importers.LocateAnythingImporter{}
+			preferences := json.RawMessage(`{"name": "my-locate", "description": "Custom"}`)
+			details := importers.Details{
+				URI:         "https://huggingface.co/nvidia/LocateAnything-3B",
+				Preferences: preferences,
+				HuggingFace: &hfapi.ModelDetails{
+					ModelID: "nvidia/LocateAnything-3B",
+					Author:  "nvidia",
+				},
+			}
+
+			modelConfig, err := imp.Import(details)
+
+			Expect(err).ToNot(HaveOccurred())
+			Expect(modelConfig.Name).To(Equal("my-locate"))
+			Expect(modelConfig.Description).To(Equal("Custom"))
+		})
+	})
+
+	// Table-driven coverage of the backend routing: locate-anything repos
+	// route to the native locate-anything-cpp backend by default, with an
+	// explicit preferences.backend override honoured.
+	//
+	// Cases are kept offline-deterministic by injecting Details directly
+	// rather than going through DiscoverModelConfig (which would hit live HF).
+	Context("backend routing (offline)", func() {
+		hfFile := func(path string) hfapi.ModelFile {
+			return hfapi.ModelFile{Path: path}
+		}
+
+		type tc struct {
+			name          string
+			uri           string
+			modelID       string
+			files         []hfapi.ModelFile
+			prefs         string
+			expectBackend string // expected `backend:` line content
+		}
+
+		entries := []tc{
+			{
+				name:          "canonical NVIDIA repo routes to locate-anything-cpp",
+				uri:           "https://huggingface.co/nvidia/LocateAnything-3B",
+				modelID:       "nvidia/LocateAnything-3B",
+				files:         []hfapi.ModelFile{hfFile("locate-anything-3b-q8_0.gguf"), hfFile("README.md")},
+				prefs:         "",
+				expectBackend: "backend: locate-anything-cpp",
+			},
+			{
+				name:          "GGUF bundle with locate-anything name routes to locate-anything-cpp",
+				uri:           "https://huggingface.co/mudler/locate-anything.cpp-3b",
+				modelID:       "mudler/locate-anything.cpp-3b",
+				files:         []hfapi.ModelFile{hfFile("model-f16.gguf")},
+				prefs:         "",
+				expectBackend: "backend: locate-anything-cpp",
+			},
+			{
+				// The override uses a backend name that differs from the
+				// default; otherwise this case could not tell an honoured
+				// override apart from the default. Match still fires through
+				// the repo name.
+				name:          "explicit preferences.backend override is honoured",
+				uri:           "https://huggingface.co/nvidia/LocateAnything-3B",
+				modelID:       "nvidia/LocateAnything-3B",
+				files:         nil,
+				prefs:         `{"backend": "custom-detector"}`,
+				expectBackend: "backend: custom-detector",
+			},
+		}
+
+		for _, e := range entries {
+			e := e // capture for closure
+			It(e.name, func() {
+				imp := &importers.LocateAnythingImporter{}
+				details := importers.Details{
+					URI: e.uri,
+					HuggingFace: &hfapi.ModelDetails{
+						ModelID: e.modelID,
+						Files:   e.files,
+					},
+				}
+				if e.prefs != "" {
+					details.Preferences = json.RawMessage(e.prefs)
+				}
+
+				Expect(imp.Match(details)).To(BeTrue(), fmt.Sprintf("Match should fire for %+v", details))
+
+				modelConfig, err := imp.Import(details)
+				Expect(err).ToNot(HaveOccurred(), fmt.Sprintf("Import error: %v", err))
+				Expect(modelConfig.ConfigFile).To(ContainSubstring(e.expectBackend),
+					fmt.Sprintf("Model config: %+v", modelConfig))
+			})
+		}
+	})
+})
--- a/core/http/endpoints/openai/chat.go
+++ b/core/http/endpoints/openai/chat.go
@@ -103,7 +103,12 @@ func applyAutoparserOverride(
 	// blocks like "<think></think>" that some models emit when reasoning
 	// is disabled.
 	if deltaReasoning == "" && deltaContent != "" {
-		deltaReasoning, deltaContent = reason.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, reasoningConfig)
+		// Complete-response extraction: only honor a prefilled <think> start
+		// token when deltaContent actually closes the reasoning block. Without
+		// it the model answered directly and the whole answer must stay in
+		// content rather than be swallowed as unclosed reasoning. See
+		// reason.ExtractReasoningComplete.
+		deltaReasoning, deltaContent = reason.ExtractReasoningComplete(deltaContent, thinkingStartToken, reasoningConfig)
 	}
 	xlog.Debug("[ChatDeltas] non-SSE no-tools: overriding result with C++ autoparser deltas",
 		"content_len", len(deltaContent), "reasoning_len", len(deltaReasoning))
--- a/core/http/endpoints/openai/chat_test.go
+++ b/core/http/endpoints/openai/chat_test.go
@@ -186,6 +186,114 @@ var _ = Describe("applyAutoparserOverride", func() {
 			Expect(result).To(Equal(existing))
 		})
 	})
+
+	// Regression tests for the prefilled-thinking-token path (thinkingStartToken
+	// != ""). This is the configuration the gallery qwen3 family runs in: the
+	// chat template injects <think> into the prompt, so DetectThinkingStartToken
+	// returns "<think>" and the model's output begins *inside* a reasoning block
+	// — it emits a closing </think> but no opening tag.
+	//
+	// The defensive Go-side fallback prepends the start token so the standard
+	// extractor can pair it with the model's </think>. But on a *complete*
+	// response that contains NO closing tag (the model answered directly with no
+	// reasoning at all), prepending <think> manufactures an unclosed block that
+	// swallows the entire answer into reasoning, leaving content empty. That is
+	// the bug: short/direct answers (session names, JSON summaries) come back
+	// with an empty content field.
+	Context("autoparser delivered content with empty reasoning and a prefilled thinking token", func() {
+		const startToken = "<think>"
+
+		It("keeps a tag-less direct answer as content instead of swallowing it as reasoning", func() {
+			// Model answered directly: no <think>, no </think> anywhere.
+			chatDeltas := []*pb.ChatDelta{
+				{Content: "hello", ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(result[0].Message.Content).ToNot(BeNil())
+			Expect(*(result[0].Message.Content.(*string))).To(Equal("hello"),
+				"a complete answer with no closing reasoning tag must stay in content")
+			Expect(result[0].Message.Reasoning).To(BeNil(),
+				"no reasoning block was emitted, so Reasoning must not be set")
+		})
+
+		It("keeps a tag-less JSON answer as content (the summary case)", func() {
+			raw := `{"short":"Tests pass","long":"go test ./... succeeded."}`
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(raw))
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
+
+		It("still splits reasoning when the model emits the closing tag (prefill paired with </think>)", func() {
+			// The legitimate prefill case: <think> was in the prompt, so the
+			// output carries only the closing tag. The closing tag is the proof
+			// that a reasoning block exists, so extraction must run.
+			raw := "The user wants a greeting.\n</think>\n\nHello there!"
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			content := *(result[0].Message.Content.(*string))
+			Expect(content).To(ContainSubstring("Hello there!"))
+			Expect(content).ToNot(ContainSubstring("</think>"))
+			Expect(content).ToNot(ContainSubstring("The user wants a greeting"))
+			Expect(result[0].Message.Reasoning).ToNot(BeNil())
+			Expect(*result[0].Message.Reasoning).To(ContainSubstring("The user wants a greeting"))
+		})
+
+		It("still splits a fully-tagged <think>…</think> block with a prefill token set", func() {
+			raw := "<think>Reasoning here.</think>Final answer."
+			chatDeltas := []*pb.ChatDelta{
+				{Content: raw, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal("Final answer."))
+			Expect(result[0].Message.Reasoning).ToNot(BeNil())
+			Expect(*result[0].Message.Reasoning).To(ContainSubstring("Reasoning here"))
+		})
+
+		// End-to-end regression for the real production failure: a request with
+		// enable_thinking=false against a <think>-capable model (qwen3 family).
+		//
+		// In non-thinking mode the model emits no reasoning block, so llama.cpp's
+		// autoparser correctly returns ChatDeltas with Content set and
+		// ReasoningContent EMPTY (verified against stock llama-server: the same
+		// model with chat_template_kwargs.enable_thinking=false returns
+		// reasoning_content=null and content="hello"). But thinkingStartToken is
+		// detected per-model from the enable_thinking=TRUE render
+		// (grpc-server renders with enable_thinking=true; DetectThinkingStartToken
+		// does not evaluate the jinja {% if enable_thinking %} conditional), so it
+		// is "<think>" even for this non-thinking request. The old code prepended
+		// it and swallowed the answer. This is the case that broke session
+		// summaries and auto-titles and was NOT covered before.
+		It("preserves content for a non-thinking-mode request (enable_thinking=false, empty reasoning_content)", func() {
+			// What llama.cpp's autoparser actually returns in non-thinking mode.
+			chatDeltas := []*pb.ChatDelta{
+				{Content: `{"short":"Go tests passed for internal/session"}`, ReasoningContent: ""},
+			}
+
+			result := applyAutoparserOverride(chatDeltas, startToken, reason.Config{}, nil)
+
+			Expect(result).To(HaveLen(1))
+			Expect(*(result[0].Message.Content.(*string))).To(Equal(`{"short":"Go tests passed for internal/session"}`),
+				"non-thinking-mode answers must reach the client intact, not be swallowed as reasoning")
+			Expect(result[0].Message.Reasoning).To(BeNil())
+		})
+	})
 })

 var _ = Describe("mergeToolCallDeltas", func() {
--- a/core/http/endpoints/openai/realtime.go
+++ b/core/http/endpoints/openai/realtime.go
@@ -1579,7 +1579,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		// ExtractReasoningWithConfig is a no-op when no tag pair matches,
 		// so it's safe to apply unconditionally in the no-reasoning branch.
 		if deltaReasoning == "" && deltaContent != "" {
-			deltaReasoning, deltaContent = reasoning.ExtractReasoningWithConfig(deltaContent, thinkingStartToken, config.ReasoningConfig)
+			deltaReasoning, deltaContent = reasoning.ExtractReasoningComplete(deltaContent, thinkingStartToken, config.ReasoningConfig)
 		}
 		reasoningText = deltaReasoning
 		responseWithoutReasoning = deltaContent
@@ -1587,7 +1587,7 @@ func triggerResponseAtTurn(ctx context.Context, session *Session, conv *Conversa
 		cleanedResponse = deltaContent
 		toolCalls = deltaToolCalls
 	} else {
-		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningWithConfig(rawResponse, thinkingStartToken, config.ReasoningConfig)
+		reasoningText, responseWithoutReasoning = reasoning.ExtractReasoningComplete(rawResponse, thinkingStartToken, config.ReasoningConfig)
 		textContent = functions.ParseTextContent(responseWithoutReasoning, config.FunctionsConfig)
 		cleanedResponse = functions.CleanupLLMResult(responseWithoutReasoning, config.FunctionsConfig)
 		toolCalls = functions.ParseFunctionCall(cleanedResponse, config.FunctionsConfig)
--- a/core/http/endpoints/openai/realtime_webrtc.go
+++ b/core/http/endpoints/openai/realtime_webrtc.go
@@ -48,7 +48,8 @@ func RealtimeCalls(application *application.Application) echo.HandlerFunc {
 			return c.JSON(http.StatusInternalServerError, map[string]string{"error": "codec registration failed"})
 		}

-		api := webrtc.NewAPI(webrtc.WithMediaEngine(m))
+		se := webRTCSettingEngine(application.ApplicationConfig())
+		api := webrtc.NewAPI(webrtc.WithMediaEngine(m), webrtc.WithSettingEngine(se))

 		pc, err := api.NewPeerConnection(webrtc.Configuration{})
 		if err != nil {
--- a/core/http/endpoints/openai/realtime_webrtc_ice.go
+++ b/core/http/endpoints/openai/realtime_webrtc_ice.go
@@ -0,0 +1,47 @@
+package openai
+
+import (
+	"strings"
+
+	"github.com/mudler/LocalAI/core/config"
+	"github.com/mudler/xlog"
+	"github.com/pion/webrtc/v4"
+)
+
+// webRTCSettingEngine builds the pion SettingEngine for /v1/realtime WebRTC.
+//
+// With a default (empty) SettingEngine, pion gathers a host ICE candidate for
+// every local interface. Under Docker host networking that includes bridge
+// addresses (docker0/veth, 172.x) that a remote browser cannot route to; the
+// connection often establishes on a good pair and then drops once ICE consent
+// checks fail on the unreachable ones. The two opt-in knobs below let an
+// operator advertise only the reachable address.
+func webRTCSettingEngine(cfg *config.ApplicationConfig) webrtc.SettingEngine {
+	s := webrtc.SettingEngine{}
+	if cfg == nil {
+		return s
+	}
+	if len(cfg.WebRTCNAT1To1IPs) > 0 {
+		s.SetNAT1To1IPs(cfg.WebRTCNAT1To1IPs, webrtc.ICECandidateTypeHost)
+		xlog.Debug("realtime webrtc: advertising NAT 1:1 host IPs", "ips", cfg.WebRTCNAT1To1IPs)
+	}
+	if filter := iceInterfaceFilter(cfg.WebRTCICEInterfaces); filter != nil {
+		s.SetInterfaceFilter(filter)
+		xlog.Debug("realtime webrtc: restricting ICE interfaces", "interfaces", cfg.WebRTCICEInterfaces)
+	}
+	return s
+}
+
+// iceInterfaceFilter returns an interface allow-list predicate for pion, or nil
+// when no interfaces are configured (pion's default: gather from all).
+//
+// Names are trimmed of surrounding whitespace and blank entries are skipped,
+// so a comma-separated setting such as "eth0, wlan0" matches both interfaces.
+// If only blank entries were supplied the list counts as unconfigured and nil
+// is returned, rather than a filter that rejects every interface.
+func iceInterfaceFilter(allowed []string) func(string) bool {
+	set := make(map[string]struct{}, len(allowed))
+	for _, name := range allowed {
+		if name = strings.TrimSpace(name); name != "" {
+			set[name] = struct{}{}
+		}
+	}
+	if len(set) == 0 {
+		return nil
+	}
+	return func(iface string) bool {
+		_, ok := set[iface]
+		return ok
+	}
+}
--- a/core/http/endpoints/openai/realtime_webrtc_ice_test.go
+++ b/core/http/endpoints/openai/realtime_webrtc_ice_test.go
@@ -0,0 +1,39 @@
+package openai
+
+import (
+	"github.com/mudler/LocalAI/core/config"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+)
+
+// Specs for the WebRTC ICE helpers: iceInterfaceFilter's allow-list behaviour
+// and webRTCSettingEngine's handling of nil and populated configs.
+var _ = Describe("webRTC ICE settings", func() {
+	Describe("iceInterfaceFilter", func() {
+		It("returns nil when no interfaces are configured", func() {
+			Expect(iceInterfaceFilter(nil)).To(BeNil())
+			Expect(iceInterfaceFilter([]string{})).To(BeNil())
+		})
+
+		It("admits only the configured interfaces", func() {
+			f := iceInterfaceFilter([]string{"eth0", "wlan0"})
+			Expect(f).NotTo(BeNil())
+			Expect(f("eth0")).To(BeTrue())
+			Expect(f("wlan0")).To(BeTrue())
+			Expect(f("docker0")).To(BeFalse())
+			Expect(f("veth123")).To(BeFalse())
+		})
+	})
+
+	// These specs only assert that building the engine does not panic; the
+	// returned SettingEngine itself is not inspected.
+	Describe("webRTCSettingEngine", func() {
+		It("does not panic on a nil config", func() {
+			Expect(func() { webRTCSettingEngine(nil) }).NotTo(Panic())
+		})
+
+		It("builds an engine with NAT 1:1 IPs and an interface filter configured", func() {
+			cfg := &config.ApplicationConfig{
+				WebRTCNAT1To1IPs:    []string{"192.168.1.10"},
+				WebRTCICEInterfaces: []string{"eth0"},
+			}
+			Expect(func() { webRTCSettingEngine(cfg) }).NotTo(Panic())
+		})
+	})
+})
--- a/core/http/endpoints/openresponses/responses.go
+++ b/core/http/endpoints/openresponses/responses.go
@@ -1356,7 +1356,7 @@ func handleOpenResponsesNonStream(c echo.Context, responseID string, createdAt i
 	thinkingStartToken := reason.DetectThinkingStartToken(template, &cfg.ReasoningConfig)

 	// Extract reasoning from result before cleaning
-	reasoningContent, cleanedResult := reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+	reasoningContent, cleanedResult := reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)

 	// Parse tool calls if using functions
 	var outputItems []schema.ORItemField
@@ -1996,7 +1996,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 				finalCleanedResult = extractor.CleanedContent()
 			}
 			if finalReasoning == "" && finalCleanedResult == "" {
-				finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+				finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
 			}

 			// Close reasoning item if it exists and wasn't closed yet
@@ -2493,7 +2493,7 @@ func handleOpenResponsesStream(c echo.Context, responseID string, createdAt int6
 		finalCleanedResult = extractor.CleanedContent()
 	}
 	if finalReasoning == "" && finalCleanedResult == "" {
-		finalReasoning, finalCleanedResult = reason.ExtractReasoningWithConfig(result, thinkingStartToken, cfg.ReasoningConfig)
+		finalReasoning, finalCleanedResult = reason.ExtractReasoningComplete(result, thinkingStartToken, cfg.ReasoningConfig)
 	}

 	// Close reasoning item if it exists and wasn't closed yet
--- a/core/http/react-ui/src/hooks/useChat.js
+++ b/core/http/react-ui/src/hooks/useChat.js
@@ -216,6 +216,12 @@ export function useChat(initialModel = '') {
            audio_url: { url: `data:${file.type};base64,${file.base64}` },
          })
          userFiles.push({ name: file.name, type: 'audio' })
+        } else if (file.type?.startsWith('video/')) {
+          messageContent.push({
+            type: 'video_url',
+            video_url: { url: `data:${file.type};base64,${file.base64}` },
+          })
+          userFiles.push({ name: file.name, type: 'video' })
        } else {
 			// Text/PDF files - append to content
 			if (file.textContent) {
--- a/core/http/react-ui/src/pages/Chat.jsx
+++ b/core/http/react-ui/src/pages/Chat.jsx
@@ -265,7 +265,7 @@ function UserMessageContent({ content, files }) {
        <div className="chat-message-files">
          {files.map((f, i) => (
            <span key={i} className="chat-file-inline">
-              <i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : 'fa-file'}`} />
+              <i className={`fas ${f.type === 'image' ? 'fa-image' : f.type === 'audio' ? 'fa-headphones' : f.type === 'video' ? 'fa-film' : 'fa-file'}`} />
              {f.name}
            </span>
          ))}
@@ -274,6 +274,9 @@ function UserMessageContent({ content, files }) {
      {Array.isArray(content) && content.filter(c => c.type === 'image_url').map((img, i) => (
        <img key={i} src={img.image_url.url} alt="attached" className="chat-inline-image" />
      ))}
+      {Array.isArray(content) && content.filter(c => c.type === 'video_url').map((vid, i) => (
+        <video key={i} src={vid.video_url.url} controls className="chat-inline-video" />
+      ))}
    </>
  )
 }
@@ -711,7 +714,7 @@ export default function Chat() {
    for (const file of e.target.files) {
      const base64 = await fileToBase64(file)
      const entry = { name: file.name, type: file.type, base64 }
-      if (!file.type.startsWith('image/') && !file.type.startsWith('audio/')) {
+      if (!file.type.startsWith('image/') && !file.type.startsWith('audio/') && !file.type.startsWith('video/')) {
        entry.textContent = await file.text().catch(() => '')
      }
      newFiles.push(entry)
@@ -1244,7 +1247,7 @@ export default function Chat() {
          <div className="chat-files">
            {files.map((f, i) => (
              <span key={i} className="chat-file-badge">
-                <i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : 'fa-file'}`} />
+                <i className={`fas ${f.type?.startsWith('image/') ? 'fa-image' : f.type?.startsWith('audio/') ? 'fa-headphones' : f.type?.startsWith('video/') ? 'fa-film' : 'fa-file'}`} />
                {f.name}
                <button onClick={() => setFiles(prev => prev.filter((_, idx) => idx !== i))}>
                  <i className="fas fa-xmark" />
@@ -1343,7 +1346,7 @@ export default function Chat() {
              ref={fileInputRef}
              type="file"
              multiple
-              accept="image/*,audio/*,application/pdf,.txt,.md,.csv,.json"
+              accept="image/*,audio/*,video/*,application/pdf,.txt,.md,.csv,.json"
              style={{ display: 'none' }}
              onChange={handleFileChange}
            />
--- a/core/services/agentpool/agent_pool.go
+++ b/core/services/agentpool/agent_pool.go
@@ -466,10 +466,11 @@ func (s *AgentPoolService) Chat(name, message string) (string, error) {
 				s.collectAndCopyMetadata(metadata, chatUserID)
 			}

+			content := s.appendLocalAGIKBCitations(response.Response, name, message, response.State)
 			msg := map[string]any{
 				"id":        messageID + "-agent",
 				"sender":    "agent",
-				"content":   response.Response,
+				"content":   content,
 				"timestamp": time.Now().Format(time.RFC3339),
 			}
 			if len(metadata) > 0 {
@@ -489,6 +490,79 @@ func (s *AgentPoolService) Chat(name, message string) (string, error) {
 	return messageID, nil
 }

+// appendLocalAGIKBCitations returns response with a knowledge-base "Sources"
+// block appended (via agents.AppendKBCitations) for the agent stored under
+// agentKey. Blank responses, unknown agents and agents without the knowledge
+// base enabled are returned untouched. Citations are taken from the action
+// states' metadata; when none are found and the agent has KBAutoSearch set,
+// the KB search is run for message and its citations are used instead.
+func (s *AgentPoolService) appendLocalAGIKBCitations(response, agentKey, message string, states []coreTypes.ActionState) string {
+	if strings.TrimSpace(response) == "" {
+		return response
+	}
+
+	agentCfg := s.localAGI.pool.GetConfig(agentKey)
+	if agentCfg == nil || !agentCfg.EnableKnowledgeBase {
+		return response
+	}
+	owner, kbName := splitAgentKey(agentKey)
+
+	found := kbCitationsFromActionStates(states)
+	if len(found) == 0 && agentCfg.KBAutoSearch {
+		// Fall back to 5 results when the agent does not configure a positive limit.
+		limit := 5
+		if agentCfg.KnowledgeBaseResults > 0 {
+			limit = agentCfg.KnowledgeBaseResults
+		}
+		// Detached context: the lookup gets its own 30s budget.
+		searchCtx, stop := context.WithTimeout(context.Background(), 30*time.Second)
+		defer stop()
+		found = agents.KBAutoSearchPrompt(searchCtx, s.apiURL, s.apiKey, kbName, message, limit, owner).Citations
+	}
+
+	return agents.AppendKBCitations(response, kbName, owner, found)
+}
+
+// splitAgentKey splits a "userID:name" agent key at the first ':'. A key
+// without a ':' is treated as a bare name with an empty user ID.
+func splitAgentKey(agentKey string) (userID, name string) {
+	sep := strings.IndexByte(agentKey, ':')
+	if sep < 0 {
+		return "", agentKey
+	}
+	return agentKey[:sep], agentKey[sep+1:]
+}
+
+// kbCitationsFromActionStates gathers, in order, the citations that can be
+// derived from each action state's Metadata. The result is nil when no state
+// yields a citation.
+func kbCitationsFromActionStates(states []coreTypes.ActionState) []agents.KBCitation {
+	var collected []agents.KBCitation
+	for i := range states {
+		if found := kbCitationsFromMetadata(states[i].Metadata); len(found) > 0 {
+			collected = append(collected, found...)
+		}
+	}
+	return collected
+}
+
+// kbCitationsFromMetadata builds at most one citation from an action's
+// metadata, reading the "file_name" and "source" keys. It returns nil when
+// neither key yields a non-empty string (missing keys, nil values and
+// unsupported value types all read as empty).
+func kbCitationsFromMetadata(metadata map[string]any) []agents.KBCitation {
+	// Lookups on a nil or empty map yield nil, which metadataString maps to "".
+	name := metadataString(metadata["file_name"])
+	key := metadataString(metadata["source"])
+	if name == "" && key == "" {
+		return nil
+	}
+	return []agents.KBCitation{{FileName: name, EntryKey: key}}
+}
+
+// metadataString renders a metadata value as text: strings are returned as-is,
+// fmt.Stringer values via String(), and anything else (including nil) as "".
+func metadataString(value any) string {
+	if text, isString := value.(string); isString {
+		return text
+	}
+	if stringer, ok := value.(fmt.Stringer); ok {
+		return stringer.String()
+	}
+	return ""
+}
+
 // userOutputsDir returns the per-user outputs directory, creating it if needed.
 // If userID is empty, falls back to the shared outputs directory.
 func (s *AgentPoolService) userOutputsDir(userID string) string {
--- a/core/services/agents/citations.go
+++ b/core/services/agents/citations.go
@@ -0,0 +1,127 @@
+package agents
+
+import (
+	"fmt"
+	"net/url"
+	"strings"
+	"sync"
+)
+
+// kbCitationList is a mutex-guarded accumulator of KB citations. Its
+// AddKBCitations method has the shape required by KBCitationCollector.
+type kbCitationList struct {
+	mu        sync.Mutex
+	citations []KBCitation
+}
+
+// AddKBCitations appends citations to the list under the lock. Empty input is
+// ignored without taking the lock.
+func (l *kbCitationList) AddKBCitations(citations []KBCitation) {
+	if len(citations) > 0 {
+		l.mu.Lock()
+		l.citations = append(l.citations, citations...)
+		l.mu.Unlock()
+	}
+}
+
+// Citations returns a copy of the collected citations, so callers can keep or
+// modify the result without affecting the list. The result is never nil.
+func (l *kbCitationList) Citations() []KBCitation {
+	l.mu.Lock()
+	defer l.mu.Unlock()
+	snapshot := make([]KBCitation, 0, len(l.citations))
+	return append(snapshot, l.citations...)
+}
+
+// AppendKBCitations appends a markdown "Sources:" block listing the given KB
+// citations to response.
+//
+// The response is returned unchanged when it is blank, when there are no
+// citations, or when no citation yields a usable entry. Citations are
+// de-duplicated by trimmed EntryKey (falling back to trimmed FileName) and
+// numbered from 1 in first-seen order. An entry is rendered as a markdown link
+// to the collection's raw-file URL when one can be built from the EntryKey,
+// and as plain text otherwise. Trailing newlines of response are dropped
+// before the block is added.
+func AppendKBCitations(response, collection, userID string, citations []KBCitation) string {
+	if len(citations) == 0 || strings.TrimSpace(response) == "" {
+		return response
+	}
+
+	entries := make([]string, 0, len(citations))
+	listed := make(map[string]bool, len(citations))
+	for i := range citations {
+		c := citations[i]
+
+		dedupKey := strings.TrimSpace(c.EntryKey)
+		if dedupKey == "" {
+			dedupKey = strings.TrimSpace(c.FileName)
+		}
+		if dedupKey == "" || listed[dedupKey] {
+			continue
+		}
+		// Marked as seen even if no display name can be derived below.
+		listed[dedupKey] = true
+
+		label := kbCitationDisplayName(c)
+		if label == "" {
+			continue
+		}
+
+		var entry string
+		if link := kbCitationRawFileURL(collection, c.EntryKey, userID); link == "" {
+			entry = fmt.Sprintf("[%d] %s", len(entries)+1, label)
+		} else {
+			entry = fmt.Sprintf("[%d] [%s](%s)", len(entries)+1, escapeMarkdownLinkText(label), link)
+		}
+		entries = append(entries, entry)
+	}
+
+	if len(entries) == 0 {
+		return response
+	}
+	return strings.TrimRight(response, "\n") + "\n\nSources:\n" + strings.Join(entries, "\n")
+}
+
+// kbCitationDisplayName picks the text shown for a citation: the trimmed
+// FileName when present, otherwise the last non-blank "/"-separated segment of
+// the EntryKey. It returns "" when neither provides a name.
+func kbCitationDisplayName(citation KBCitation) string {
+	if name := strings.TrimSpace(citation.FileName); name != "" {
+		return name
+	}
+
+	key := strings.Trim(strings.TrimSpace(citation.EntryKey), "/")
+	last := ""
+	for _, part := range strings.Split(key, "/") {
+		if trimmed := strings.TrimSpace(part); trimmed != "" {
+			last = trimmed
+		}
+	}
+	return last
+}
+
+// kbCitationRawFileURL builds the relative URL of a collection entry's raw
+// file: /api/agents/collections/<collection>/entries-raw/<entry key>, with the
+// collection and every entry-key segment path-escaped and empty segments
+// dropped. A non-empty userID is added as the user_id query parameter. It
+// returns "" when the collection or the entry key is blank.
+func kbCitationRawFileURL(collection, entryKey, userID string) string {
+	name := strings.TrimSpace(collection)
+	key := strings.Trim(strings.TrimSpace(entryKey), "/")
+	if name == "" || key == "" {
+		return ""
+	}
+
+	escaped := make([]string, 0, strings.Count(key, "/")+1)
+	for _, part := range strings.Split(key, "/") {
+		if part != "" {
+			escaped = append(escaped, url.PathEscape(part))
+		}
+	}
+	if len(escaped) == 0 {
+		return ""
+	}
+
+	link := "/api/agents/collections/" + url.PathEscape(name) + "/entries-raw/" + strings.Join(escaped, "/")
+	if userID == "" {
+		return link
+	}
+	return link + "?" + url.Values{"user_id": {userID}}.Encode()
+}
+
+// escapeMarkdownLinkText backslash-escapes the characters that would break a
+// markdown link label: backslash, '[' and ']'.
+func escapeMarkdownLinkText(text string) string {
+	// One pass over the input: each character is replaced independently, so
+	// backslashes introduced for brackets are never escaped a second time.
+	return strings.NewReplacer(`\`, `\\`, "[", `\[`, "]", `\]`).Replace(text)
+}
--- a/core/services/agents/executor.go
+++ b/core/services/agents/executor.go
@@ -167,10 +167,12 @@ func ExecuteChatWithLLM(ctx context.Context, llm cogito.LLM, cfg *AgentConfig, m
 		}
 	}

+	kbCitations := &kbCitationList{}
 	if cfg.EnableKnowledgeBase && (kbMode == KBModeAutoSearch || kbMode == KBModeBoth) {
-		kbResults := KBAutoSearchPrompt(ctx, effectiveURL, effectiveKey, cfg.Name, message, cfg.KnowledgeBaseResults, userID)
-		if kbResults != "" {
-			fragment = fragment.AddMessage(cogito.SystemMessageRole, kbResults)
+		kbResult := KBAutoSearchPrompt(ctx, effectiveURL, effectiveKey, cfg.Name, message, cfg.KnowledgeBaseResults, userID)
+		if kbResult.Prompt != "" {
+			fragment = fragment.AddMessage(cogito.SystemMessageRole, kbResult.Prompt)
+			kbCitations.AddKBCitations(kbResult.Citations)
 		}
 	}

@@ -197,7 +199,7 @@ func ExecuteChatWithLLM(ctx context.Context, llm cogito.LLM, cfg *AgentConfig, m
 		}
 		cogitoOpts = append(cogitoOpts, cogito.WithTools(
 			cogito.NewToolDefinition(
-				KBSearchMemoryTool{APIURL: effectiveURL, APIKey: effectiveKey, Collection: cfg.Name, MaxResults: kbResults, UserID: userID},
+				KBSearchMemoryTool{APIURL: effectiveURL, APIKey: effectiveKey, Collection: cfg.Name, MaxResults: kbResults, UserID: userID, CitationCollector: kbCitations},
 				KBSearchMemoryArgs{},
 				"search_memory",
 				"Search the knowledge base for relevant information",
@@ -336,6 +338,8 @@ func ExecuteChatWithLLM(ctx context.Context, llm cogito.LLM, cfg *AgentConfig, m
 	if cfg.StripThinkingTags && response != "" {
 		response = stripThinkingTags(response)
 	}
+	responseForMemory := response
+	response = AppendKBCitations(response, cfg.Name, userID, kbCitations.Citations())

 	// Save conversation to KB when long-term memory is enabled.
 	// Use a detached context: the parent ctx may be cancelled (e.g. in distributed
@@ -344,7 +348,7 @@ func ExecuteChatWithLLM(ctx context.Context, llm cogito.LLM, cfg *AgentConfig, m
 		go func() {
 			ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 			defer cancel()
-			saveConversationToKB(ctx, llm, effectiveURL, effectiveKey, cfg, message, response, userID)
+			saveConversationToKB(ctx, llm, effectiveURL, effectiveKey, cfg, message, responseForMemory, userID)
 		}()
 	}

--- a/core/services/agents/executor_test.go
+++ b/core/services/agents/executor_test.go
@@ -2,6 +2,8 @@ package agents

 import (
 	"context"
+	"net/http"
+	"net/http/httptest"
 	"sync"
 	"sync/atomic"

@@ -36,6 +38,34 @@ func (m *mockLLM) CreateChatCompletion(ctx context.Context, req openai.ChatCompl
 	}, cogito.LLMUsage{}, nil
 }

+// toolCallingMockLLM is a scripted LLM test double: CreateChatCompletion
+// replays createResponses in order, and Ask answers with askResponse. Both
+// methods bump the same callCount, so an Ask call also consumes a slot in the
+// CreateChatCompletion sequence.
+type toolCallingMockLLM struct {
+	createResponses []openai.ChatCompletionResponse
+	askResponse     string
+	callCount       atomic.Int32
+}
+
+// Ask records the call and appends askResponse to f as an assistant message.
+func (m *toolCallingMockLLM) Ask(ctx context.Context, f cogito.Fragment) (cogito.Fragment, error) {
+	m.callCount.Add(1)
+	reply := f.AddMessage(cogito.AssistantMessageRole, m.askResponse)
+	return reply, nil
+}
+
+// CreateChatCompletion returns the scripted response for the current call
+// number. Once the script is exhausted it answers with a plain assistant
+// message carrying no tool calls.
+func (m *toolCallingMockLLM) CreateChatCompletion(ctx context.Context, req openai.ChatCompletionRequest) (cogito.LLMReply, cogito.LLMUsage, error) {
+	call := int(m.callCount.Add(1)) // 1-based position of this call
+	if call <= len(m.createResponses) {
+		return cogito.LLMReply{ChatCompletionResponse: m.createResponses[call-1]}, cogito.LLMUsage{}, nil
+	}
+
+	fallback := openai.ChatCompletionResponse{
+		Choices: []openai.ChatCompletionChoice{{
+			Message: openai.ChatCompletionMessage{
+				Role:    "assistant",
+				Content: "No more tools needed.",
+			},
+		}},
+	}
+	return cogito.LLMReply{ChatCompletionResponse: fallback}, cogito.LLMUsage{}, nil
+}
+
 // statusCollector records status callbacks in a thread-safe way.
 type statusCollector struct {
 	mu       sync.Mutex
@@ -73,6 +103,74 @@ var _ = DescribeTable("stripThinkingTags",
 	Entry("adjacent tag pairs", "<thinking>a</thinking><thinking>b</thinking>", ""),
 )

+// Table-driven cases for AppendKBCitations. Each entry supplies a response,
+// collection, user ID and citation list, followed by the exact string the
+// function is expected to return.
+var _ = DescribeTable("appendKBCitations",
+	func(response, collection, userID string, citations []KBCitation, want string) {
+		Expect(AppendKBCitations(response, collection, userID, citations)).To(Equal(want))
+	},
+	Entry("leaves responses without citations unchanged",
+		"answer",
+		"agent",
+		"",
+		nil,
+		"answer",
+	),
+	Entry("leaves blank responses unchanged",
+		"",
+		"agent",
+		"",
+		[]KBCitation{{FileName: "source.pdf", EntryKey: "uuid/source.pdf"}},
+		"",
+	),
+	Entry("appends clickable source links",
+		"answer",
+		"my-agent",
+		"",
+		[]KBCitation{{FileName: "new feature.pdf", EntryKey: "uuid/new feature.pdf"}},
+		"answer\n\nSources:\n[1] [new feature.pdf](/api/agents/collections/my-agent/entries-raw/uuid/new%20feature.pdf)",
+	),
+	Entry("deduplicates citations by entry key",
+		"answer",
+		"agent",
+		"",
+		[]KBCitation{
+			{FileName: "first.pdf", EntryKey: "uuid/shared.pdf"},
+			{FileName: "second.pdf", EntryKey: "uuid/shared.pdf"},
+		},
+		"answer\n\nSources:\n[1] [first.pdf](/api/agents/collections/agent/entries-raw/uuid/shared.pdf)",
+	),
+	Entry("uses plain text when entry key is missing",
+		"answer",
+		"agent",
+		"",
+		[]KBCitation{{FileName: "source.pdf"}},
+		"answer\n\nSources:\n[1] source.pdf",
+	),
+	Entry("uses entry basename when filename is missing",
+		"answer",
+		"agent",
+		"",
+		[]KBCitation{{EntryKey: "uuid/source.pdf"}},
+		"answer\n\nSources:\n[1] [source.pdf](/api/agents/collections/agent/entries-raw/uuid/source.pdf)",
+	),
+	Entry("adds user id query when present",
+		"answer",
+		"agent",
+		"user 1",
+		[]KBCitation{{FileName: "source.pdf", EntryKey: "uuid/source.pdf"}},
+		"answer\n\nSources:\n[1] [source.pdf](/api/agents/collections/agent/entries-raw/uuid/source.pdf?user_id=user+1)",
+	),
+	// Raw string literal: the expected output spans several lines verbatim.
+	Entry("escapes collection, path segments, and markdown link text",
+		"answer",
+		"agent one",
+		"",
+		[]KBCitation{{FileName: "source [draft].pdf", EntryKey: "uuid/source [draft].pdf"}},
+		`answer
+
+Sources:
+[1] [source \[draft\].pdf](/api/agents/collections/agent%20one/entries-raw/uuid/source%20%5Bdraft%5D.pdf)`,
+	),
+)
+
 var _ = Describe("ExecuteChatWithLLM", func() {
 	var (
 		ctx context.Context
@@ -184,6 +282,150 @@ var _ = Describe("ExecuteChatWithLLM", func() {
 		})
 	})

+	Context("knowledge base citations", func() {
+		It("appends KB sources to the returned response and callback message", func() {
+			mux := http.NewServeMux()
+			mux.HandleFunc("/api/agents/collections/kb-agent/search", func(w http.ResponseWriter, r *http.Request) {
+				Expect(r.URL.Query().Get("user_id")).To(Equal("user-1"))
+				w.Header().Set("Content-Type", "application/json")
+				_, _ = w.Write([]byte(`{
+					"results": [
+						{
+							"content": "KB content",
+							"id": "result-1",
+							"similarity": 0.99,
+							"metadata": {
+								"file_name": "new feature.pdf",
+								"source": "uuid/new feature.pdf"
+							}
+						}
+					],
+					"count": 1
+				}`))
+			})
+			server := httptest.NewServer(mux)
+			defer server.Close()
+
+			var msgContent string
+			cb.OnMessage = func(sender, content, messageID string) {
+				msgContent = content
+			}
+
+			llm := &mockLLM{response: "agent reply"}
+			cfg := &AgentConfig{
+				Name:                "kb-agent",
+				Model:               "test-model",
+				EnableKnowledgeBase: true,
+				KBMode:              KBModeAutoSearch,
+			}
+
+			result, err := ExecuteChatWithLLM(ctx, llm, cfg, "hello", cb, ExecuteChatOpts{
+				APIURL: server.URL,
+				UserID: "user-1",
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(result).To(Equal("agent reply\n\nSources:\n[1] [new feature.pdf](/api/agents/collections/kb-agent/entries-raw/uuid/new%20feature.pdf?user_id=user-1)"))
+			Expect(msgContent).To(Equal(result))
+		})
+
+		It("collects citations from the search_memory tool", func() {
+			mux := http.NewServeMux()
+			mux.HandleFunc("/api/agents/collections/kb-agent/search", func(w http.ResponseWriter, r *http.Request) {
+				w.Header().Set("Content-Type", "application/json")
+				_, _ = w.Write([]byte(`{
+					"results": [
+						{
+							"content": "Tool KB content",
+							"id": "result-1",
+							"similarity": 0.99,
+							"metadata": {
+								"file_name": "tool source.pdf",
+								"source": "uuid/tool source.pdf"
+							}
+						}
+					],
+					"count": 1
+				}`))
+			})
+			server := httptest.NewServer(mux)
+			defer server.Close()
+
+			collector := &kbCitationList{}
+			tool := KBSearchMemoryTool{
+				APIURL:            server.URL,
+				Collection:        "kb-agent",
+				CitationCollector: collector,
+			}
+
+			result, _, err := tool.Run(KBSearchMemoryArgs{Query: "hello"})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(result).To(ContainSubstring("Tool KB content"))
+			Expect(collector.Citations()).To(Equal([]KBCitation{{FileName: "tool source.pdf", EntryKey: "uuid/tool source.pdf"}}))
+		})
+
+		It("appends KB sources found through tools-only search_memory calls", func() {
+			mux := http.NewServeMux()
+			mux.HandleFunc("/api/agents/collections/kb-agent/search", func(w http.ResponseWriter, r *http.Request) {
+				Expect(r.URL.Query().Get("user_id")).To(Equal("user-1"))
+				w.Header().Set("Content-Type", "application/json")
+				_, _ = w.Write([]byte(`{
+					"results": [
+						{
+							"content": "Tool KB content",
+							"id": "result-1",
+							"similarity": 0.99,
+							"metadata": {
+								"file_name": "tool source.pdf",
+								"source": "uuid/tool source.pdf"
+							}
+						}
+					],
+					"count": 1
+				}`))
+			})
+			server := httptest.NewServer(mux)
+			defer server.Close()
+
+			llm := &toolCallingMockLLM{
+				askResponse: "agent reply from tool context",
+				createResponses: []openai.ChatCompletionResponse{
+					{
+						Choices: []openai.ChatCompletionChoice{
+							{
+								Message: openai.ChatCompletionMessage{
+									Role: "assistant",
+									ToolCalls: []openai.ToolCall{
+										{
+											ID:   "call-1",
+											Type: openai.ToolTypeFunction,
+											Function: openai.FunctionCall{
+												Name:      "search_memory",
+												Arguments: `{"query":"hello"}`,
+											},
+										},
+									},
+								},
+							},
+						},
+					},
+				},
+			}
+			cfg := &AgentConfig{
+				Name:                "kb-agent",
+				Model:               "test-model",
+				EnableKnowledgeBase: true,
+				KBMode:              KBModeTools,
+			}
+
+			result, err := ExecuteChatWithLLM(ctx, llm, cfg, "hello", cb, ExecuteChatOpts{
+				APIURL: server.URL,
+				UserID: "user-1",
+			})
+			Expect(err).ToNot(HaveOccurred())
+			Expect(result).To(Equal("agent reply from tool context\n\nSources:\n[1] [tool source.pdf](/api/agents/collections/kb-agent/entries-raw/uuid/tool%20source.pdf?user_id=user-1)"))
+		})
+	})
+
 	Context("context cancellation", func() {
 		It("returns an error when context is already cancelled", func() {
 			cancelledCtx, cancel := context.WithCancel(ctx)
--- a/core/services/agents/knowledge.go
+++ b/core/services/agents/knowledge.go
@@ -8,6 +8,7 @@ import (
 	"io"
 	"mime/multipart"
 	"net/http"
+	"net/url"
 	"strings"
 	"time"

@@ -17,10 +18,19 @@ import (
 	"github.com/mudler/LocalAI/pkg/httpclient"
 )

+// Metadata keys populated by localrecall for every stored chunk. The original
+// upload file name lives under file_name (used for display); source holds the
+// collection entry key ("<uuid>/<filename>") used to build the raw-file URL.
+const (
+	kbMetadataFileName = "file_name"
+	kbMetadataSource   = "source"
+)
+
 // KBSearchResult represents a search result from the knowledge base.
+// Field names mirror the collection search endpoint's JSON response.
 type KBSearchResult struct {
 	Content    string            `json:"content"`
-	Score      float64           `json:"score"`
+	ID         string            `json:"id"`
 	Similarity float64           `json:"similarity"`
 	Metadata   map[string]string `json:"metadata"`
 }
@@ -31,22 +41,48 @@ type kbSearchResponse struct {
 	Count   int              `json:"count"`
 }

-// KBAutoSearchPrompt queries the knowledge base with the user's message
-// and returns a system prompt block with relevant results.
+// KBCitation is a single source document that a KB search drew from. Citations
+// travel alongside the prompt as structured data so the consumer (and UI) can
+// render clickable source links, independent of what the model writes inline.
+type KBCitation struct {
+	// FileName is the original uploaded file name, for display (e.g. "report.pdf").
+	FileName string `json:"file_name"`
+	// EntryKey is the collection entry identifier ("<uuid>/<filename>"), used to
+	// build the raw-file URL and as the de-duplication key.
+	EntryKey string `json:"entry_key"`
+}
+
+// KBSearchContext is the result of an auto-search against the knowledge base:
+// the system-prompt block to feed the model, plus the de-duplicated list of
+// source documents the results were drawn from.
+type KBSearchContext struct {
+	Prompt    string       `json:"prompt"`
+	Citations []KBCitation `json:"citations"`
+}
+
+// KBCitationCollector receives source citations found during KB searches.
+type KBCitationCollector interface {
+	AddKBCitations([]KBCitation)
+}
+
+// KBAutoSearchPrompt queries the knowledge base with the user's message and
+// returns a KBSearchContext: a system prompt block with the relevant results
+// plus the de-duplicated source citations those results came from.
 // Uses LocalAI's collection search endpoint via the API.
-func KBAutoSearchPrompt(ctx context.Context, apiURL, apiKey, collection, query string, maxResults int, userID string) string {
+func KBAutoSearchPrompt(ctx context.Context, apiURL, apiKey, collection, query string, maxResults int, userID string) KBSearchContext {
 	if collection == "" || query == "" {
-		return ""
+		return KBSearchContext{}
 	}

 	if maxResults <= 0 {
 		maxResults = 5
 	}

-	// Call LocalAI's collection search API
-	searchURL := strings.TrimRight(apiURL, "/") + "/api/agents/collections/" + collection + "/search"
+	searchURL := strings.TrimRight(apiURL, "/") + "/api/agents/collections/" + url.PathEscape(collection) + "/search"
 	if userID != "" {
-		searchURL += "?user_id=" + userID
+		query := url.Values{}
+		query.Set("user_id", userID)
+		searchURL += "?" + query.Encode()
 	}
 	reqBody, _ := json.Marshal(map[string]any{
 		"query":       query,
@@ -56,7 +92,7 @@ func KBAutoSearchPrompt(ctx context.Context, apiURL, apiKey, collection, query s
 	req, err := http.NewRequestWithContext(ctx, http.MethodPost, searchURL, strings.NewReader(string(reqBody)))
 	if err != nil {
 		xlog.Warn("KB auto-search: failed to create request", "error", err)
-		return ""
+		return KBSearchContext{}
 	}
 	req.Header.Set("Content-Type", "application/json")
 	if apiKey != "" {
@@ -66,41 +102,70 @@ func KBAutoSearchPrompt(ctx context.Context, apiURL, apiKey, collection, query s
 	resp, err := httpclient.New().Do(req)
 	if err != nil {
 		xlog.Warn("KB auto-search: request failed", "error", err)
-		return ""
+		return KBSearchContext{}
 	}
 	defer resp.Body.Close()

 	if resp.StatusCode != http.StatusOK {
 		body, _ := io.ReadAll(resp.Body)
 		xlog.Warn("KB auto-search: non-200 response", "status", resp.StatusCode, "body", string(body))
-		return ""
+		return KBSearchContext{}
 	}

 	var searchResp kbSearchResponse
 	if err := json.NewDecoder(resp.Body).Decode(&searchResp); err != nil {
 		xlog.Warn("KB auto-search: failed to decode response", "error", err)
-		return ""
+		return KBSearchContext{}
 	}

 	if len(searchResp.Results) == 0 {
-		return ""
+		return KBSearchContext{}
 	}

-	// Format results as a system prompt block (same format as LocalAGI)
+	// Build the system prompt block, labelling each chunk with its source file
+	// so the model can attribute inline, and collect the structured citations.
 	var sb strings.Builder
 	sb.WriteString("Given the user input you have the following in memory:\n")
-	for i, r := range searchResp.Results {
-		sb.WriteString(fmt.Sprintf("- %s", r.Content))
-		if len(r.Metadata) > 0 {
-			meta, _ := json.Marshal(r.Metadata)
-			sb.WriteString(fmt.Sprintf(" (%s)", string(meta)))
+
+	var citations []KBCitation
+	seen := make(map[string]struct{})
+
+	for _, r := range searchResp.Results {
+		fileName := r.Metadata[kbMetadataFileName]
+		source := r.Metadata[kbMetadataSource]
+
+		label := fileName
+		if label == "" {
+			label = "unknown"
 		}
-		if i < len(searchResp.Results)-1 {
-			sb.WriteString("\n")
+		sb.WriteString(fmt.Sprintf("[Source: %s]\n%s\n", label, r.Content))
+
+		// Citations are de-duplicated per source document: many chunks from the
+		// same file share one source key, so a file is listed only once. The file
+		// name is the fallback key; results with neither key are skipped.
+		dedupKey := source
+		if dedupKey == "" {
+			dedupKey = fileName
 		}
+		if dedupKey == "" {
+			continue
+		}
+		if _, ok := seen[dedupKey]; ok {
+			continue
+		}
+		seen[dedupKey] = struct{}{}
+		citations = append(citations, KBCitation{
+			FileName: fileName,
+			EntryKey: source,
+		})
 	}

-	return sb.String()
+	sb.WriteString("When answering, cite sources using [Source: filename].")
+
+	return KBSearchContext{
+		Prompt:    sb.String(),
+		Citations: citations,
+	}
 }

 // KBSearchMemoryArgs defines the arguments for the search_memory tool.
@@ -110,21 +175,25 @@ type KBSearchMemoryArgs struct {

 // KBSearchMemoryTool implements the search_memory MCP tool.
 type KBSearchMemoryTool struct {
-	APIURL     string
-	APIKey     string
-	Collection string
-	MaxResults int
-	UserID     string
+	APIURL            string
+	APIKey            string
+	Collection        string
+	MaxResults        int
+	UserID            string
+	CitationCollector KBCitationCollector
 }

 func (t KBSearchMemoryTool) Run(args KBSearchMemoryArgs) (string, any, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
 	defer cancel()
 	result := KBAutoSearchPrompt(ctx, t.APIURL, t.APIKey, t.Collection, args.Query, t.MaxResults, t.UserID)
-	if result == "" {
+	if result.Prompt == "" {
 		return "No results found.", nil, nil
 	}
-	return result, nil, nil
+	if t.CitationCollector != nil {
+		t.CitationCollector.AddKBCitations(result.Citations)
+	}
+	return result.Prompt, nil, nil
 }

 // KBAddMemoryArgs defines the arguments for the add_memory tool.
@@ -156,9 +225,11 @@ func (t KBAddMemoryTool) Run(args KBAddMemoryArgs) (string, any, error) {

 // KBStoreContent uploads text content to a collection via the multipart upload API.
 func KBStoreContent(ctx context.Context, apiURL, apiKey, collection, content, userID string) error {
-	uploadURL := strings.TrimRight(apiURL, "/") + "/api/agents/collections/" + collection + "/upload"
+	uploadURL := strings.TrimRight(apiURL, "/") + "/api/agents/collections/" + url.PathEscape(collection) + "/upload"
 	if userID != "" {
-		uploadURL += "?user_id=" + userID
+		query := url.Values{}
+		query.Set("user_id", userID)
+		uploadURL += "?" + query.Encode()
 	}

 	// Build multipart form with the text content as a file
--- a/core/services/nodes/inflight.go
+++ b/core/services/nodes/inflight.go
@@ -157,3 +157,82 @@ func (c *InFlightTrackingClient) Rerank(ctx context.Context, in *pb.RerankReques
 	res, err := c.Backend.Rerank(ctx, in, opts...)
 	return res, c.reconcile(err)
 }
+
+func (c *InFlightTrackingClient) VAD(ctx context.Context, in *pb.VADRequest, opts ...ggrpc.CallOption) (*pb.VADResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.VAD(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) Diarize(ctx context.Context, in *pb.DiarizeRequest, opts ...ggrpc.CallOption) (*pb.DiarizeResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.Diarize(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) FaceVerify(ctx context.Context, in *pb.FaceVerifyRequest, opts ...ggrpc.CallOption) (*pb.FaceVerifyResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.FaceVerify(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) FaceAnalyze(ctx context.Context, in *pb.FaceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.FaceAnalyzeResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.FaceAnalyze(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) VoiceVerify(ctx context.Context, in *pb.VoiceVerifyRequest, opts ...ggrpc.CallOption) (*pb.VoiceVerifyResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.VoiceVerify(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) VoiceAnalyze(ctx context.Context, in *pb.VoiceAnalyzeRequest, opts ...ggrpc.CallOption) (*pb.VoiceAnalyzeResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.VoiceAnalyze(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) VoiceEmbed(ctx context.Context, in *pb.VoiceEmbedRequest, opts ...ggrpc.CallOption) (*pb.VoiceEmbedResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.VoiceEmbed(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) TokenClassify(ctx context.Context, in *pb.TokenClassifyRequest, opts ...ggrpc.CallOption) (*pb.TokenClassifyResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.TokenClassify(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) Score(ctx context.Context, in *pb.ScoreRequest, opts ...ggrpc.CallOption) (*pb.ScoreResponse, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.Score(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) AudioEncode(ctx context.Context, in *pb.AudioEncodeRequest, opts ...ggrpc.CallOption) (*pb.AudioEncodeResult, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.AudioEncode(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) AudioDecode(ctx context.Context, in *pb.AudioDecodeRequest, opts ...ggrpc.CallOption) (*pb.AudioDecodeResult, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.AudioDecode(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+func (c *InFlightTrackingClient) AudioTransform(ctx context.Context, in *pb.AudioTransformRequest, opts ...ggrpc.CallOption) (*pb.AudioTransformResult, error) {
+	defer c.track(ctx)()
+	res, err := c.Backend.AudioTransform(ctx, in, opts...)
+	return res, c.reconcile(err)
+}
+
+// AudioTransformStream, AudioToAudioStream and Forward are deliberately left as
+// embedded passthrough: they return a stream client and the inference spans the
+// stream's lifetime, not the constructor call. Wrapping the constructor with
+// track() would increment and immediately decrement (and fire onFirstComplete)
+// before any audio flows. Tracking those correctly needs the done() func tied to
+// stream close, which the current Backend interface doesn't surface here.
--- a/core/services/nodes/inflight_test.go
+++ b/core/services/nodes/inflight_test.go
@@ -304,6 +304,105 @@ var _ = Describe("InFlightTrackingClient", func() {
 		})
 	})

+	Describe("non-LLM inference methods track in-flight", func() {
+		// silero-vad and friends only ever expose a single non-Predict method.
+		// If that method isn't wrapped, the load-time reservation released by
+		// onFirstComplete never fires and in-flight is stuck at 1 forever.
+		assertTracked := func(call func() error) {
+			var firstFired int
+			client.OnFirstComplete(func() { firstFired++ })
+			err := call()
+			Expect(err).ToNot(HaveOccurred())
+			Expect(tracker.increments).To(Equal(1), "method must increment in-flight")
+			Expect(tracker.decrements).To(Equal(1), "method must decrement in-flight")
+			Expect(firstFired).To(Equal(1), "method must release the load-time reservation")
+		}
+
+		It("VAD", func() {
+			assertTracked(func() error {
+				_, err := client.VAD(context.Background(), &pb.VADRequest{})
+				return err
+			})
+		})
+
+		It("Diarize", func() {
+			assertTracked(func() error {
+				_, err := client.Diarize(context.Background(), &pb.DiarizeRequest{})
+				return err
+			})
+		})
+
+		It("VoiceVerify", func() {
+			assertTracked(func() error {
+				_, err := client.VoiceVerify(context.Background(), &pb.VoiceVerifyRequest{})
+				return err
+			})
+		})
+
+		It("VoiceAnalyze", func() {
+			assertTracked(func() error {
+				_, err := client.VoiceAnalyze(context.Background(), &pb.VoiceAnalyzeRequest{})
+				return err
+			})
+		})
+
+		It("VoiceEmbed", func() {
+			assertTracked(func() error {
+				_, err := client.VoiceEmbed(context.Background(), &pb.VoiceEmbedRequest{})
+				return err
+			})
+		})
+
+		It("FaceVerify", func() {
+			assertTracked(func() error {
+				_, err := client.FaceVerify(context.Background(), &pb.FaceVerifyRequest{})
+				return err
+			})
+		})
+
+		It("FaceAnalyze", func() {
+			assertTracked(func() error {
+				_, err := client.FaceAnalyze(context.Background(), &pb.FaceAnalyzeRequest{})
+				return err
+			})
+		})
+
+		It("TokenClassify", func() {
+			assertTracked(func() error {
+				_, err := client.TokenClassify(context.Background(), &pb.TokenClassifyRequest{})
+				return err
+			})
+		})
+
+		It("Score", func() {
+			assertTracked(func() error {
+				_, err := client.Score(context.Background(), &pb.ScoreRequest{})
+				return err
+			})
+		})
+
+		It("AudioEncode", func() {
+			assertTracked(func() error {
+				_, err := client.AudioEncode(context.Background(), &pb.AudioEncodeRequest{})
+				return err
+			})
+		})
+
+		It("AudioDecode", func() {
+			assertTracked(func() error {
+				_, err := client.AudioDecode(context.Background(), &pb.AudioDecodeRequest{})
+				return err
+			})
+		})
+
+		It("AudioTransform", func() {
+			assertTracked(func() error {
+				_, err := client.AudioTransform(context.Background(), &pb.AudioTransformRequest{})
+				return err
+			})
+		})
+	})
+
 	Describe("stale model reload (self-heal)", func() {
 		It("removes the replica when the backend reports the model is not loaded", func() {
 			backend.predictErr = fmt.Errorf("parakeet-cpp: model not loaded")
--- a/docs/content/features/openai-realtime.md
+++ b/docs/content/features/openai-realtime.md
@@ -74,6 +74,28 @@ EXTERNAL_GRPC_BACKENDS=opus:/path/to/backend/go/opus/opus

 The opus backend is loaded automatically when a WebRTC session starts. It does not require any model configuration file — just the backend binary.

+#### WebRTC behind Docker host networking or NAT
+
+By default pion gathers a host ICE candidate for every local interface. Under
+Docker **host networking** that includes bridge addresses (`docker0`/`veth`,
+`172.x`) that a remote browser cannot route to: the call typically connects on a
+good candidate and then drops a few seconds later when ICE consent checks fail on
+the unreachable ones. Two settings let you advertise only the reachable address:
+
+```bash
+# Advertise these IPs as the host ICE candidates (e.g. the host's LAN IP)
+LOCALAI_WEBRTC_NAT_1TO1_IPS=192.168.1.10
+
+# ...or restrict ICE gathering to specific interfaces
+LOCALAI_WEBRTC_ICE_INTERFACES=eth0
+```
+
+{{% notice tip %}}
+For a browser on another LAN machine talking to LocalAI in a host-networked
+container, set `LOCALAI_WEBRTC_NAT_1TO1_IPS` to the host's LAN IP. This is the
+most reliable fix for WebRTC connections that establish and then drop.
+{{% /notice %}}
+
 ## Protocol

 The API follows the OpenAI Realtime API protocol for handling sessions, audio buffers, and conversation items.
--- a/docs/content/getting-started/try-it-out.md
+++ b/docs/content/getting-started/try-it-out.md
@@ -20,7 +20,29 @@ With the CLI you can list the models with `local-ai models list` and install the
 You can also [run models manually]({{%relref "getting-started/models" %}}) by copying files into the `models` directory.
 {{% /notice %}}

-You can test out the API endpoints using `curl`, few examples are listed below. The models we are referring here (`gpt-4`, `gpt-4-vision-preview`, `tts-1`, `whisper-1`) are examples - replace them with the model names you have installed.
+You can test chat models from the CLI without keeping a separate `curl` command around:
+
+```bash
+# Terminal 1
+local-ai run
+
+# Terminal 2
+local-ai chat --model gpt-4
+```
+
+`local-ai chat` connects to a running LocalAI server, opens an interactive chat prompt, and exits when you type `/exit`, `/quit`, or `/bye`. Use `/models` to list installed models, `/model <name>` to switch models, and `/clear` to reset the current conversation. If the server exposes exactly one model, LocalAI uses that model automatically:
+
+```bash
+# Terminal 1
+local-ai run llama-3.2-1b-instruct:q4_k_m
+
+# Terminal 2
+local-ai chat
+```
+
+When more than one model is configured, pass `--model` with the installed model name to avoid ambiguity. Use `--endpoint` to connect to a non-default server, for example `local-ai chat --endpoint http://127.0.0.1:8081 --model gpt-4`.
+
+You can also test out the API endpoints using `curl`; a few examples are listed below. The models we are referring to here (`gpt-4`, `gpt-4-vision-preview`, `tts-1`, `whisper-1`) are examples - replace them with the model names you have installed.

 ### Text Generation

--- a/docs/content/reference/cli-reference.md
+++ b/docs/content/reference/cli-reference.md
@@ -118,6 +118,21 @@ For more information on VRAM management, see [VRAM and Memory Management]({{%rel

 See [Authentication & Authorization]({{%relref "features/authentication" %}}) for full documentation.

+## Chat Flags
+
+Use `local-ai chat` to open an interactive terminal chat session against a running LocalAI server.
+
+| Parameter | Default | Description | Environment Variable |
+|-----------|---------|-------------|----------------------|
+| `--endpoint` | `http://127.0.0.1:8080` | LocalAI server endpoint. The `/v1` path is added automatically when omitted. | `$LOCALAI_CHAT_ENDPOINT` |
+| `--model` | | Model name to use. If omitted, LocalAI uses the only model returned by the server when exactly one is available. | |
+| `--api-key` | | API key to use when the LocalAI server requires authentication. | `$LOCALAI_API_KEY`, `$API_KEY` |
+
+- Inside the chat prompt:
+  - Use `/models` to list installed models.
+  - Use `/model <name>` to switch to a different model and clear the conversation.
+  - Use `/clear` to reset the current conversation.
+
 ## P2P Flags

 | Parameter | Default | Description | Environment Variable |
@@ -181,4 +196,3 @@ export LOCALAI_F16=true

 - See [Advanced Usage]({{%relref "advanced/advanced-usage" %}}) for configuration examples
 - See [VRAM and Memory Management]({{%relref "advanced/vram-management" %}}) for memory management options
-
--- a/gallery/index.yaml
+++ b/gallery/index.yaml
@@ -6685,6 +6685,37 @@
    - filename: rfdetr-nano-q8_0.gguf
      uri: huggingface://mudler/rfdetr-cpp-nano/rfdetr-nano-q8_0.gguf
      sha256: 940084c60a780f1a19a51458ae3a601454b3b843675fa0713ff43ae5bccc0d9b
+- name: locate-anything-3b
+  url: github:mudler/LocalAI/gallery/virtual.yaml@master
+  urls:
+    - https://github.com/mudler/locate-anything.cpp
+    - https://huggingface.co/nvidia/LocateAnything-3B
+    - https://huggingface.co/mudler/locate-anything.cpp-gguf
+  description: |
+    NVIDIA LocateAnything-3B open-vocabulary object detection (visual grounding), served via the native
+    locate-anything.cpp backend (C++/ggml + purego, no Python). Describe what to find in a text prompt and
+    get labeled boxes back; separate multiple categories with </c>. Q8_0 is the recommended default:
+    box-identical to F16/F32, ~6.3GB, fastest CPU latency. Drop-in for the /v1/detection endpoint (pass the
+    prompt).
+  license: other
+  icon: https://avatars.githubusercontent.com/u/53104118?s=200&v=4
+  tags:
+    - object-detection
+    - open-vocabulary
+    - locate-anything
+    - native
+    - cpp
+    - cpu
+  overrides:
+    backend: locate-anything-cpp
+    known_usecases:
+      - detection
+    parameters:
+      model: locate-anything-q8_0.gguf
+  files:
+    - filename: locate-anything-q8_0.gguf
+      uri: huggingface://mudler/locate-anything.cpp-gguf/locate-anything-q8_0.gguf
+      sha256: 0909d8a1aba584b482d501baae032611d1559878be1b7f6606ba516687c5380d
 - name: rfdetr-cpp-base
  url: github:mudler/LocalAI/gallery/virtual.yaml@master
  urls:
--- a/pkg/reasoning/reasoning.go
+++ b/pkg/reasoning/reasoning.go
@@ -89,6 +89,35 @@ func ExtractReasoningWithConfig(content, thinkingStartToken string, config Confi
 	return reasoning, cleanedContent
 }

+// ExtractReasoningComplete extracts reasoning from a COMPLETE (non-streaming)
+// model response. It behaves like ExtractReasoningWithConfig except that it only
+// honors a prefilled thinking start token when the response actually contains
+// the matching closing tag.
+//
+// Rationale: when a chat template injects the start token into the prompt (so
+// DetectThinkingStartToken returns e.g. "<think>"), the model's output begins
+// inside a reasoning block and carries only the closing tag. The defensive
+// fallback prepends the start token so the extractor can pair it with that
+// close tag. But on a COMPLETE response with no closing tag, the model answered
+// directly with no reasoning at all — prepending the start token would
+// manufacture an unclosed block that swallows the entire answer into reasoning,
+// leaving content empty (breaking short/direct answers such as session names or
+// JSON summaries). Genuine reasoning tags already present in the content still
+// extract, because dropping the synthetic prefill does not affect them.
+//
+// Streaming callers must keep using ExtractReasoningWithConfig: mid-stream an
+// as-yet-unclosed block is legitimate and its tokens should surface as
+// reasoning deltas as they arrive.
+func ExtractReasoningComplete(content, thinkingStartToken string, config Config) (reasoning string, cleanedContent string) {
+	startToken := thinkingStartToken
+	if startToken != "" {
+		if end := ClosingTokenForStart(startToken, &config); end == "" || !strings.Contains(content, end) {
+			startToken = ""
+		}
+	}
+	return ExtractReasoningWithConfig(content, startToken, config)
+}
+
 // PrependThinkingTokenIfNeeded prepends the thinking start token to content if it was
 // detected in the prompt. This allows the standard extraction logic to work correctly
 // for models where the thinking token is already in the prompt.
@@ -131,6 +160,48 @@ func PrependThinkingTokenIfNeeded(content string, startToken string) string {
 	return startToken + content
 }

+// defaultReasoningTagPairs are the built-in start/end reasoning tag pairs,
+// matching llama.cpp's chat-parser.cpp. Kept at package scope so that
+// ExtractReasoning and ClosingTokenForStart share a single source of truth.
+var defaultReasoningTagPairs = []TagPair{
+	{Start: "<|START_THINKING|>", End: "<|END_THINKING|>"},            // Command-R models
+	{Start: "<|inner_prefix|>", End: "<|inner_suffix|>"},              // Apertus models
+	{Start: "<seed:think>", End: "</seed:think>"},                     // Seed models
+	{Start: "<think>", End: "</think>"},                               // DeepSeek, Granite, ExaOne models
+	{Start: "<|think|>", End: "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
+	{Start: "<|channel>thought", End: "<channel|>"},                   // Gemma 4 models
+	{Start: "<thinking>", End: "</thinking>"},                         // General thinking tag
+	{Start: "[THINK]", End: "[/THINK]"},                               // Magistral models
+}
+
+// ClosingTokenForStart returns the closing reasoning tag that pairs with the
+// given start token, searching custom config TagPairs first then the built-in
+// defaults. Returns "" when startToken is empty or unrecognized.
+//
+// Used by the non-streaming autoparser fallback to decide whether a complete
+// response that began with a prefilled thinking token actually closed its
+// reasoning block: only then is synthesizing the start token (so the standard
+// extractor can pair it with the model's close tag) safe. A complete response
+// with no closing tag is a direct answer, not unclosed reasoning.
+func ClosingTokenForStart(startToken string, config *Config) string {
+	if startToken == "" {
+		return ""
+	}
+	if config != nil {
+		for _, pair := range config.TagPairs {
+			if pair.Start == startToken {
+				return pair.End
+			}
+		}
+	}
+	for _, pair := range defaultReasoningTagPairs {
+		if pair.Start == startToken {
+			return pair.End
+		}
+	}
+	return ""
+}
+
 // ExtractReasoning extracts reasoning content from thinking tags and returns
 // both the extracted reasoning and the cleaned content (with tags removed).
 // It handles <thinking>...</thinking> and <think>...</think> tags.
@@ -145,22 +216,7 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
 	var cleanedParts []string
 	remaining := content

-	// Define default tag pairs to look for (matching llama.cpp's chat-parser.cpp)
-	defaultTagPairs := []struct {
-		start string
-		end   string
-	}{
-		{"<|START_THINKING|>", "<|END_THINKING|>"},            // Command-R models
-		{"<|inner_prefix|>", "<|inner_suffix|>"},              // Apertus models
-		{"<seed:think>", "</seed:think>"},                     // Seed models
-		{"<think>", "</think>"},                               // DeepSeek, Granite, ExaOne models
-		{"<|think|>", "<|end|><|begin|>assistant<|content|>"}, // Solar Open models (complex end)
-		{"<|channel>thought", "<channel|>"},                    // Gemma 4 models
-		{"<thinking>", "</thinking>"},                         // General thinking tag
-		{"[THINK]", "[/THINK]"},                               // Magistral models
-	}
-
-	// Merge custom tag pairs with default tag pairs (custom pairs first for priority)
+	// Merge custom tag pairs (highest priority) with the built-in defaults.
 	var tagPairs []struct {
 		start string
 		end   string
@@ -175,9 +231,11 @@ func ExtractReasoning(content string, config *Config) (reasoning string, cleaned
 			}
 		}
 	}
-	// Add default tag pairs
-	for _, pair := range defaultTagPairs {
-		tagPairs = append(tagPairs, pair)
+	for _, pair := range defaultReasoningTagPairs {
+		tagPairs = append(tagPairs, struct {
+			start string
+			end   string
+		}{pair.Start, pair.End})
 	}

 	// Track the last position we've processed
--- a/pkg/reasoning/reasoning_test.go
+++ b/pkg/reasoning/reasoning_test.go
@@ -1175,6 +1175,55 @@ var _ = Describe("Custom Tokens and Tag Pairs Integration", func() {
 	})
 })

+var _ = Describe("ClosingTokenForStart", func() {
+	It("returns the default closing tag for a known start token", func() {
+		Expect(ClosingTokenForStart("<think>", nil)).To(Equal("</think>"))
+		Expect(ClosingTokenForStart("<thinking>", nil)).To(Equal("</thinking>"))
+		Expect(ClosingTokenForStart("[THINK]", nil)).To(Equal("[/THINK]"))
+	})
+
+	It("returns empty for an empty or unknown start token", func() {
+		Expect(ClosingTokenForStart("", nil)).To(BeEmpty())
+		Expect(ClosingTokenForStart("<nope>", nil)).To(BeEmpty())
+	})
+
+	It("prefers custom config tag pairs over the defaults", func() {
+		cfg := &Config{TagPairs: []TagPair{{Start: "<think>", End: "<<END>>"}}}
+		Expect(ClosingTokenForStart("<think>", cfg)).To(Equal("<<END>>"))
+	})
+})
+
+var _ = Describe("ExtractReasoningComplete", func() {
+	const startToken = "<think>"
+
+	It("keeps a tag-less answer as content when a start token is prefilled but no close tag is present", func() {
+		// The bug guard: prompt-prefilled <think>, model answered directly with
+		// no reasoning. The synthetic prefill must not swallow it as reasoning.
+		reasoning, content := ExtractReasoningComplete("hello", startToken, Config{})
+		Expect(reasoning).To(BeEmpty())
+		Expect(content).To(Equal("hello"))
+	})
+
+	It("extracts reasoning when the model emits only the closing tag (legitimate prefill)", func() {
+		reasoning, content := ExtractReasoningComplete("the rationale\n</think>\n\nthe answer", startToken, Config{})
+		Expect(reasoning).To(ContainSubstring("the rationale"))
+		Expect(content).To(ContainSubstring("the answer"))
+		Expect(content).ToNot(ContainSubstring("</think>"))
+	})
+
+	It("extracts a fully-tagged block regardless of the prefill token", func() {
+		reasoning, content := ExtractReasoningComplete("<think>r</think>answer", startToken, Config{})
+		Expect(reasoning).To(Equal("r"))
+		Expect(content).To(Equal("answer"))
+	})
+
+	It("behaves like ExtractReasoningWithConfig when no start token is prefilled", func() {
+		reasoning, content := ExtractReasoningComplete("<think>r</think>answer", "", Config{})
+		Expect(reasoning).To(Equal("r"))
+		Expect(content).To(Equal("answer"))
+	})
+})
+
 // Helper function to create bool pointers for test configs
 func boolPtr(b bool) *bool {
 	return &b
Author	SHA1	Message	Date
mudler	188cad1ee7	feat(gallery): add locate-anything-3b model to the gallery index Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-11 23:04:43 +00:00
mudler	f4020820e5	test(backend): locate-anything-cpp Load+Detect wire test Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-11 23:04:43 +00:00
mudler	d7be691823	feat(gallery): locate-anything gallery entry + model importer Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-11 23:04:43 +00:00
mudler	55058a52cc	ci(backend): register locate-anything-cpp in build matrix Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-11 23:04:43 +00:00
mudler	8952a0a0e0	feat(backend): add locate-anything-cpp backend (open-vocab detection via la_capi) A Go/purego backend wrapping locate-anything.cpp's la_capi C ABI, implementing the gRPC Detect RPC: image + open-vocabulary text prompt -> labeled boxes. Mirrors backend/go/rfdetr-cpp; static-links ggml into a per-CPU-variant .so. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-11 23:04:43 +00:00
LocalAI [bot]	fba8c9c498	fix(distributed): track in-flight for non-LLM inference methods (VAD, diarize, voice, ...) (#10238 ) fix(distributed): track in-flight for non-LLM inference methods InFlightTrackingClient only wrapped a subset of the grpc.Backend inference methods (Predict, Embeddings, TTS, AudioTranscription, Detect, Rerank, ...). Methods like VAD were left as embedded passthrough, so track() never ran for them. In distributed mode every model is loaded with in_flight=1 as a reservation; that reservation is only released by the OnFirstComplete callback, which fires after the first tracked inference call completes. A VAD-only model (e.g. silero-vad) never calls a tracked method, so the reservation is never released and in-flight stays pinned at 1 forever - which also blocks the router's idle-eviction logic. Wrap the remaining unary inference methods (VAD, Diarize, Face, Voice, TokenClassify, Score, AudioEncode, AudioDecode, AudioTransform) with the same track()/reconcile() pattern. The three bidi-stream constructors (AudioTransformStream, AudioToAudioStream, Forward) are deliberately left as passthrough - their inference spans the stream lifetime, not the constructor call, so track() there would fire onFirstComplete before any data flows. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-10 16:29:50 +02:00
LocalAI [bot]	6b2badb837	chore: ⬆️ Update CrispStrobe/CrispASR to `c29f6653a516a3001d923944dad8892072cc7334` (#10236 ) ⬆️ Update CrispStrobe/CrispASR Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 16:16:24 +02:00
LocalAI [bot]	8b8506d01a	chore: ⬆️ Update ggml-org/llama.cpp to `039e20a2db9e87b2477c76cc04905f3e1acad77f` (#10223 ) ⬆️ Update ggml-org/llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 12:22:03 +02:00
LocalAI [bot]	6910a0bb48	chore: ⬆️ Update antirez/ds4 to `91bafb5acd5a6cf00b1e55ef68bf40ddd207bee7` (#10234 ) ⬆️ Update antirez/ds4 Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 12:08:19 +02:00
LocalAI [bot]	cffd03b522	chore: ⬆️ Update ikawrakow/ik_llama.cpp to `e6f8112f3ba126eed3ff5b30cdd08085414a7516` (#10233 ) ⬆️ Update ikawrakow/ik_llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 12:07:49 +02:00
LocalAI [bot]	bf448d3794	chore: ⬆️ Update ggml-org/whisper.cpp to `df7638d8229a243af8a4b5a8ae557e0d74e0a0ae` (#10220 ) ⬆️ Update ggml-org/whisper.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 01:16:29 +02:00
LocalAI [bot]	1d4a12f7c0	chore: ⬆️ Update CrispStrobe/CrispASR to `97cad527d247edefc904e6c40c4cf5ee78bed055` (#10221 ) ⬆️ Update CrispStrobe/CrispASR Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 01:16:17 +02:00
LocalAI [bot]	186d62801d	chore: ⬆️ Update leejet/stable-diffusion.cpp to `19bdfe22d255d5b4dff39d449318b9bc5ea2317f` (#10222 ) ⬆️ Update leejet/stable-diffusion.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 01:16:06 +02:00
LocalAI [bot]	da4ed05429	chore: ⬆️ Update ikawrakow/ik_llama.cpp to `2768b6251548b78b6610e95edad13f888ad95982` (#10219 ) ⬆️ Update ikawrakow/ik_llama.cpp Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 01:15:54 +02:00
LocalAI [bot]	ec1eea4f45	chore: ⬆️ Update antirez/ds4 to `512d07cb08f234b704b5a5959aa9e2d4c466eeb0` (#10224 ) ⬆️ Update antirez/ds4 Signed-off-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> Co-authored-by: mudler <2420543+mudler@users.noreply.github.com>	2026-06-10 01:15:42 +02:00
LocalAI [bot]	b203b32e57	feat(realtime): make WebRTC ICE candidates configurable (#10231 ) The /v1/realtime WebRTC handler created the peer connection with a bare webrtc.Configuration and no SettingEngine, so pion gathered a host ICE candidate for every local interface. Under Docker host networking that includes bridge addresses (docker0/veth, 172.x) a remote browser cannot route to; the call establishes on a good pair and then drops once ICE consent freshness checks fail on the unreachable candidates. Add two opt-in knobs, applied via a pion SettingEngine: - LOCALAI_WEBRTC_NAT_1TO1_IPS: advertise these IPs as the host candidates (e.g. the host LAN IP) - LOCALAI_WEBRTC_ICE_INTERFACES: restrict ICE gathering to these interfaces Defaults are unchanged (empty => current all-interface behavior). Assisted-by: Claude:claude-opus-4-8 Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-09 22:28:03 +02:00
Ching	48a8ce98aa	fix(cli): handle chat output errors (#10229 ) Propagate terminal write errors from the chat prompt and explicitly ignore stream close errors during cleanup. Update chat tests to assert response writer errors so errcheck passes without hiding failed writes. Tests: - go test -count=1 ./core/cli/chat - go test -count=1 ./core/cli Assisted-by: Codex:GPT-5 Signed-off-by: Ching Kao <0980124jim@gmail.com>	2026-06-09 19:10:24 +02:00
Ching	8344d1c865	feat(cli): add interactive chat mode (#10226 ) Add an opt-in `local-ai chat` command for testing chat models directly from the terminal without manually sending curl requests. The command connects to a running LocalAI server, lists available models through the existing OpenAI-compatible API, streams chat completions, and supports interactive commands such as `/models`, `/model`, `/clear`, and `/exit`. Keep `local-ai run` focused on the server lifecycle so the web UI, API clients, and multiple chat terminals can coexist against the same server. Document the new command and terminal workflow in the README and CLI docs. Tests: - go test -count=1 ./core/cli/chat - go test -count=1 ./core/cli Assisted-by: Codex:GPT-5 Signed-off-by: Ching Kao <0980124jim@gmail.com>	2026-06-09 14:58:44 +00:00
Pete	d2e6b93369	feat(agents): surface KB source citations in RAG responses (#10228 ) * dev knowledge.go structure Signed-off-by: Pete Chen <petechentw@gmail.com> * feat(agents): append KB source citations to responses Render structured KB citations as a Sources block after agent responses, linking each source to the existing raw collection entry endpoint. Keep long-term memory writes on the original model response so citation blocks do not get stored back into the knowledge base. Tested with: go test ./core/services/agents Assisted-by: Codex:gpt-5 Signed-off-by: Pete Chen <petechentw@gmail.com> * Collect KB citations from tool searches Signed-off-by: Pete Chen <petechentw@gmail.com> * fix(agents): append KB sources in local chats Apply the shared KB citation post-processing to standalone LocalAGI chat responses so the React agent chat receives the same clickable Sources block as the native executor path. Also fix the run target to use the current cmd/local-ai entrypoint. Assisted-by: Codex:gpt-5 Signed-off-by: Pete Chen <petechentw@gmail.com> --------- Signed-off-by: Pete Chen <petechentw@gmail.com> Co-authored-by: shihyunhuang <shihyunhuang88@gmail.com> Co-authored-by: TLoE419 <tloemizuchizu@gmail.com> Co-authored-by: Ching Kao <0980124jim@gmail.com>	2026-06-09 16:32:56 +02:00
LocalAI [bot]	e1ec03d33f	fix(reasoning): stop prefilled <think> from swallowing tag-less answers (#10225 ) * fix(reasoning): stop prefilled <think> from swallowing tag-less answers When a chat template injects the thinking start token into the prompt (so DetectThinkingStartToken returns e.g. "<think>"), the model's output begins inside a reasoning block and carries only the closing tag. The non-jinja autoparser fallback (peg-native "pure content" mode, issue #9985) prepends the start token so the extractor can pair it with the model's </think>. But on a COMPLETE response that contains no closing tag, the model answered directly with no reasoning at all. Prepending the start token there manufactures an unclosed block that swallows the entire answer into reasoning, leaving the OpenAI `content` field empty. This breaks short/direct answers — session names, JSON summaries, any terse completion where the model skips the think block — which come back with empty content. Regression surfaced by #9991, which added the defensive prefill extraction to the complete-response paths. Add reasoning.ExtractReasoningComplete: it only honors a prefilled start token when the response actually contains the matching closing tag (proof a reasoning block exists). Genuine reasoning tags already in the content still extract; tag-less content stays content. Apply it at every complete-response site (applyAutoparserOverride, realtime, openresponses). The streaming per-token extractor is intentionally left on ExtractReasoningWithConfig — mid-stream an as-yet-unclosed block is legitimate and must surface as reasoning deltas. Also adds reasoning.ClosingTokenForStart and hoists the default reasoning tag pairs to package scope so both helpers share one source of truth. 
Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> * test(reasoning): cover the enable_thinking=false non-thinking-mode regression Adds the end-to-end case that actually broke session summaries / auto-titles and was not covered before: a request with enable_thinking=false against a <think>-capable model. In non-thinking mode the model emits no reasoning block, so llama.cpp's autoparser returns ChatDeltas with content set and reasoning_content empty (verified against stock llama-server: same model with chat_template_kwargs.enable_thinking=false returns reasoning_content=null, content="hello"). thinkingStartToken is still "<think>" because it is detected per-model from the enable_thinking=true render, so the old code prepended it and swallowed the answer. The test fails without the ExtractReasoningComplete gate. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Claude Opus 4.8 (1M context) <noreply@anthropic.com>	2026-06-09 09:02:04 +02:00
LocalAI [bot]	9323f4b5ca	feat(llama-cpp): video input support (mtmd #24269 ) (#10216 ) * chore(llama-cpp): bump to 8f83d6c for mtmd video input support Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(llama-cpp): forward video input to mtmd (template + non-template paths) Wire request->videos() into grpc-server.cpp mirroring the existing image and audio handling: a video_data build + non-template files extraction, and input_video chat chunks on the tokenizer-template path. allow_video is auto-set at model load by the vendored upstream chat_params. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * feat(ui): add video attachment support to the chat UI Mirror the image/audio attachment path for video: emit video_url content parts, accept video/* in the picker, keep video files as base64, show a film icon badge, and render attached video inline with a <video> player. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * fix(llama-cpp): patch mtmd video stdin double-close (heap crash) Upstream mtmd video input (ggml-org/llama.cpp#24269) double-fcloses the ffmpeg/ffprobe stdin FILE: feed_stdin() fclose()s the FILE returned by subprocess_stdin() (which is sp->stdin_file), then subprocess_destroy() fclose()s the same pointer again -> heap corruption that aborts the backend on any base64 input_video request (the CLI --video file path is unaffected). Vendor a one-line fix (null sp->stdin_file after fclose) via prepare.sh's patches/ until upstream merges it. Verified e2e with gemma-4-e2b-it-qat-q4_0: video frames decode via ffmpeg and the model answers correctly (red clip -> 'Red', blue -> 'Blue'). 
Signed-off-by: Ettore Di Giacinto <mudler@localai.io> * chore(llama-cpp): re-pin to upstream #24316, drop vendored stdin patch Upstream replaced the ad-hoc video stdin handling with a proper RAII refactor (ggml-org/llama.cpp#24316, "mtmd: refactor video subproc handling"), which includes the same `sp->stdin_file = nullptr` guard our patch added (plus join-before-destroy ordering). Re-pin LLAMA_VERSION to that branch head and drop patches/0001 - it's now redundant. Verified e2e with gemma-4-e2b-it-qat-q4_0: no crash, video frames decode and the model answers correctly (red clip -> "Red", blue -> "Blue"). NOTE: #24316 is not yet merged, so this pins to its branch-head commit (28ca1e60). Re-pin to the squash-merge commit on master once it lands, otherwise `git fetch` may lose the commit after the branch is deleted. Signed-off-by: Ettore Di Giacinto <mudler@localai.io> --------- Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>	2026-06-08 23:17:50 +02:00