diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index df5512283..e18e38b62 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -121,3 +121,19 @@ jobs: detached: true connect-timeout-seconds: 180 limit-access-to-actor: true + + # Fast standalone unit tests for the backends' pure C++ helpers - currently the + # llama-cpp message reconstruction (backend/cpp/llama-cpp/message_content.h), + # which guards the OpenAI chat content normalization (mudler/LocalAI#10524, + # #7324, #7528). The runner discovers every *_test.cpp under backend/cpp/, so + # new pure-C++ unit tests are picked up with no CI changes. These need only the + # C++ stdlib + nlohmann/json, so they run on every PR without the full + # llama.cpp + gRPC backend build. (The same suite is also wired as an opt-in + # CMake/ctest target, -DLLAMA_GRPC_BUILD_TESTS=ON, for in-backend-build runs.) + tests-backend-cpp: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v7 + - name: Run backend C++ unit tests + run: make test-backend-cpp diff --git a/Makefile b/Makefile index a9909d553..9f01273ed 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,7 @@ COVERAGE_E2E_LABELS?=!real-models COVERAGE_EXCLUDE_RE?=grpc/proto/.*[.]pb[.]go -.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all +.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-backend-cpp test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all all: help @@ -201,6 +201,13 @@ test: prepare-test OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) +## Compiles and runs the standalone C++ unit tests for the backends (pure +## helpers that depend only on the stdlib + nlohmann/json, no full 
backend +## build). Discovers every *_test.cpp under backend/cpp/ - see +## backend/cpp/run-unit-tests.sh. Set NLOHMANN_INCLUDE to skip the header fetch. +test-backend-cpp: + bash backend/cpp/run-unit-tests.sh + ## Runs the core suite ($(TEST_PATHS)) with statement-coverage instrumentation ## and writes a merged profile to $(COVERAGE_PROFILE). Deliberately omits ## --fail-fast so a single failure doesn't truncate the coverage number, and diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt index bdf20802a..8b8d2e2d5 100644 --- a/backend/cpp/llama-cpp/CMakeLists.txt +++ b/backend/cpp/llama-cpp/CMakeLists.txt @@ -87,3 +87,18 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) endif() + +# Unit test for the message-content normalization helper (message_content.h). +# Off by default so the normal backend build is untouched; enable with +# -DLLAMA_GRPC_BUILD_TESTS=ON and run via ctest. It reuses llama.cpp's vendored +# <nlohmann/json.hpp> (propagated by the common helpers library) so it has no +# extra dependency beyond what the backend already builds against.
+option(LLAMA_GRPC_BUILD_TESTS "Build grpc-server unit tests" OFF) +if(LLAMA_GRPC_BUILD_TESTS) + enable_testing() + add_executable(message_content_test message_content_test.cpp message_content.h) + target_include_directories(message_content_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET}) + target_compile_features(message_content_test PRIVATE cxx_std_17) + add_test(NAME message_content_test COMMAND message_content_test) +endif() diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 3e0eeb503..3c45302e5 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -39,6 +39,7 @@ #include "common.h" #include "arg.h" #include "chat-auto-parser.h" +#include "message_content.h" #include #include #include @@ -1728,242 +1729,20 @@ public: for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); - json msg_json; - msg_json["role"] = msg.role(); - - bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); - - // Handle content - can be string, null, or array - // For multimodal content, we'll embed images/audio from separate fields - if (!msg.content().empty()) { - // Try to parse content as JSON to see if it's already an array - json content_val; - try { - content_val = json::parse(msg.content()); - // Handle null values - convert to empty string to avoid template errors - if (content_val.is_null()) { - content_val = ""; - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - content_val = msg.content(); - } - - // If content is an object (e.g., from tool call failures), convert to string - if (content_val.is_object()) { - content_val = content_val.dump(); - } - - // If content is a string and this is the last user message with images/audio, combine them 
- if (content_val.is_string() && is_last_user_msg && has_images_or_audio) { - json content_array = json::array(); - // Add text first - content_array.push_back({{"type", "text"}, {"text", content_val.get()}}); - // Add images - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - // Add audios - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else { - // Use content as-is (already array or not last user message) - // Ensure null values are converted to empty string - if (content_val.is_null()) { - msg_json["content"] = ""; - } else { - msg_json["content"] = content_val; - } - } - } else if (is_last_user_msg && has_images_or_audio) { - // If no content but this is the last user message with images/audio, create content array - json content_array = json::array(); - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - 
content_array.push_back(image_chunk); - } - } - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else if (msg.role() == "tool") { - // Tool role messages must have content field set, even if empty - // Jinja templates expect content to be a string, not null or object - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0); - if (msg.content().empty()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): empty content, set to empty string\n", i); - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): content exists: %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); - // Content exists, parse and ensure it's a string - json content_val; - try { - content_val = json::parse(msg.content()); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): parsed JSON, type=%s\n", - i, content_val.is_null() ? "null" : - content_val.is_object() ? "object" : - content_val.is_string() ? "string" : - content_val.is_array() ? 
"array" : "other"); - // Handle null values - Jinja templates expect content to be a string, not null - if (content_val.is_null()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): null content, converted to empty string\n", i); - } else if (content_val.is_object()) { - // If content is an object (e.g., from tool call failures/errors), convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): object content, converted to string: %s\n", - i, content_val.dump().substr(0, std::min(200, content_val.dump().size())).c_str()); - } else if (content_val.is_string()) { - msg_json["content"] = content_val.get(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): string content, using as-is\n", i); - } else { - // For arrays or other types, convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): %s content, converted to string\n", - i, content_val.is_array() ? 
"array" : "other type"); - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - msg_json["content"] = msg.content(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): not JSON, using as string\n", i); - } - } - } else { - // Ensure all messages have content set (fallback for any unhandled cases) - // Jinja templates expect content to be present, default to empty string if not set - if (!msg_json.contains("content")) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (role=%s): no content field, adding empty string\n", - i, msg.role().c_str()); - msg_json["content"] = ""; - } + llama_grpc::ReconstructedMessageInput rin; + rin.role = msg.role(); + rin.content = msg.content(); + rin.name = msg.name(); + rin.tool_call_id = msg.tool_call_id(); + rin.reasoning_content = msg.reasoning_content(); + rin.tool_calls = msg.tool_calls(); + rin.is_last_user_msg = (i == last_user_msg_idx); + if (rin.is_last_user_msg) { + for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j)); + for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j)); + for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j)); } - - // Add optional fields for OpenAI-compatible message format - if (!msg.name().empty()) { - msg_json["name"] = msg.name(); - } - if (!msg.tool_call_id().empty()) { - msg_json["tool_call_id"] = msg.tool_call_id(); - } - if (!msg.reasoning_content().empty()) { - msg_json["reasoning_content"] = msg.reasoning_content(); - } - if (!msg.tool_calls().empty()) { - // Parse tool_calls JSON string and add to message - try { - json tool_calls = json::parse(msg.tool_calls()); - msg_json["tool_calls"] = tool_calls; - SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str()); - // IMPORTANT: If message has tool_calls but content is empty or not set, - // set content to space " " instead of empty string "", 
because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get().empty())) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d has tool_calls but empty content, setting to space\n", i); - msg_json["content"] = " "; - } - // Log each tool call with name and arguments - if (tool_calls.is_array()) { - for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) { - const auto& tc = tool_calls[tc_idx]; - std::string tool_name = "unknown"; - std::string tool_args = "{}"; - if (tc.contains("function")) { - const auto& func = tc["function"]; - if (func.contains("name")) { - tool_name = func["name"].get(); - } - if (func.contains("arguments")) { - tool_args = func["arguments"].is_string() ? - func["arguments"].get() : - func["arguments"].dump(); - } - } else if (tc.contains("name")) { - tool_name = tc["name"].get(); - if (tc.contains("arguments")) { - tool_args = tc["arguments"].is_string() ? - tc["arguments"].get() : - tc["arguments"].dump(); - } - } - SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d, tool_call %zu: name=%s, arguments=%s\n", - i, tc_idx, tool_name.c_str(), tool_args.c_str()); - } - } - } catch (const json::parse_error& e) { - SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); - } - } - - // Debug: Log final content state before adding to array - if (msg_json.contains("content")) { - if (msg_json["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i); - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content type=%s, has_value=%d\n", - i, msg_json["content"].is_string() ? "string" : - msg_json["content"].is_array() ? "array" : - msg_json["content"].is_object() ? 
"object" : "other", - msg_json["content"].is_null() ? 0 : 1); - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i); - } - - messages_json.push_back(msg_json); + messages_json.push_back(llama_grpc::build_reconstructed_message(rin)); } // Final safety check: Ensure no message has null content (Jinja templates require strings) @@ -2184,36 +1963,7 @@ public: if (body_json.contains("messages") && body_json["messages"].is_array()) { SRV_INF("[CONTENT DEBUG] PredictStream: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size()); for (size_t idx = 0; idx < body_json["messages"].size(); idx++) { - auto& msg = body_json["messages"][idx]; - std::string role_str = msg.contains("role") ? msg["role"].get() : "unknown"; - if (msg.contains("content")) { - if (msg["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Fix null content - } else if (role_str == "tool" && msg["content"].is_array()) { - // Tool messages must have string content, not array - // oaicompat_chat_params_parse expects tool messages to have string content - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx); - msg["content"] = msg["content"].dump(); - } else if (!msg["content"].is_string() && !msg["content"].is_array()) { - // If content is object or other non-string type, convert to string for templates - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str()); - if (msg["content"].is_object()) { - msg["content"] = msg["content"].dump(); - } else { - msg["content"] = ""; - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n", - idx, 
role_str.c_str(), - msg["content"].is_string() ? "string" : - msg["content"].is_array() ? "array" : - msg["content"].is_object() ? "object" : "other"); - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Add missing content - } + llama_grpc::normalize_template_message(body_json["messages"][idx]); } } @@ -2545,264 +2295,20 @@ public: SRV_INF("[CONTENT DEBUG] Predict: Processing %d messages\n", request->messages_size()); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); - json msg_json; - msg_json["role"] = msg.role(); - - SRV_INF("[CONTENT DEBUG] Predict: Message %d: role=%s, content_empty=%d, content_length=%zu\n", - i, msg.role().c_str(), msg.content().empty() ? 1 : 0, msg.content().size()); - if (!msg.content().empty()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content (first 200 chars): %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); + llama_grpc::ReconstructedMessageInput rin; + rin.role = msg.role(); + rin.content = msg.content(); + rin.name = msg.name(); + rin.tool_call_id = msg.tool_call_id(); + rin.reasoning_content = msg.reasoning_content(); + rin.tool_calls = msg.tool_calls(); + rin.is_last_user_msg = (i == last_user_msg_idx); + if (rin.is_last_user_msg) { + for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j)); + for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j)); + for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j)); } - - bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); - - // Handle content - can be string, null, or array - // For multimodal content, we'll embed images/audio from separate fields - if 
(!msg.content().empty()) { - // Try to parse content as JSON to see if it's already an array - json content_val; - try { - content_val = json::parse(msg.content()); - // Handle null values - convert to empty string to avoid template errors - if (content_val.is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d parsed JSON is null, converting to empty string\n", i); - content_val = ""; - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - content_val = msg.content(); - } - - // If content is an object (e.g., from tool call failures), convert to string - if (content_val.is_object()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content is object, converting to string\n", i); - content_val = content_val.dump(); - } - - // If content is a string and this is the last user message with images/audio, combine them - if (content_val.is_string() && is_last_user_msg && has_images_or_audio) { - json content_array = json::array(); - // Add text first - content_array.push_back({{"type", "text"}, {"text", content_val.get()}}); - // Add images - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - // Add audios - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = 
request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else { - // Use content as-is (already array or not last user message) - // Ensure null values are converted to empty string - if (content_val.is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content_val was null, setting to empty string\n", i); - msg_json["content"] = ""; - } else { - msg_json["content"] = content_val; - SRV_INF("[CONTENT DEBUG] Predict: Message %d content set, type=%s\n", - i, content_val.is_string() ? "string" : - content_val.is_array() ? "array" : - content_val.is_object() ? "object" : "other"); - } - } - } else if (is_last_user_msg && has_images_or_audio) { - // If no content but this is the last user message with images/audio, create content array - json content_array = json::array(); - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with 
media\n", i); - } else if (!msg.tool_calls().empty()) { - // Tool call messages may have null content, but templates expect string - // IMPORTANT: Set to space " " instead of empty string "", because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls, setting content to space (not empty string)\n", i); - msg_json["content"] = " "; - } else if (msg.role() == "tool") { - // Tool role messages must have content field set, even if empty - // Jinja templates expect content to be a string, not null or object - SRV_INF("[CONTENT DEBUG] Predict: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0); - if (msg.content().empty()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): empty content, set to empty string\n", i); - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): content exists: %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); - // Content exists, parse and ensure it's a string - json content_val; - try { - content_val = json::parse(msg.content()); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): parsed JSON, type=%s\n", - i, content_val.is_null() ? "null" : - content_val.is_object() ? "object" : - content_val.is_string() ? "string" : - content_val.is_array() ? 
"array" : "other"); - // Handle null values - Jinja templates expect content to be a string, not null - if (content_val.is_null()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): null content, converted to empty string\n", i); - } else if (content_val.is_object()) { - // If content is an object (e.g., from tool call failures/errors), convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): object content, converted to string: %s\n", - i, content_val.dump().substr(0, std::min(200, content_val.dump().size())).c_str()); - } else if (content_val.is_string()) { - msg_json["content"] = content_val.get(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): string content, using as-is\n", i); - } else { - // For arrays or other types, convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): %s content, converted to string\n", - i, content_val.is_array() ? 
"array" : "other type"); - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - msg_json["content"] = msg.content(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): not JSON, using as string\n", i); - } - } - } else { - // Ensure all messages have content set (fallback for any unhandled cases) - // Jinja templates expect content to be present, default to empty string if not set - if (!msg_json.contains("content")) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d (role=%s): no content field, adding empty string\n", - i, msg.role().c_str()); - msg_json["content"] = ""; - } - } - - // Add optional fields for OpenAI-compatible message format - if (!msg.name().empty()) { - msg_json["name"] = msg.name(); - } - if (!msg.tool_call_id().empty()) { - msg_json["tool_call_id"] = msg.tool_call_id(); - } - if (!msg.reasoning_content().empty()) { - msg_json["reasoning_content"] = msg.reasoning_content(); - } - if (!msg.tool_calls().empty()) { - // Parse tool_calls JSON string and add to message - try { - json tool_calls = json::parse(msg.tool_calls()); - msg_json["tool_calls"] = tool_calls; - SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str()); - // IMPORTANT: If message has tool_calls but content is empty or not set, - // set content to space " " instead of empty string "", because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get().empty())) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls but empty content, setting to space\n", i); - msg_json["content"] = " "; - } - // Log each tool call with name and arguments - if (tool_calls.is_array()) { - for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) { - const auto& tc = 
tool_calls[tc_idx]; - std::string tool_name = "unknown"; - std::string tool_args = "{}"; - if (tc.contains("function")) { - const auto& func = tc["function"]; - if (func.contains("name")) { - tool_name = func["name"].get(); - } - if (func.contains("arguments")) { - tool_args = func["arguments"].is_string() ? - func["arguments"].get() : - func["arguments"].dump(); - } - } else if (tc.contains("name")) { - tool_name = tc["name"].get(); - if (tc.contains("arguments")) { - tool_args = tc["arguments"].is_string() ? - tc["arguments"].get() : - tc["arguments"].dump(); - } - } - SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d, tool_call %zu: name=%s, arguments=%s\n", - i, tc_idx, tool_name.c_str(), tool_args.c_str()); - } - } - } catch (const json::parse_error& e) { - SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); - } - } - - // Debug: Log final content state before adding to array - if (msg_json.contains("content")) { - if (msg_json["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i); - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content type=%s, has_value=%d\n", - i, msg_json["content"].is_string() ? "string" : - msg_json["content"].is_array() ? "array" : - msg_json["content"].is_object() ? "object" : "other", - msg_json["content"].is_null() ? 
0 : 1); - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i); - } - - messages_json.push_back(msg_json); + messages_json.push_back(llama_grpc::build_reconstructed_message(rin)); } // Final safety check: Ensure no message has null content (Jinja templates require strings) @@ -3023,36 +2529,7 @@ public: if (body_json.contains("messages") && body_json["messages"].is_array()) { SRV_INF("[CONTENT DEBUG] Predict: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size()); for (size_t idx = 0; idx < body_json["messages"].size(); idx++) { - auto& msg = body_json["messages"][idx]; - std::string role_str = msg.contains("role") ? msg["role"].get() : "unknown"; - if (msg.contains("content")) { - if (msg["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Fix null content - } else if (role_str == "tool" && msg["content"].is_array()) { - // Tool messages must have string content, not array - // oaicompat_chat_params_parse expects tool messages to have string content - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx); - msg["content"] = msg["content"].dump(); - } else if (!msg["content"].is_string() && !msg["content"].is_array()) { - // If content is object or other non-string type, convert to string for templates - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str()); - if (msg["content"].is_object()) { - msg["content"] = msg["content"].dump(); - } else { - msg["content"] = ""; - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n", - idx, role_str.c_str(), - msg["content"].is_string() ? "string" : - msg["content"].is_array() ? 
"array" : - msg["content"].is_object() ? "object" : "other"); - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Add missing content - } + llama_grpc::normalize_template_message(body_json["messages"][idx]); } } diff --git a/backend/cpp/llama-cpp/message_content.h b/backend/cpp/llama-cpp/message_content.h new file mode 100644 index 000000000..4c7317ecd --- /dev/null +++ b/backend/cpp/llama-cpp/message_content.h @@ -0,0 +1,192 @@ +#pragma once + +#include <string> +#include <vector> + +#include <nlohmann/json.hpp> + +namespace llama_grpc { + +// Normalizes a proto message's content string into the JSON value used when +// reconstructing OpenAI-format messages for the tokenizer (jinja) template. +// +// Shared by the streaming (PredictStream) and non-streaming (Predict) message +// reconstruction paths so the two cannot drift. +// +// LocalAI's Go layer (schema.Messages.ToProto) always sends content as a plain +// text string; multimodal media travels in separate proto fields, never inside +// content. So user/system/developer content is *only ever* opaque text and must +// NOT be JSON-sniffed: a prompt that merely looks like JSON (e.g. an ingredient +// list ["1/4 cup sugar", ...]) would otherwise be reinterpreted as structured +// content parts and rejected by oaicompat_chat_params_parse with +// "unsupported content[].type" (https://github.com/mudler/LocalAI/issues/10524). +// (developer is OpenAI's modern system alias - same "human-authored text" nature.) +// +// For assistant/tool messages we still collapse a literal JSON null/object +// (tool-call bookkeeping) to a string, but we never turn a plain string into an +// array/scalar. The array defense is therefore role-independent (arrays/scalars +// fall through for every role); the role gate only governs the null/object case.
+inline nlohmann::ordered_json normalize_message_content(const std::string& role,
+                                                        const std::string& content) {
+    nlohmann::ordered_json content_val = content; // default: the text verbatim, as a JSON string
+    if (role != "user" && role != "system" && role != "developer") {
+        try {
+            nlohmann::ordered_json parsed = nlohmann::ordered_json::parse(content);
+            if (parsed.is_null()) {
+                content_val = "";
+            } else if (parsed.is_object()) {
+                content_val = parsed.dump();
+            }
+            // arrays / scalars: keep the original plain-text string as-is
+        } catch (const nlohmann::ordered_json::exception&) {
+            // parse_error (not JSON) or out_of_range (e.g. "1e999" overflows): keep the plain string
+        }
+    }
+    return content_val;
+}
+
+// Final safety pass applied to each reconstructed OpenAI message right before it
+// is handed to oaicompat_chat_params_parse (jinja templating). Jinja templates
+// assume content is a string: a literal null breaks slicing such as
+// message.content[:N] (#7324), and a tool message with array content is rejected
+// (#7528). A multimodal user message legitimately carries a typed-part array
+// ({type:text}, {type:image_url}, ...), which must be left intact. Shared by the
+// streaming and non-streaming paths so this invariant cannot drift between them.
+inline void normalize_template_message(nlohmann::ordered_json& msg) {
+    if (!msg.contains("content")) {
+        msg["content"] = ""; // templates expect the field to exist
+        return;
+    }
+    nlohmann::ordered_json& content = msg["content"];
+    const std::string role = (msg.contains("role") && msg["role"].is_string())
+        ? msg["role"].get<std::string>()
+        : std::string(); // missing / non-string role: treated as "not a tool message"
+    if (content.is_null()) {
+        content = ""; // #7324: null would crash content[:N] slicing
+    } else if (role == "tool" && content.is_array()) {
+        content = content.dump(); // #7528: tool messages must have string content
+    } else if (!content.is_string() && !content.is_array()) {
+        if (content.is_object()) {
+            content = content.dump(); // tool-call bookkeeping object -> string
+        } else {
+            content = ""; // other scalar (number/bool) -> empty
+        }
+    }
+    // string, or a non-tool (multimodal) typed-part array: leave untouched
+}
+
+// One proto message's data, flattened to plain types so the reconstruction logic
+// can be shared and unit-tested without protobuf. The streaming and non-streaming
+// predict paths both populate this from proto::Message + the request's media.
+struct ReconstructedMessageInput {
+    std::string role;              // proto.Message.role
+    std::string content;           // proto.Message.content (always a plain string)
+    std::string name;              // proto.Message.name
+    std::string tool_call_id;      // proto.Message.tool_call_id
+    std::string reasoning_content; // proto.Message.reasoning_content
+    std::string tool_calls;        // tool_calls as a JSON string, or empty
+    bool is_last_user_msg = false; // attach request media to this message
+    std::vector<std::string> images; // base64 (jpeg)
+    std::vector<std::string> audios; // base64 (wav)
+    std::vector<std::string> videos; // base64
+};
+
+// Appends the request's media as OpenAI typed content parts. Imperative (not
+// brace-init) to avoid nlohmann's object-vs-array initializer-list ambiguity.
+inline void append_media_parts(nlohmann::ordered_json& content_array,
+                               const std::vector<std::string>& images,
+                               const std::vector<std::string>& audios,
+                               const std::vector<std::string>& videos) {
+    for (const auto& img : images) {
+        nlohmann::ordered_json image_chunk;
+        image_chunk["type"] = "image_url";
+        nlohmann::ordered_json image_url;
+        image_url["url"] = "data:image/jpeg;base64," + img;
+        image_chunk["image_url"] = image_url;
+        content_array.push_back(image_chunk);
+    }
+    for (const auto& aud : audios) {
+        nlohmann::ordered_json audio_chunk;
+        audio_chunk["type"] = "input_audio";
+        nlohmann::ordered_json input_audio;
+        input_audio["data"] = aud;
+        input_audio["format"] = "wav";  // default; could be made configurable
+        audio_chunk["input_audio"] = input_audio;
+        content_array.push_back(audio_chunk);
+    }
+    for (const auto& vid : videos) {
+        nlohmann::ordered_json video_chunk;
+        video_chunk["type"] = "input_video";
+        nlohmann::ordered_json input_video;
+        input_video["data"] = vid;
+        video_chunk["input_video"] = input_video;
+        content_array.push_back(video_chunk);
+    }
+}
+
+// Reconstructs a single OpenAI-format message (the object fed to
+// oaicompat_chat_params_parse) from a proto message. Shared by PredictStream and
+// Predict so the content/multimodal/tool_calls handling cannot drift between the
+// two stream modes (it previously lived as two ~150-line copies with a redundant
+// Predict-only tool_calls->" " branch). Guarantees content is always a string or
+// a typed-part array, never null/missing.
+inline nlohmann::ordered_json build_reconstructed_message(const ReconstructedMessageInput& in) {
+    nlohmann::ordered_json msg_json;
+    msg_json["role"] = in.role;
+    const bool has_media = !in.images.empty() || !in.audios.empty() || !in.videos.empty();
+
+    if (!in.content.empty()) {
+        nlohmann::ordered_json content_val = normalize_message_content(in.role, in.content);
+        if (content_val.is_string() && in.is_last_user_msg && has_media) {
+            // Last user message + media: build a typed-part array (text first).
+            nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
+            nlohmann::ordered_json text_part;
+            text_part["type"] = "text";
+            text_part["text"] = content_val.get<std::string>();
+            content_array.push_back(text_part);
+            append_media_parts(content_array, in.images, in.audios, in.videos);
+            msg_json["content"] = content_array;
+        } else if (content_val.is_null()) {  // defensive only: normalize_message_content() always yields a string
+            msg_json["content"] = "";
+        } else {
+            msg_json["content"] = content_val;
+        }
+    } else if (in.is_last_user_msg && has_media) {
+        // No text but media on the last user message: media-only typed array.
+        nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
+        append_media_parts(content_array, in.images, in.audios, in.videos);
+        msg_json["content"] = content_array;
+    } else {
+        // Empty content (any role, incl. tool/assistant): templates need a string.
+        msg_json["content"] = "";
+    }
+
+    if (!in.name.empty()) {
+        msg_json["name"] = in.name;
+    }
+    if (!in.tool_call_id.empty()) {
+        msg_json["tool_call_id"] = in.tool_call_id;
+    }
+    if (!in.reasoning_content.empty()) {
+        msg_json["reasoning_content"] = in.reasoning_content;
+    }
+    if (!in.tool_calls.empty()) {
+        try {
+            nlohmann::ordered_json tool_calls = nlohmann::ordered_json::parse(in.tool_calls);
+            msg_json["tool_calls"] = tool_calls;
+            // tool_calls + empty/blank content: use " " not "", because llama.cpp's
+            // common_chat_msgs_to_json_oaicompat turns "" into null, which breaks
+            // templates that slice message.content[:tool_start_length] (#7324).
+            if (!msg_json.contains("content") ||
+                (msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
+                msg_json["content"] = " ";
+            }
+        } catch (const nlohmann::ordered_json::parse_error&) {
+            // Malformed tool_calls JSON: leave content as-is (prior behavior).
+        }
+    }
+
+    return msg_json;
+}
+
+} // namespace llama_grpc
diff --git a/backend/cpp/llama-cpp/message_content_test.cpp b/backend/cpp/llama-cpp/message_content_test.cpp
new file mode 100644
index 000000000..7e9a5383a
--- /dev/null
+++ b/backend/cpp/llama-cpp/message_content_test.cpp
@@ -0,0 +1,234 @@
+// Unit tests for the shared message-reconstruction helpers (message_content.h).
+//
+// Build & run standalone (nlohmann/json single header on the include path):
+//   g++ -std=c++17 -I<nlohmann-include-dir> message_content_test.cpp -o t && ./t
+// or via CMake: -DLLAMA_GRPC_BUILD_TESTS=ON then ctest.
+//
+// Regression coverage for:
+//   #10524 - a user/system prompt that is itself a JSON-array string must stay
+//            plain text, never be reinterpreted as OpenAI structured parts.
+//   #7324 - assistant/tool null content -> "" (templates slice content[:N]);
+//           assistant+tool_calls+empty content -> " " (not "", which becomes null).
+//   #7528 - tool message array content must reach the template as a string.
+// multimodal - last user message text + media -> typed-part array, media kept. + +#include +#include +#include + +#include "message_content.h" + +using nlohmann::ordered_json; +using llama_grpc::normalize_message_content; +using llama_grpc::normalize_template_message; +using llama_grpc::build_reconstructed_message; +using llama_grpc::ReconstructedMessageInput; + +static int failures = 0; + +static void check(bool ok, const std::string& name, const std::string& detail = "") { + if (!ok) { + std::cerr << "FAIL " << name << (detail.empty() ? "" : ": " + detail) << "\n"; + failures++; + } +} + +// ---- normalize_message_content ------------------------------------------- + +static void expect_norm_string(const char* name, const std::string& role, + const std::string& content, const std::string& want) { + auto got = normalize_message_content(role, content); + if (!got.is_string()) { + check(false, name, "expected a JSON string, got " + + std::string(got.is_array() ? "array" : got.is_object() ? "object" : "other") + + " (" + got.dump() + ")"); + return; + } + check(got.get() == want, name, "expected \"" + want + "\", got \"" + got.get() + "\""); +} + +static void test_normalize() { + const std::string ingredients = R"(["1/4 cup brown sugar, packed","1 pound ground beef"])"; + + // #10524 - JSON-array text must stay a string. Role-INDEPENDENT array defense. + for (const char* role : {"user", "system", "developer", "function", "assistant", "tool"}) { + expect_norm_string((std::string("json_array_stays_text:") + role).c_str(), role, ingredients, ingredients); + } + + // #10524 - user/system/developer JSON-object text stays verbatim (NOT re-dumped). + expect_norm_string("user_json_object_verbatim", "user", R"({"a":1})", R"({"a":1})"); + expect_norm_string("system_json_object_verbatim", "system", R"({"a":1})", R"({"a":1})"); + expect_norm_string("developer_json_object_verbatim", "developer", R"({"a":1})", R"({"a":1})"); + + // Plain text unchanged for all roles. 
+ expect_norm_string("user_plain_text", "user", "hello world", "hello world"); + expect_norm_string("assistant_non_json_text_kept", "assistant", "hi [unclosed", "hi [unclosed"); + + // #7324 boundary - user/system/developer literal "null" preserved (never parsed). + expect_norm_string("user_literal_null_stays", "user", "null", "null"); + expect_norm_string("system_literal_null_stays", "system", "null", "null"); + expect_norm_string("developer_literal_null_stays", "developer", "null", "null"); + + // #7324 - assistant/tool literal null collapses to empty string. + expect_norm_string("assistant_null_to_empty", "assistant", "null", ""); + expect_norm_string("tool_null_to_empty", "tool", "null", ""); + + // #7324/#7528 - assistant/tool object bookkeeping stringified (stays a string). + check(normalize_message_content("assistant", R"({"tool":"x"})").is_string(), "assistant_object_stringified"); + check(normalize_message_content("tool", R"({"error":"boom"})").is_string(), "tool_object_stringified"); + + // #10524-family - a bare scalar that parses as a JSON number stays the string. + expect_norm_string("assistant_scalar_number_stays_string", "assistant", "42", "42"); + + // baseline - empty content stays empty. + expect_norm_string("user_empty_stays_empty", "user", "", ""); +} + +// ---- normalize_template_message (BEFORE TEMPLATE sanitizer) --------------- + +static void test_template_sanitizer() { + // #7528 - a tool message with an ACTUAL array becomes a string. + { + ordered_json msg = {{"role", "tool"}, {"content", ordered_json::array({{{"type", "text"}, {"text", "r"}}})}}; + normalize_template_message(msg); + check(msg["content"].is_string(), "before_template_tool_array_to_string", "got " + msg["content"].dump()); + } + // #7324 - null content -> "" for any role. 
+ { + ordered_json msg = {{"role", "assistant"}, {"content", nullptr}}; + normalize_template_message(msg); + check(msg["content"].is_string() && msg["content"] == "", "before_template_null_to_empty"); + } + // object content -> dumped string (would otherwise throw at the template). + { + ordered_json msg = {{"role", "assistant"}, {"content", {{"x", 1}}}}; + normalize_template_message(msg); + check(msg["content"].is_string(), "before_template_object_to_string", "got " + msg["content"].dump()); + } + // missing content field -> "". + { + ordered_json msg = {{"role", "user"}}; + normalize_template_message(msg); + check(msg.contains("content") && msg["content"] == "", "before_template_missing_to_empty"); + } + // multimodal: a well-typed user array must be left UNTOUCHED (role!=tool). + { + ordered_json parts = ordered_json::array(); + parts.push_back({{"type", "text"}, {"text", "x"}}); + ordered_json img; img["type"] = "image_url"; img["image_url"] = {{"url", "data:..."}}; + parts.push_back(img); + ordered_json msg = {{"role", "user"}, {"content", parts}}; + normalize_template_message(msg); + check(msg["content"].is_array() && msg["content"].size() == 2, "before_template_user_typed_array_preserved", + "got " + msg["content"].dump()); + } + // a plain string is left untouched. + { + ordered_json msg = {{"role", "user"}, {"content", "hello"}}; + normalize_template_message(msg); + check(msg["content"] == "hello", "before_template_string_untouched"); + } +} + +// ---- build_reconstructed_message ---------------------------------------- + +static void test_reconstruction() { + const std::string ingredients = R"(["1/4 cup brown sugar","1 pound ground beef"])"; + + // #10524 end-state - user JSON-array text, no media -> string content. 
+ { + ReconstructedMessageInput in; + in.role = "user"; in.content = ingredients; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == ingredients, "recon_user_json_array_string", + "got " + m["content"].dump()); + } + // multimodal - user text + one image on last user msg -> typed array, image kept. + { + ReconstructedMessageInput in; + in.role = "user"; in.content = ingredients; in.is_last_user_msg = true; + in.images.push_back("BASE64IMG"); + auto m = build_reconstructed_message(in); + check(m["content"].is_array() && m["content"].size() == 2, "recon_multimodal_text_plus_image", + "got " + m["content"].dump()); + check(m["content"][0]["type"] == "text" && m["content"][0]["text"] == ingredients, "recon_multimodal_text_first"); + check(m["content"][1]["type"] == "image_url", "recon_multimodal_image_kept"); + } + // multimodal media-only - empty text + image on last user msg. + { + ReconstructedMessageInput in; + in.role = "user"; in.content = ""; in.is_last_user_msg = true; + in.images.push_back("BASE64IMG"); + auto m = build_reconstructed_message(in); + check(m["content"].is_array() && m["content"].size() == 1 && m["content"][0]["type"] == "image_url", + "recon_media_only", "got " + m["content"].dump()); + } + // #7528 - tool array-string content stays a string. + { + ReconstructedMessageInput in; + in.role = "tool"; in.content = R"(["a","b"])"; in.tool_call_id = "call_1"; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == R"(["a","b"])", "recon_tool_array_string", + "got " + m["content"].dump()); + check(m["tool_call_id"] == "call_1", "recon_tool_call_id_set"); + } + // tool empty content -> "". 
+ { + ReconstructedMessageInput in; + in.role = "tool"; in.content = ""; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == "", "recon_tool_empty_to_string"); + } + // #7324 - assistant + tool_calls + empty content -> " " (single space, not ""). + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = ""; + in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])"; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == " ", "recon_toolcalls_empty_content_space", + "got " + m["content"].dump()); + check(m["tool_calls"].is_array() && m["tool_calls"].size() == 1, "recon_toolcalls_parsed"); + } + // assistant + tool_calls + real content keeps the content. + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "I'll call f"; + in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])"; + auto m = build_reconstructed_message(in); + check(m["content"] == "I'll call f", "recon_toolcalls_with_content_kept"); + } + // assistant null content -> "". + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "null"; + auto m = build_reconstructed_message(in); + check(m["content"] == "", "recon_assistant_null_to_empty"); + } + // malformed tool_calls JSON must not throw; content preserved. + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "hi"; in.tool_calls = "{not json"; + auto m = build_reconstructed_message(in); + check(m["content"] == "hi" && !m.contains("tool_calls"), "recon_malformed_toolcalls_safe"); + } + // optional fields: name + reasoning carried through. 
+ { + ReconstructedMessageInput in; + in.role = "tool"; in.content = "result"; in.name = "get_weather"; in.reasoning_content = "thinking"; + auto m = build_reconstructed_message(in); + check(m["name"] == "get_weather" && m["reasoning_content"] == "thinking", "recon_optional_fields"); + } +} + +int main() { + test_normalize(); + test_template_sanitizer(); + test_reconstruction(); + + if (failures == 0) { + std::cout << "OK: all message_content tests passed\n"; + return 0; + } + std::cerr << failures << " test(s) failed\n"; + return 1; +} diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index 2a8a88f66..370af4215 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -36,6 +36,10 @@ done cp -r CMakeLists.txt llama.cpp/tools/grpc-server/ cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ +# Shared message-reconstruction helpers (included by grpc-server.cpp) and their +# unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON). +cp -r message_content.h llama.cpp/tools/grpc-server/ +cp -r message_content_test.cpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/ diff --git a/backend/cpp/run-unit-tests.sh b/backend/cpp/run-unit-tests.sh new file mode 100755 index 000000000..3f63faa40 --- /dev/null +++ b/backend/cpp/run-unit-tests.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# Discovers and runs every standalone C++ unit test under backend/cpp/. +# +# A "standalone" unit test is a *_test.cpp that depends only on the C++ standard +# library and nlohmann/json (single header) - i.e. it exercises pure helpers and +# does not need the full llama.cpp + gRPC backend build. Tests that DO need the +# backend build use the CMake/ctest path (e.g. -DLLAMA_GRPC_BUILD_TESTS=ON) +# instead and are skipped here. 
+# +# This keeps CI generic: adding a new pure-C++ unit test file named *_test.cpp in +# an active backend source dir is picked up automatically, with no CI edits. +# +# Env: +# NLOHMANN_INCLUDE include dir that contains nlohmann/json.hpp. If unset, the +# nlohmann/json single header is fetched to a temp dir. +# CXX compiler (default: g++). +# JSON_VERSION nlohmann/json tag to fetch when NLOHMANN_INCLUDE is unset +# (default: v3.11.3). +set -uo pipefail + +ROOT="$(cd "$(dirname "$0")" && pwd)" +CXX="${CXX:-g++}" +JSON_VERSION="${JSON_VERSION:-v3.11.3}" + +JSON_INC="${NLOHMANN_INCLUDE:-}" +if [ -z "$JSON_INC" ]; then + JSON_INC="$(mktemp -d)" + mkdir -p "$JSON_INC/nlohmann" + echo "Fetching nlohmann/json ${JSON_VERSION} single header..." + if ! curl -L -sf \ + "https://raw.githubusercontent.com/nlohmann/json/${JSON_VERSION}/single_include/nlohmann/json.hpp" \ + -o "$JSON_INC/nlohmann/json.hpp"; then + echo "ERROR: failed to fetch nlohmann/json header" >&2 + exit 1 + fi +fi + +# Active source dirs only - exclude per-variant build copies, dev snapshots and +# the vendored upstream llama.cpp tree. +mapfile -t tests < <(find "$ROOT" -name '*_test.cpp' \ + -not -path '*/llama.cpp/*' \ + -not -path '*-build/*' \ + -not -path '*-dev/*' \ + -not -path '*fallback*' | sort) + +if [ "${#tests[@]}" -eq 0 ]; then + echo "No standalone C++ unit tests found under $ROOT" + exit 0 +fi + +fail=0 +for test_src in "${tests[@]}"; do + name="$(basename "$test_src" .cpp)" + bin="$(mktemp -d)/$name" + echo "==> $test_src" + if ! "$CXX" -std=c++17 -Wall -Wextra \ + -I"$JSON_INC" -I"$(dirname "$test_src")" \ + "$test_src" -o "$bin"; then + echo "COMPILE FAILED: $test_src" >&2 + fail=1 + continue + fi + if ! 
"$bin"; then + echo "TEST FAILED: $test_src" >&2 + fail=1 + fi +done + +echo "Ran ${#tests[@]} standalone C++ unit test file(s)" +exit "$fail" diff --git a/backend/go/parakeet-cpp/Makefile b/backend/go/parakeet-cpp/Makefile index 7fc46f8e2..bf7450531 100644 --- a/backend/go/parakeet-cpp/Makefile +++ b/backend/go/parakeet-cpp/Makefile @@ -1,6 +1,6 @@ # parakeet-cpp backend Makefile. # -# Upstream pin lives below as PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a +# Upstream pin lives below as PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9 # (.github/bump_deps.sh) can find and update it - matches the # whisper.cpp / ds4 / vibevoice-cpp convention. # @@ -15,7 +15,7 @@ # That's what the L0 smoke test uses. The default target below does the # proper clone-at-pin + cmake build so CI doesn't need a side-checkout. -PARAKEET_VERSION?=89f5e2977b4d8bccd45e7bcc6f2ef7c4ed49e89a +PARAKEET_VERSION?=f469a57270a1cc4554acb15febf60e56619673b9 PARAKEET_REPO?=https://github.com/mudler/parakeet.cpp GOCMD?=go diff --git a/backend/python/fish-speech/requirements.txt b/backend/python/fish-speech/requirements.txt index 1be3c8250..528abf737 100644 --- a/backend/python/fish-speech/requirements.txt +++ b/backend/python/fish-speech/requirements.txt @@ -7,3 +7,7 @@ setuptools six scipy numpy +# fish-speech is installed editable with --no-build-isolation, so the build +# backends of its transitive deps must already be in the venv. One of them +# builds a Rust extension and needs setuptools-rust present at metadata time. +setuptools-rust diff --git a/backend/python/llama-cpp-quantization/install.sh b/backend/python/llama-cpp-quantization/install.sh index 05ac24f70..a9001ffaa 100755 --- a/backend/python/llama-cpp-quantization/install.sh +++ b/backend/python/llama-cpp-quantization/install.sh @@ -11,14 +11,31 @@ fi EXTRA_PIP_INSTALL_FLAGS+=" --upgrade " installRequirements -# Fetch convert_hf_to_gguf.py from llama.cpp +# Fetch convert_hf_to_gguf.py from llama.cpp. 
+# Upstream split the model-specific logic out of the single file into a +# sibling `conversion/` package (convert_hf_to_gguf.py now does +# `from conversion import ...`), so a single-file download no longer runs — +# it fails with `ModuleNotFoundError: No module named 'conversion'`. We clone +# the repo and copy both the script and the package; Python puts the script's +# own directory on sys.path[0], so the package resolves when placed beside it. LLAMA_CPP_CONVERT_VERSION="${LLAMA_CPP_CONVERT_VERSION:-master}" +LLAMA_CPP_SRC="${EDIR}/llama.cpp" CONVERT_SCRIPT="${EDIR}/convert_hf_to_gguf.py" -if [ ! -f "${CONVERT_SCRIPT}" ]; then - echo "Downloading convert_hf_to_gguf.py from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." - curl -L --fail --retry 3 \ - "https://raw.githubusercontent.com/ggml-org/llama.cpp/${LLAMA_CPP_CONVERT_VERSION}/convert_hf_to_gguf.py" \ - -o "${CONVERT_SCRIPT}" || echo "Warning: Failed to download convert_hf_to_gguf.py." + +cloneLlamaCpp() { + if [ ! -d "${LLAMA_CPP_SRC}/.git" ]; then + git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \ + https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \ + git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" + fi +} + +if [ ! -f "${CONVERT_SCRIPT}" ] || [ ! -d "${EDIR}/conversion" ]; then + echo "Fetching convert_hf_to_gguf.py + conversion/ from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." + cloneLlamaCpp + cp "${LLAMA_CPP_SRC}/convert_hf_to_gguf.py" "${CONVERT_SCRIPT}" + rm -rf "${EDIR}/conversion" + cp -r "${LLAMA_CPP_SRC}/conversion" "${EDIR}/conversion" fi # Install gguf package from the same llama.cpp commit to keep them in sync @@ -41,12 +58,7 @@ QUANTIZE_BIN="${EDIR}/llama-quantize" if [ ! -x "${QUANTIZE_BIN}" ] && ! command -v llama-quantize &>/dev/null; then if command -v cmake &>/dev/null; then echo "Building llama-quantize from llama.cpp (${LLAMA_CPP_CONVERT_VERSION})..." - LLAMA_CPP_SRC="${EDIR}/llama.cpp" - if [ ! 
-d "${LLAMA_CPP_SRC}" ]; then - git clone --depth 1 --branch "${LLAMA_CPP_CONVERT_VERSION}" \ - https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" 2>/dev/null || \ - git clone --depth 1 https://github.com/ggml-org/llama.cpp.git "${LLAMA_CPP_SRC}" - fi + cloneLlamaCpp # reuses the clone fetched for convert_hf_to_gguf.py cmake -B "${LLAMA_CPP_SRC}/build" -S "${LLAMA_CPP_SRC}" -DGGML_NATIVE=OFF -DBUILD_SHARED_LIBS=OFF cmake --build "${LLAMA_CPP_SRC}/build" --target llama-quantize -j"$(nproc 2>/dev/null || echo 2)" cp "${LLAMA_CPP_SRC}/build/bin/llama-quantize" "${QUANTIZE_BIN}" diff --git a/backend/python/sglang/install.sh b/backend/python/sglang/install.sh index 928f7bd11..68812f8a7 100755 --- a/backend/python/sglang/install.sh +++ b/backend/python/sglang/install.sh @@ -85,9 +85,15 @@ if [ "x${BUILD_TYPE}" == "x" ] || [ "x${FROM_SOURCE:-}" == "xtrue" ]; then # The resulting binary still requires an AVX-512 capable CPU at runtime, # same constraint sglang upstream documents in docker/xeon.Dockerfile. + # Pin the source build to the same release the GPU path floors on + # (0.5.11, see requirements-cublas12-after.txt). An unpinned master clone + # pulls in newer CPU kernels (e.g. mamba/fla.cpp) that fail to compile + # (constexpr non-constant + kineto_LIBRARY-NOTFOUND). Bump deliberately. 
+ SGLANG_VERSION="${SGLANG_VERSION:-v0.5.11}" _sgl_src=$(mktemp -d) trap 'rm -rf "${_sgl_src}"' EXIT - git clone --depth 1 https://github.com/sgl-project/sglang "${_sgl_src}/sglang" + git clone --depth 1 --branch "${SGLANG_VERSION}" \ + https://github.com/sgl-project/sglang "${_sgl_src}/sglang" # Patch -march=native → -march=sapphirerapids in the CPU kernel CMakeLists sed -i 's/-march=native/-march=sapphirerapids/g' \ diff --git a/backend/rust/kokoros/src/service.rs b/backend/rust/kokoros/src/service.rs index b980feb52..ef361b9dc 100644 --- a/backend/rust/kokoros/src/service.rs +++ b/backend/rust/kokoros/src/service.rs @@ -570,6 +570,43 @@ impl Backend for KokorosService { ) -> Result, Status> { Err(Status::unimplemented("Not supported")) } + + async fn sound_detection( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn depth( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn token_classify( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + async fn score( + &self, + _: Request, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } + + type ForwardStream = ReceiverStream>; + + async fn forward( + &self, + _: Request>, + ) -> Result, Status> { + Err(Status::unimplemented("Not supported")) + } } #[cfg(test)] diff --git a/core/application/startup.go b/core/application/startup.go index 1e5a7a73b..a71f8d0ea 100644 --- a/core/application/startup.go +++ b/core/application/startup.go @@ -16,6 +16,7 @@ import ( "github.com/mudler/LocalAI/core/services/galleryop" "github.com/mudler/LocalAI/core/services/jobs" "github.com/mudler/LocalAI/core/services/messaging" + "github.com/mudler/LocalAI/core/services/modeladmin" "github.com/mudler/LocalAI/core/services/monitoring" "github.com/mudler/LocalAI/core/services/nodes" "github.com/mudler/LocalAI/core/services/routing/admission" @@ -330,9 
+331,14 @@ func New(opts ...config.AppOption) (*Application, error) { gs := application.galleryService sys := options.SystemState cfgLoaderOpts := options.ToConfigLoaderOptions() - gs.OnModelsChanged = func(_ messaging.CacheInvalidateEvent) { - if err := application.ModelConfigLoader().LoadModelConfigsFromPath(sys.Model.ModelsPath, cfgLoaderOpts...); err != nil { - xlog.Warn("Failed to reload model configs after peer invalidation", "error", err) + gs.OnModelsChanged = func(evt messaging.CacheInvalidateEvent) { + // ApplyRemoteChange honors the op: a "delete" prunes the element + // (a reload-from-path is additive and cannot drop it), anything + // else reloads from disk; a named element's running instance is + // shut down so the new config takes effect. The originating + // replica reloads inline and never depends on this path. + if err := modeladmin.ApplyRemoteChange(application.ModelConfigLoader(), application.modelLoader, sys.Model.ModelsPath, evt, cfgLoaderOpts...); err != nil { + xlog.Warn("Failed to apply peer model config change", "error", err) } } if err := application.galleryService.SubscribeBroadcasts(); err != nil { diff --git a/core/http/endpoints/localai/config_meta.go b/core/http/endpoints/localai/config_meta.go index b45720b78..3db694512 100644 --- a/core/http/endpoints/localai/config_meta.go +++ b/core/http/endpoints/localai/config_meta.go @@ -155,7 +155,7 @@ func AutocompleteEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, a // @Param name path string true "Model name" // @Success 200 {object} map[string]any "success message" // @Router /api/models/config-json/{name} [patch] -func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { +func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc { svc := modeladmin.NewConfigService(cl, appConfig) return func(c 
echo.Context) error { modelName := c.Param("name") @@ -173,6 +173,14 @@ func PatchConfigEndpoint(cl *config.ModelConfigLoader, _ *model.ModelLoader, app if _, err := svc.PatchConfig(c.Request().Context(), modelName, patchMap); err != nil { return c.JSON(httpStatusForModelAdminError(err), map[string]any{"error": err.Error()}) } + + // Patch rewrites the config on disk and reloads only the local loader; + // tell peers to refresh so the change is consistent across replicas. + // No-op in standalone mode. + if gs != nil { + gs.BroadcastModelsChanged(modelName, "install") + } + return c.JSON(http.StatusOK, map[string]any{ "success": true, "message": fmt.Sprintf("Model '%s' updated successfully", modelName), diff --git a/core/http/endpoints/localai/config_meta_test.go b/core/http/endpoints/localai/config_meta_test.go index f56c14b00..e60f7e08d 100644 --- a/core/http/endpoints/localai/config_meta_test.go +++ b/core/http/endpoints/localai/config_meta_test.go @@ -45,7 +45,7 @@ var _ = Describe("Config Metadata Endpoints", func() { app = echo.New() app.GET("/api/models/config-metadata", ConfigMetadataEndpoint()) app.GET("/api/models/config-metadata/autocomplete/:provider", AutocompleteEndpoint(configLoader, modelLoader, appConfig)) - app.PATCH("/api/models/config-json/:name", PatchConfigEndpoint(configLoader, modelLoader, appConfig)) + app.PATCH("/api/models/config-json/:name", PatchConfigEndpoint(configLoader, modelLoader, nil, appConfig)) }) AfterEach(func() { diff --git a/core/http/endpoints/localai/edit_model.go b/core/http/endpoints/localai/edit_model.go index 4cc0477fb..5dd573751 100644 --- a/core/http/endpoints/localai/edit_model.go +++ b/core/http/endpoints/localai/edit_model.go @@ -10,6 +10,7 @@ import ( "github.com/labstack/echo/v4" "github.com/mudler/LocalAI/core/config" httpUtils "github.com/mudler/LocalAI/core/http/middleware" + "github.com/mudler/LocalAI/core/services/galleryop" "github.com/mudler/LocalAI/core/services/modeladmin" 
"github.com/mudler/LocalAI/internal" "github.com/mudler/LocalAI/pkg/model" @@ -55,7 +56,7 @@ func GetEditModelPage(cl *config.ModelConfigLoader, appConfig *config.Applicatio } // EditModelEndpoint handles updating existing model configurations -func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { +func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc { svc := modeladmin.NewConfigService(cl, appConfig) return func(c echo.Context) error { modelName := c.Param("name") @@ -70,6 +71,17 @@ func EditModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appC if err != nil { return c.JSON(httpStatusForModelAdminError(err), ModelResponse{Success: false, Error: err.Error()}) } + + // Tell peer replicas to refresh their in-memory config: this endpoint + // only reloaded the local loader. A rename is a delete of the old name + // plus an install of the new one. No-op in standalone mode. + if gs != nil { + if result.Renamed { + gs.BroadcastModelsChanged(result.OldName, "delete") + } + gs.BroadcastModelsChanged(result.NewName, "install") + } + msg := fmt.Sprintf("Model '%s' updated successfully. 
Model has been reloaded with new configuration.", result.NewName) if result.Renamed { msg = fmt.Sprintf("Model '%s' renamed to '%s' and updated successfully.", result.OldName, result.NewName) diff --git a/core/http/endpoints/localai/edit_model_test.go b/core/http/endpoints/localai/edit_model_test.go index 55328dc39..54ad2d5ec 100644 --- a/core/http/endpoints/localai/edit_model_test.go +++ b/core/http/endpoints/localai/edit_model_test.go @@ -56,7 +56,7 @@ var _ = Describe("Edit Model test", func() { app := echo.New() // Set up a simple renderer for the test app.Renderer = &testRenderer{} - app.POST("/import-model", ImportModelEndpoint(modelConfigLoader, applicationConfig)) + app.POST("/import-model", ImportModelEndpoint(modelConfigLoader, nil, applicationConfig)) app.GET("/edit-model/:name", GetEditModelPage(modelConfigLoader, applicationConfig)) requestBody := bytes.NewBufferString(`{"name": "foo", "backend": "foo", "model": "foo"}`) @@ -106,7 +106,7 @@ var _ = Describe("Edit Model test", func() { Expect(exists).To(BeTrue()) app := echo.New() - app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig)) + app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig)) newYAML := "name: newname\nbackend: llama\nmodel: foo\n" req := httptest.NewRequest("POST", "/models/edit/oldname", bytes.NewBufferString(newYAML)) @@ -163,7 +163,7 @@ var _ = Describe("Edit Model test", func() { Expect(modelConfigLoader.LoadModelConfigsFromPath(tempDir)).To(Succeed()) app := echo.New() - app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig)) + app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig)) req := httptest.NewRequest( "POST", @@ -204,7 +204,7 @@ var _ = Describe("Edit Model test", func() { Expect(modelConfigLoader.LoadModelConfigsFromPath(tempDir)).To(Succeed()) app := echo.New() - 
app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, applicationConfig)) + app.POST("/models/edit/:name", EditModelEndpoint(modelConfigLoader, modelLoader, nil, applicationConfig)) req := httptest.NewRequest( "POST", diff --git a/core/http/endpoints/localai/import_model.go b/core/http/endpoints/localai/import_model.go index 54a80a9cc..21b7673da 100644 --- a/core/http/endpoints/localai/import_model.go +++ b/core/http/endpoints/localai/import_model.go @@ -125,7 +125,7 @@ func ImportModelURIEndpoint(cl *config.ModelConfigLoader, appConfig *config.Appl } // ImportModelEndpoint handles creating new model configurations -func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { +func ImportModelEndpoint(cl *config.ModelConfigLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc { return func(c echo.Context) error { // Get the raw body body, err := io.ReadAll(c.Request().Body) @@ -245,6 +245,13 @@ func ImportModelEndpoint(cl *config.ModelConfigLoader, appConfig *config.Applica } return c.JSON(http.StatusInternalServerError, response) } + // Tell peer replicas to load the newly-created config from the shared + // models dir: this endpoint only reloaded the local loader. No-op in + // standalone mode. 
+ if gs != nil { + gs.BroadcastModelsChanged(modelConfig.Name, "install") + } + // Return success response response := ModelResponse{ Success: true, diff --git a/core/http/endpoints/localai/toggle_model.go b/core/http/endpoints/localai/toggle_model.go index e4e71ca64..545fdc8af 100644 --- a/core/http/endpoints/localai/toggle_model.go +++ b/core/http/endpoints/localai/toggle_model.go @@ -7,6 +7,7 @@ import ( "github.com/labstack/echo/v4" "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/services/galleryop" "github.com/mudler/LocalAI/core/services/modeladmin" "github.com/mudler/LocalAI/pkg/model" ) @@ -24,7 +25,7 @@ import ( // @Failure 404 {object} ModelResponse // @Failure 500 {object} ModelResponse // @Router /api/models/{name}/{action} [put] -func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, appConfig *config.ApplicationConfig) echo.HandlerFunc { +func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoader, gs *galleryop.GalleryService, appConfig *config.ApplicationConfig) echo.HandlerFunc { svc := modeladmin.NewConfigService(cl, appConfig) return func(c echo.Context) error { modelName := c.Param("name") @@ -36,6 +37,14 @@ func ToggleStateModelEndpoint(cl *config.ModelConfigLoader, ml *model.ModelLoade if err != nil { return c.JSON(httpStatusForModelAdminError(err), ModelResponse{Success: false, Error: err.Error()}) } + + // Enabling/disabling rewrites the config on disk and reloads only the + // local loader; tell peers to refresh so the model's availability is + // consistent across replicas. No-op in standalone mode. + if gs != nil { + gs.BroadcastModelsChanged(modelName, "install") + } + msg := fmt.Sprintf("Model '%s' has been %sd successfully.", modelName, action) if action == modeladmin.ActionDisable { msg += " The model will not be loaded on demand until re-enabled." 
diff --git a/core/http/routes/localai.go b/core/http/routes/localai.go index 212f379f0..763623a7f 100644 --- a/core/http/routes/localai.go +++ b/core/http/routes/localai.go @@ -72,19 +72,19 @@ func RegisterLocalAIRoutes(router *echo.Echo, router.POST("/backends/upgrades/check", backendGalleryEndpointService.CheckUpgradesEndpoint(), adminMiddleware) router.POST("/backends/upgrade/:name", backendGalleryEndpointService.UpgradeBackendEndpoint(), adminMiddleware) // Custom model import endpoint - router.POST("/models/import", localai.ImportModelEndpoint(cl, appConfig), adminMiddleware) + router.POST("/models/import", localai.ImportModelEndpoint(cl, galleryService, appConfig), adminMiddleware) // URI model import endpoint router.POST("/models/import-uri", localai.ImportModelURIEndpoint(cl, appConfig, galleryService, opcache), adminMiddleware) // Custom model edit endpoint - router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, appConfig), adminMiddleware) + router.POST("/models/edit/:name", localai.EditModelEndpoint(cl, ml, galleryService, appConfig), adminMiddleware) // List model aliases endpoint router.GET("/api/aliases", localai.ListAliasesEndpoint(cl), adminMiddleware) // Toggle model enable/disable endpoint - router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, appConfig), adminMiddleware) + router.PUT("/models/toggle-state/:name/:action", localai.ToggleStateModelEndpoint(cl, ml, galleryService, appConfig), adminMiddleware) // Toggle model pinned status endpoint router.PUT("/models/toggle-pinned/:name/:action", localai.TogglePinnedModelEndpoint(cl, appConfig, func() { diff --git a/core/http/routes/ui_api.go b/core/http/routes/ui_api.go index e26894273..d9c99c6b9 100644 --- a/core/http/routes/ui_api.go +++ b/core/http/routes/ui_api.go @@ -922,7 +922,7 @@ func RegisterUIAPIRoutes(app *echo.Echo, cl *config.ModelConfigLoader, ml *model app.GET("/api/models/config-metadata/autocomplete/:provider", 
localai.AutocompleteEndpoint(cl, ml, appConfig), adminMiddleware) // PATCH config endpoint - partial update using nested JSON merge - app.PATCH("/api/models/config-json/:name", localai.PatchConfigEndpoint(cl, ml, appConfig), adminMiddleware) + app.PATCH("/api/models/config-json/:name", localai.PatchConfigEndpoint(cl, ml, galleryService, appConfig), adminMiddleware) // VRAM estimation endpoint app.POST("/api/models/vram-estimate", localai.VRAMEstimateEndpoint(cl, appConfig), adminMiddleware) diff --git a/core/schema/message_test.go b/core/schema/message_test.go index da11d9d20..eb471b57b 100644 --- a/core/schema/message_test.go +++ b/core/schema/message_test.go @@ -68,6 +68,32 @@ var _ = Describe("LLM tests", func() { Expect(protoMessages[0].Content).To(Equal("Hello World")) }) + // Regression for mudler/LocalAI#10524: a text part whose inner text is + // itself a JSON-array string (mealie sends an ingredient list) must + // flatten to that exact string verbatim. ToProto must NOT escape or + // restructure it - the C++ backend then treats it as opaque text. This + // pins the precise Go-side input that produced the "unsupported + // content[].type" gRPC error before the backend stopped re-parsing it. 
+ It("flattens a JSON-array-looking text part to the verbatim string (#10524)", func() { + ingredients := `["1/4 cup brown sugar, packed","1 pound ground beef"]` + messages := Messages{ + { + Role: "user", + Content: []any{ + map[string]any{ + "type": "text", + "text": ingredients, + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Content).To(Equal(ingredients)) + }) + It("should convert message with tool_calls", func() { messages := Messages{ { diff --git a/core/services/galleryop/distributed_sync_test.go b/core/services/galleryop/distributed_sync_test.go index 7c1087de8..71a96c7ae 100644 --- a/core/services/galleryop/distributed_sync_test.go +++ b/core/services/galleryop/distributed_sync_test.go @@ -404,6 +404,36 @@ var _ = Describe("GalleryService cache invalidation broadcasts", func() { Element: "x", Op: "install", })).To(Succeed()) }) + + It("BroadcastModelsChanged delivers the element and op to a peer's OnModelsChanged", func() { + var ( + mu sync.Mutex + seen []messaging.CacheInvalidateEvent + ) + svcB.OnModelsChanged = func(evt messaging.CacheInvalidateEvent) { + mu.Lock() + seen = append(seen, evt) + mu.Unlock() + } + Expect(svcA.SubscribeBroadcasts()).To(Succeed()) + Expect(svcB.SubscribeBroadcasts()).To(Succeed()) + + // An admin edit on replica A must reach replica B over the same subject + // the gallery path uses, so B refreshes its in-memory config loader. + svcA.BroadcastModelsChanged("my-alias", "install") + + mu.Lock() + defer mu.Unlock() + Expect(seen).To(ContainElement(messaging.CacheInvalidateEvent{ + Element: "my-alias", Op: "install", + })) + }) + + It("BroadcastModelsChanged is a no-op when NATS is not wired (standalone)", func() { + standalone := galleryop.NewGalleryService(&config.ApplicationConfig{}, nil) + // No SetNATSClient: must not panic and must simply do nothing. 
+ Expect(func() { standalone.BroadcastModelsChanged("x", "delete") }).ToNot(Panic()) + }) }) var _ = Describe("GalleryService PostgreSQL hydration", func() { diff --git a/core/services/galleryop/service.go b/core/services/galleryop/service.go index d01d9cc19..abe399088 100644 --- a/core/services/galleryop/service.go +++ b/core/services/galleryop/service.go @@ -201,6 +201,24 @@ func (g *GalleryService) publishCacheInvalidate(subject string, evt messaging.Ca } } +// BroadcastModelsChanged notifies peer replicas that a model config was +// created, edited, or removed out-of-band of the gallery install/delete +// channel (e.g. the admin /models/edit, /models/import and +// /models/toggle-state endpoints, which write the YAML and reload only the +// local in-memory loader). Peers receive it via OnModelsChanged and refresh +// their own ModelConfigLoader so a request load-balanced to any replica sees +// the same config. No-op in standalone mode (no NATS client). +// +// op is "install" for a create/edit (the element must be (re)loaded from +// disk) or "delete" for a removal (the element must be pruned from memory, +// which a reload-from-path cannot do because the loader is additive). +func (g *GalleryService) BroadcastModelsChanged(element, op string) { + g.publishCacheInvalidate(messaging.SubjectCacheInvalidateModels, messaging.CacheInvalidateEvent{ + Element: element, + Op: op, + }) +} + // mergeStatus is the broadcast-side merge: it updates the in-memory map from // a peer's GalleryProgressEvent without re-publishing to NATS or re-writing // to PostgreSQL. 
UpdateStatus is the local-write entry point and does both; diff --git a/core/services/modeladmin/remote_sync.go b/core/services/modeladmin/remote_sync.go new file mode 100644 index 000000000..5acf5bf9a --- /dev/null +++ b/core/services/modeladmin/remote_sync.go @@ -0,0 +1,53 @@ +package modeladmin + +import ( + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/services/messaging" + "github.com/mudler/LocalAI/pkg/model" + + "github.com/mudler/xlog" +) + +// opDelete is the CacheInvalidateEvent.Op value the gallery delete path and the +// admin delete endpoint use; a delete must prune (a reload-from-path cannot). +const opDelete = "delete" + +// ApplyRemoteChange refreshes this replica's in-memory model state from a peer +// replica's model-config change broadcast (messaging.CacheInvalidateEvent on +// SubjectCacheInvalidateModels). It is the subscriber-side counterpart to +// GalleryService.BroadcastModelsChanged. +// +// The op matters because LoadModelConfigsFromPath is additive: it loads every +// YAML on disk into the loader but never removes an entry whose file is gone. +// So a delete cannot be propagated by a plain reload - the deleted element must +// be explicitly pruned. Specifically: +// +// - op == "delete" with a named element: prune that element from the loader. +// - otherwise: reload all configs from disk (picks up creates and edits). +// +// In both cases, when an element is named, any running instance on this replica +// is shut down (best-effort) so the next request rebuilds it from the new +// config instead of serving the stale one - mirroring what the originating +// replica does on a local edit/delete. +// +// ml may be nil (no running instances to shut down). modelsPath and opts are +// forwarded to LoadModelConfigsFromPath. 
+func ApplyRemoteChange(cl *config.ModelConfigLoader, ml *model.ModelLoader, modelsPath string, evt messaging.CacheInvalidateEvent, opts ...config.ConfigLoaderOption) error { + if evt.Op == opDelete && evt.Element != "" { + cl.RemoveModelConfig(evt.Element) + } else if err := cl.LoadModelConfigsFromPath(modelsPath, opts...); err != nil { + return err + } + + // Drop any running instance of the affected model so the next request + // rebuilds it from the refreshed config instead of serving the stale one. + // Best-effort: the model may not be loaded on this replica, which surfaces + // as a benign error here. + if ml != nil && evt.Element != "" { + if err := ml.ShutdownModel(evt.Element); err != nil { + xlog.Debug("ApplyRemoteChange: could not shut down model instance (likely not loaded)", + "model", evt.Element, "error", err) + } + } + return nil +} diff --git a/core/services/modeladmin/remote_sync_test.go b/core/services/modeladmin/remote_sync_test.go new file mode 100644 index 000000000..df4907a02 --- /dev/null +++ b/core/services/modeladmin/remote_sync_test.go @@ -0,0 +1,80 @@ +package modeladmin + +import ( + "os" + "path/filepath" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "gopkg.in/yaml.v3" + + "github.com/mudler/LocalAI/core/config" + "github.com/mudler/LocalAI/core/services/messaging" +) + +var _ = Describe("ApplyRemoteChange", func() { + var ( + dir string + loader *config.ModelConfigLoader + ) + + BeforeEach(func() { + dir = GinkgoT().TempDir() + loader = config.NewModelConfigLoader(dir) + }) + + writeYAML := func(name string, body map[string]any) { + body["name"] = name + data, err := yaml.Marshal(body) + Expect(err).ToNot(HaveOccurred()) + Expect(os.WriteFile(filepath.Join(dir, name+".yaml"), data, 0644)).To(Succeed()) + } + + It("loads a peer-created config from disk on an install event", func() { + // Peer wrote the YAML to the shared models dir; this replica has not + // loaded it yet (empty in-memory loader). 
+ writeYAML("peer-alias", map[string]any{"alias": "qwen"}) + _, ok := loader.GetModelConfig("peer-alias") + Expect(ok).To(BeFalse(), "precondition: not yet in memory") + + err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{ + Element: "peer-alias", Op: "install", + }) + Expect(err).ToNot(HaveOccurred()) + + _, ok = loader.GetModelConfig("peer-alias") + Expect(ok).To(BeTrue(), "install event must reload the new config from disk") + }) + + It("prunes a peer-deleted config that a reload-from-path cannot drop", func() { + // Model is present in memory (loaded earlier) but its file is now gone + // from the shared dir. LoadModelConfigsFromPath is additive, so only an + // explicit prune can remove it - this is the cross-replica delete bug. + writeYAML("doomed", map[string]any{"alias": "qwen"}) + Expect(loader.LoadModelConfigsFromPath(dir)).To(Succeed()) + _, ok := loader.GetModelConfig("doomed") + Expect(ok).To(BeTrue(), "precondition: in memory") + Expect(os.Remove(filepath.Join(dir, "doomed.yaml"))).To(Succeed()) + + err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{ + Element: "doomed", Op: "delete", + }) + Expect(err).ToNot(HaveOccurred()) + + _, ok = loader.GetModelConfig("doomed") + Expect(ok).To(BeFalse(), "delete event must prune the element from memory") + }) + + It("does a full reload when no element is named", func() { + writeYAML("m1", map[string]any{"alias": "qwen"}) + writeYAML("m2", map[string]any{"alias": "qwen"}) + + err := ApplyRemoteChange(loader, nil, dir, messaging.CacheInvalidateEvent{}) + Expect(err).ToNot(HaveOccurred()) + + _, ok1 := loader.GetModelConfig("m1") + _, ok2 := loader.GetModelConfig("m2") + Expect(ok1).To(BeTrue()) + Expect(ok2).To(BeTrue()) + }) +}) diff --git a/docs/data/version.json b/docs/data/version.json index f8cc39cee..944cb9836 100644 --- a/docs/data/version.json +++ b/docs/data/version.json @@ -1,3 +1,3 @@ { - "version": "v4.5.0" + "version": "v4.5.2" } diff --git 
a/gallery/index.yaml b/gallery/index.yaml index f45939146..e8fa3627f 100644 --- a/gallery/index.yaml +++ b/gallery/index.yaml @@ -221,6 +221,54 @@ - filename: llama-cpp/models/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF/Qwen3.6-35B-A3B-NVFP4-MTP-TURBO.gguf sha256: f3d2fdc74e3ef19925ccbf794b04d7f6f11fb12eba7722b7749219d0cc5c36ed uri: https://huggingface.co/michaelw9999/Qwen3.6-35B-A3B-NVFP4-MTP-GGUF/resolve/main/Qwen3.6-35B-A3B-NVFP4-MTP-TURBO.gguf +- name: "qwen-agentworld-35b-a3b" + url: "github:mudler/LocalAI/gallery/virtual.yaml@master" + urls: + - https://huggingface.co/unsloth/Qwen-AgentWorld-35B-A3B-GGUF + description: | + # Qwen-AgentWorld-35B-A3B + + 📑 Technical Report | + 📖 Blog | + 🤗 Hugging Face | + 🤖 ModelScope | + 💻 GitHub | + 🖥️ Demo + + > [!Note] + > This repository contains the model weights and configuration files for **Qwen-AgentWorld-35B-A3B**, a native language world model trained for agentic environment simulation. + > + > These artifacts are compatible with Hugging Face Transformers, vLLM, SGLang, etc. + + **Qwen-AgentWorld** is the first language world model to cover seven agent interaction domains within a single model. It simulates agentic environments via long chain-of-thought reasoning, predicting the next environment state given an agent's action and interaction history. Trained through a three-stage pipeline — CPT injects environment knowledge, SFT activates next-state-prediction reasoning, RL sharpens simulation fidelity — Qwen-AgentWorld is a **native world model**: environment modeling is the training objective from the CPT stage onward, not a post-hoc add-on. + + ## Highlights + + ... 
+ license: "apache-2.0" + tags: + - llm + - gguf + - qwen + icon: https://qianwen-res.oss-accelerate-overseas.aliyuncs.com/Qwen-AgentWorld/logo.png + overrides: + backend: llama-cpp + function: + automatic_tool_parsing_fallback: true + grammar: + disable: true + known_usecases: + - chat + options: + - use_jinja:true + parameters: + model: llama-cpp/models/Qwen-AgentWorld-35B-A3B-GGUF/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf + template: + use_tokenizer_template: true + files: + - filename: llama-cpp/models/Qwen-AgentWorld-35B-A3B-GGUF/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf + sha256: e7a8eafdd8013443b6bcc4b6fb47b2d2025f772d359650b9ceb7d75971e22cad + uri: https://huggingface.co/unsloth/Qwen-AgentWorld-35B-A3B-GGUF/resolve/main/Qwen-AgentWorld-35B-A3B-UD-Q4_K_M.gguf - name: "ornith-1.0-9b" url: "github:mudler/LocalAI/gallery/virtual.yaml@master" urls: diff --git a/scripts/build/golang-darwin.sh b/scripts/build/golang-darwin.sh index fddd4bc4f..d2e72eac9 100644 --- a/scripts/build/golang-darwin.sh +++ b/scripts/build/golang-darwin.sh @@ -17,9 +17,15 @@ rm -rf "${BACKEND_DIR}"/build-* # run.sh's final `exec $CURDIR/<binary>` is the contract for what gets launched; # the binary is not always named after the backend (e.g. parakeet-cpp launches # parakeet-cpp-grpc), so derive it from run.sh and fall back to ${BACKEND}. +# +# Only scan the `exec` line(s): many run.sh select a runtime CPU variant via +# unquoted `LIBRARY=$CURDIR/libgo-avx512.so` lines, and a whole-file grep +# would pick the last of those (avx512, which Darwin never builds) instead of +# the binary — failing the check below for whisper/sam3-cpp/vibevoice-cpp/... +# Also tolerate the exec being quoted (`exec "$CURDIR"/<binary>`).
RUN_BINARY="" if [ -f "${BACKEND_DIR}/run.sh" ]; then - RUN_BINARY=$(grep -oE '\$CURDIR/[A-Za-z0-9._-]+' "${BACKEND_DIR}/run.sh" | grep -v 'ld\.so' | tail -1 | sed 's|\$CURDIR/||') + RUN_BINARY=$(grep -E '^[[:space:]]*exec[[:space:]]' "${BACKEND_DIR}/run.sh" | grep -oE '"?\$CURDIR"?/[A-Za-z0-9._-]+' | grep -v 'ld\.so' | tail -1 | sed -E 's|"?\$CURDIR"?/||') fi RUN_BINARY="${RUN_BINARY:-${BACKEND}}" diff --git a/scripts/build/package-gpu-libs.sh b/scripts/build/package-gpu-libs.sh index 40f410173..17c0d0ca8 100755 --- a/scripts/build/package-gpu-libs.sh +++ b/scripts/build/package-gpu-libs.sh @@ -141,6 +141,38 @@ copy_elf_deps() { done < <(ldd "$elf" 2>/dev/null | awk '/=>/ && $3 ~ /^\// {print $3}') } +# Sweep the transitive shared-library dependencies of everything already +# bundled in a lib dir. The per-vendor packagers below copy an explicit +# allowlist of top-level runtime libs, but those libs pull in transitive deps +# that aren't in the list (e.g. ROCm's librocprofiler-register.so.0, libnuma, +# libdrm_amdgpu). Because backends run through the bundled lib/ld.so with +# LD_LIBRARY_PATH=lib (see run.sh), an unbundled transitive dep is a hard load +# failure (issue #10537: "librocprofiler-register.so.0: cannot open shared +# object file"). ldd resolves the full recursive closure, so a single pass over +# the already-bundled libs is enough; core libc-family deps are skipped via +# copy_elf_deps/is_core_lib so we never shadow the loader's own libc/libstdc++. +sweep_transitive_deps() { + local dir="${1:-$TARGET_LIB_DIR}" + command -v ldd >/dev/null 2>&1 || return 0 + + # Snapshot the current set first: copy_elf_deps adds files as it runs, and + # ldd already returns the full recursive closure, so we only need to sweep + # the libs that were present before the sweep started. + # `local x=$(...)` keeps set -e from tripping on shopt -p's nonzero exit. 
+ local old_nullglob=$(shopt -p nullglob) + shopt -s nullglob + local libs=("$dir"/*.so*) + eval "$old_nullglob" + + local lib + for lib in "${libs[@]}"; do + [ -e "$lib" ] || continue + # Skip symlinks: their real target is in the snapshot and gets swept. + [ -L "$lib" ] && continue + copy_elf_deps "$lib" + done +} + # Package NVIDIA CUDA libraries package_cuda_libs() { echo "Packaging CUDA libraries for BUILD_TYPE=${BUILD_TYPE}..." @@ -185,6 +217,10 @@ package_cuda_libs() { # cp -arfL /usr/local/cuda/targets "$TARGET_LIB_DIR/../cuda/" 2>/dev/null || true # fi + # Pull in transitive deps the allowlist misses so the backend is + # self-contained (same class of failure as #10537). + sweep_transitive_deps "$TARGET_LIB_DIR" + echo "CUDA libraries packaged successfully" } @@ -261,6 +297,10 @@ package_rocm_libs() { fi done + # Pull in transitive deps the allowlist misses (librocprofiler-register.so.0, + # libnuma, libdrm_amdgpu, ...) so the backend is self-contained. See #10537. + sweep_transitive_deps "$TARGET_LIB_DIR" + echo "ROCm libraries packaged successfully" } @@ -303,6 +343,10 @@ package_intel_libs() { fi done + # Pull in transitive deps the allowlist misses so the backend is + # self-contained (same class of failure as #10537). + sweep_transitive_deps "$TARGET_LIB_DIR" + echo "Intel oneAPI libraries packaged successfully" } @@ -432,6 +476,7 @@ export -f copy_lib export -f copy_libs_glob export -f is_core_lib export -f copy_elf_deps +export -f sweep_transitive_deps export -f package_cuda_libs export -f package_rocm_libs export -f package_intel_libs diff --git a/scripts/build/package-gpu-libs_test.sh b/scripts/build/package-gpu-libs_test.sh new file mode 100755 index 000000000..39f8331c0 --- /dev/null +++ b/scripts/build/package-gpu-libs_test.sh @@ -0,0 +1,54 @@ +#!/bin/bash +# Regression test for scripts/build/package-gpu-libs.sh. 
+# +# Guards issue #10537: the per-vendor packagers copy an explicit allowlist of +# top-level GPU runtime libs but used to miss their transitive dependencies +# (e.g. ROCm's librocprofiler-register.so.0). Since backends run through the +# bundled lib/ld.so with LD_LIBRARY_PATH=lib, an unbundled transitive dep is a +# fatal "cannot open shared object file" at load time. +# +# This test fabricates a primary lib that links a transitive lib, simulates the +# allowlist step (primary copied, transitive not), and asserts the transitive +# sweep pulls the dependency in. Requires gcc + ldd (present in build images). +set -euo pipefail + +CURDIR=$(dirname "$(realpath "$0")") +SCRIPT="$CURDIR/package-gpu-libs.sh" + +if ! command -v gcc >/dev/null 2>&1 || ! command -v ldd >/dev/null 2>&1; then + echo "SKIP: gcc/ldd not available" + exit 0 +fi + +WORK=$(mktemp -d) +trap 'rm -rf "$WORK"' EXIT + +# Transitive dependency (stand-in for librocprofiler-register.so.0). +echo 'int transitive_fn(void){return 42;}' > "$WORK/transitive.c" +gcc -shared -fPIC -o "$WORK/libfaketransitive.so.0" "$WORK/transitive.c" + +# Primary allowlisted lib (stand-in for libhipblas.so) that links it. +echo 'int transitive_fn(void); int primary_fn(void){return transitive_fn();}' > "$WORK/primary.c" +gcc -shared -fPIC -o "$WORK/libfakeprimary.so.0" "$WORK/primary.c" \ + -L"$WORK" -l:libfaketransitive.so.0 -Wl,-rpath,"$WORK" + +# Simulate the allowlist step: primary already bundled, transitive not. +TARGET="$WORK/target" +mkdir -p "$TARGET" +cp "$WORK/libfakeprimary.so.0" "$TARGET/" + +# Make the transitive dep resolvable like /opt/rocm libs are in the build image. 
+export LD_LIBRARY_PATH="$WORK:${LD_LIBRARY_PATH:-}" + +# shellcheck source=/dev/null +source "$SCRIPT" "$TARGET" +sweep_transitive_deps "$TARGET" + +if [ -e "$TARGET/libfaketransitive.so.0" ]; then + echo "PASS: transitive dependency was bundled by sweep_transitive_deps" + exit 0 +fi + +echo "FAIL: transitive dependency was NOT bundled (regression of #10537)" +ls -la "$TARGET" +exit 1