From f0d0bff2320c7a7b0a93623949c4f00fc6c61613 Mon Sep 17 00:00:00 2001 From: "LocalAI [bot]" <139863280+localai-bot@users.noreply.github.com> Date: Sat, 27 Jun 2026 01:42:05 +0200 Subject: [PATCH] fix(llama-cpp): stop reinterpreting plain-string message content as JSON (#10524) (#10538) The llama-cpp gRPC backend reconstructs OpenAI messages from proto for the tokenizer-template path and blindly json::parse'd each message's content string. LocalAI's Go layer always flattens content to a plain string, so a user prompt that merely looks like JSON (e.g. mealie's ingredient array ["1/4 cup brown sugar", ...]) was reinterpreted as structured content parts and rejected by oaicompat_chat_params_parse with "unsupported content[].type". Normalize content per role instead: user/system/developer content is opaque text and is never JSON-sniffed; assistant/tool content still collapses a literal JSON null/object (tool-call bookkeeping) to a string, but a plain string is never turned into an array/scalar. The array defense is role-independent, so the role gate only governs the benign null/object case. While here, extract the duplicated per-message reconstruction and the pre-template content sanitization into shared, unit-tested helpers (message_content.h) so the streaming (PredictStream) and non-streaming (Predict) paths cannot drift. This removes ~490 lines of copy-pasted defensive code, the dead tool-role parse branches, and the redundant Predict-only tool_calls branch, while preserving the prior #7324 (null content -> "") and #7528 (tool array content -> string) fixes. Tests: - backend/cpp/llama-cpp/message_content_test.cpp: standalone C++ unit tests for all three helpers (#10524, #7324, #7528, multimodal), discovered and run by `make test-backend-cpp` and a new generic tests-backend-cpp CI job. Also wired as an opt-in CMake/ctest target (-DLLAMA_GRPC_BUILD_TESTS=ON). - core/schema/message_test.go: Go regression pinning that ToProto flattens a JSON-array-looking text part to the verbatim string. - prepare.sh now copies message_content.h into the build tree. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto Co-authored-by: Ettore Di Giacinto --- .github/workflows/test.yml | 16 + Makefile | 9 +- backend/cpp/llama-cpp/CMakeLists.txt | 15 + backend/cpp/llama-cpp/grpc-server.cpp | 581 +----------------- backend/cpp/llama-cpp/message_content.h | 192 ++++++ .../cpp/llama-cpp/message_content_test.cpp | 234 +++++++ backend/cpp/llama-cpp/prepare.sh | 4 + backend/cpp/run-unit-tests.sh | 71 +++ core/schema/message_test.go | 26 + 9 files changed, 595 insertions(+), 553 deletions(-) create mode 100644 backend/cpp/llama-cpp/message_content.h create mode 100644 backend/cpp/llama-cpp/message_content_test.cpp create mode 100755 backend/cpp/run-unit-tests.sh diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index df5512283..e18e38b62 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -121,3 +121,19 @@ jobs: detached: true connect-timeout-seconds: 180 limit-access-to-actor: true + + # Fast standalone unit tests for the backends' pure C++ helpers - currently the + # llama-cpp message reconstruction (backend/cpp/llama-cpp/message_content.h), + # which guards the OpenAI chat content normalization (mudler/LocalAI#10524, + # #7324, #7528). The runner discovers every *_test.cpp under backend/cpp/, so + # new pure-C++ unit tests are picked up with no CI changes. These need only the + # C++ stdlib + nlohmann/json, so they run on every PR without the full + # llama.cpp + gRPC backend build. (The same suite is also wired as an opt-in + # CMake/ctest target, -DLLAMA_GRPC_BUILD_TESTS=ON, for in-backend-build runs.) + tests-backend-cpp: + runs-on: ubuntu-latest + steps: + - name: Clone + uses: actions/checkout@v7 + - name: Run backend C++ unit tests + run: make test-backend-cpp diff --git a/Makefile b/Makefile index 600f6cbbe..2a8edc3fc 100644 --- a/Makefile +++ b/Makefile @@ -103,7 +103,7 @@ COVERAGE_E2E_LABELS?=!real-models COVERAGE_EXCLUDE_RE?=grpc/proto/.*[.]pb[.]go -.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all +.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-backend-cpp test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all all: help @@ -201,6 +201,13 @@ test: prepare-test OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \ $(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS) +## Compiles and runs the standalone C++ unit tests for the backends (pure +## helpers that depend only on the stdlib + nlohmann/json, no full backend +## build). Discovers every *_test.cpp under backend/cpp/ - see +## backend/cpp/run-unit-tests.sh. Set NLOHMANN_INCLUDE to skip the header fetch. +test-backend-cpp: + bash backend/cpp/run-unit-tests.sh + ## Runs the core suite ($(TEST_PATHS)) with statement-coverage instrumentation ## and writes a merged profile to $(COVERAGE_PROFILE). Deliberately omits ## --fail-fast so a single failure doesn't truncate the coverage number, and diff --git a/backend/cpp/llama-cpp/CMakeLists.txt b/backend/cpp/llama-cpp/CMakeLists.txt index bdf20802a..8b8d2e2d5 100644 --- a/backend/cpp/llama-cpp/CMakeLists.txt +++ b/backend/cpp/llama-cpp/CMakeLists.txt @@ -87,3 +87,18 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11) if(TARGET BUILD_INFO) add_dependencies(${TARGET} BUILD_INFO) endif() + +# Unit test for the message-content normalization helper (message_content.h). +# Off by default so the normal backend build is untouched; enable with +# -DLLAMA_GRPC_BUILD_TESTS=ON and run via ctest. It reuses llama.cpp's vendored +# (propagated by the common helpers library) so it has no +# extra dependency beyond what the backend already builds against. +option(LLAMA_GRPC_BUILD_TESTS "Build grpc-server unit tests" OFF) +if(LLAMA_GRPC_BUILD_TESTS) + enable_testing() + add_executable(message_content_test message_content_test.cpp message_content.h) + target_include_directories(message_content_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}) + target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET}) + target_compile_features(message_content_test PRIVATE cxx_std_17) + add_test(NAME message_content_test COMMAND message_content_test) +endif() diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 6907b9122..9d17e23b1 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -39,6 +39,7 @@ #include "common.h" #include "arg.h" #include "chat-auto-parser.h" +#include "message_content.h" #include #include #include @@ -1616,242 +1617,20 @@ public: for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); - json msg_json; - msg_json["role"] = msg.role(); - - bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); - - // Handle content - can be string, null, or array - // For multimodal content, we'll embed images/audio from separate fields - if (!msg.content().empty()) { - // Try to parse content as JSON to see if it's already an array - json content_val; - try { - content_val = json::parse(msg.content()); - // Handle null values - convert to empty string to avoid template errors - if (content_val.is_null()) { - content_val = ""; - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - content_val = msg.content(); - } - - // If content is an object (e.g., from tool call failures), convert to string - if (content_val.is_object()) { - content_val = content_val.dump(); - } - - // If content is a string and this is the last user message with images/audio, combine them - if (content_val.is_string() && is_last_user_msg && has_images_or_audio) { - json content_array = json::array(); - // Add text first - content_array.push_back({{"type", "text"}, {"text", content_val.get()}}); - // Add images - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - // Add audios - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else { - // Use content as-is (already array or not last user message) - // Ensure null values are converted to empty string - if (content_val.is_null()) { - msg_json["content"] = ""; - } else { - msg_json["content"] = content_val; - } - } - } else if (is_last_user_msg && has_images_or_audio) { - // If no content but this is the last user message with images/audio, create content array - json content_array = json::array(); - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else if (msg.role() == "tool") { - // Tool role messages must have content field set, even if empty - // Jinja templates expect content to be a string, not null or object - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0); - if (msg.content().empty()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): empty content, set to empty string\n", i); - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): content exists: %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); - // Content exists, parse and ensure it's a string - json content_val; - try { - content_val = json::parse(msg.content()); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): parsed JSON, type=%s\n", - i, content_val.is_null() ? "null" : - content_val.is_object() ? "object" : - content_val.is_string() ? "string" : - content_val.is_array() ? "array" : "other"); - // Handle null values - Jinja templates expect content to be a string, not null - if (content_val.is_null()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): null content, converted to empty string\n", i); - } else if (content_val.is_object()) { - // If content is an object (e.g., from tool call failures/errors), convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): object content, converted to string: %s\n", - i, content_val.dump().substr(0, std::min(200, content_val.dump().size())).c_str()); - } else if (content_val.is_string()) { - msg_json["content"] = content_val.get(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): string content, using as-is\n", i); - } else { - // For arrays or other types, convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): %s content, converted to string\n", - i, content_val.is_array() ? "array" : "other type"); - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - msg_json["content"] = msg.content(); - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): not JSON, using as string\n", i); - } - } - } else { - // Ensure all messages have content set (fallback for any unhandled cases) - // Jinja templates expect content to be present, default to empty string if not set - if (!msg_json.contains("content")) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (role=%s): no content field, adding empty string\n", - i, msg.role().c_str()); - msg_json["content"] = ""; - } + llama_grpc::ReconstructedMessageInput rin; + rin.role = msg.role(); + rin.content = msg.content(); + rin.name = msg.name(); + rin.tool_call_id = msg.tool_call_id(); + rin.reasoning_content = msg.reasoning_content(); + rin.tool_calls = msg.tool_calls(); + rin.is_last_user_msg = (i == last_user_msg_idx); + if (rin.is_last_user_msg) { + for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j)); + for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j)); + for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j)); } - - // Add optional fields for OpenAI-compatible message format - if (!msg.name().empty()) { - msg_json["name"] = msg.name(); - } - if (!msg.tool_call_id().empty()) { - msg_json["tool_call_id"] = msg.tool_call_id(); - } - if (!msg.reasoning_content().empty()) { - msg_json["reasoning_content"] = msg.reasoning_content(); - } - if (!msg.tool_calls().empty()) { - // Parse tool_calls JSON string and add to message - try { - json tool_calls = json::parse(msg.tool_calls()); - msg_json["tool_calls"] = tool_calls; - SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str()); - // IMPORTANT: If message has tool_calls but content is empty or not set, - // set content to space " " instead of empty string "", because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get().empty())) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d has tool_calls but empty content, setting to space\n", i); - msg_json["content"] = " "; - } - // Log each tool call with name and arguments - if (tool_calls.is_array()) { - for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) { - const auto& tc = tool_calls[tc_idx]; - std::string tool_name = "unknown"; - std::string tool_args = "{}"; - if (tc.contains("function")) { - const auto& func = tc["function"]; - if (func.contains("name")) { - tool_name = func["name"].get(); - } - if (func.contains("arguments")) { - tool_args = func["arguments"].is_string() ? - func["arguments"].get() : - func["arguments"].dump(); - } - } else if (tc.contains("name")) { - tool_name = tc["name"].get(); - if (tc.contains("arguments")) { - tool_args = tc["arguments"].is_string() ? - tc["arguments"].get() : - tc["arguments"].dump(); - } - } - SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d, tool_call %zu: name=%s, arguments=%s\n", - i, tc_idx, tool_name.c_str(), tool_args.c_str()); - } - } - } catch (const json::parse_error& e) { - SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); - } - } - - // Debug: Log final content state before adding to array - if (msg_json.contains("content")) { - if (msg_json["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i); - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content type=%s, has_value=%d\n", - i, msg_json["content"].is_string() ? "string" : - msg_json["content"].is_array() ? "array" : - msg_json["content"].is_object() ? "object" : "other", - msg_json["content"].is_null() ? 0 : 1); - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i); - } - - messages_json.push_back(msg_json); + messages_json.push_back(llama_grpc::build_reconstructed_message(rin)); } // Final safety check: Ensure no message has null content (Jinja templates require strings) @@ -2072,36 +1851,7 @@ public: if (body_json.contains("messages") && body_json["messages"].is_array()) { SRV_INF("[CONTENT DEBUG] PredictStream: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size()); for (size_t idx = 0; idx < body_json["messages"].size(); idx++) { - auto& msg = body_json["messages"][idx]; - std::string role_str = msg.contains("role") ? msg["role"].get() : "unknown"; - if (msg.contains("content")) { - if (msg["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Fix null content - } else if (role_str == "tool" && msg["content"].is_array()) { - // Tool messages must have string content, not array - // oaicompat_chat_params_parse expects tool messages to have string content - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx); - msg["content"] = msg["content"].dump(); - } else if (!msg["content"].is_string() && !msg["content"].is_array()) { - // If content is object or other non-string type, convert to string for templates - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str()); - if (msg["content"].is_object()) { - msg["content"] = msg["content"].dump(); - } else { - msg["content"] = ""; - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n", - idx, role_str.c_str(), - msg["content"].is_string() ? "string" : - msg["content"].is_array() ? "array" : - msg["content"].is_object() ? "object" : "other"); - } - } else { - SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Add missing content - } + llama_grpc::normalize_template_message(body_json["messages"][idx]); } } @@ -2433,264 +2183,20 @@ public: SRV_INF("[CONTENT DEBUG] Predict: Processing %d messages\n", request->messages_size()); for (int i = 0; i < request->messages_size(); i++) { const auto& msg = request->messages(i); - json msg_json; - msg_json["role"] = msg.role(); - - SRV_INF("[CONTENT DEBUG] Predict: Message %d: role=%s, content_empty=%d, content_length=%zu\n", - i, msg.role().c_str(), msg.content().empty() ? 1 : 0, msg.content().size()); - if (!msg.content().empty()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content (first 200 chars): %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); + llama_grpc::ReconstructedMessageInput rin; + rin.role = msg.role(); + rin.content = msg.content(); + rin.name = msg.name(); + rin.tool_call_id = msg.tool_call_id(); + rin.reasoning_content = msg.reasoning_content(); + rin.tool_calls = msg.tool_calls(); + rin.is_last_user_msg = (i == last_user_msg_idx); + if (rin.is_last_user_msg) { + for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j)); + for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j)); + for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j)); } - - bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); - - // Handle content - can be string, null, or array - // For multimodal content, we'll embed images/audio from separate fields - if (!msg.content().empty()) { - // Try to parse content as JSON to see if it's already an array - json content_val; - try { - content_val = json::parse(msg.content()); - // Handle null values - convert to empty string to avoid template errors - if (content_val.is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d parsed JSON is null, converting to empty string\n", i); - content_val = ""; - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - content_val = msg.content(); - } - - // If content is an object (e.g., from tool call failures), convert to string - if (content_val.is_object()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content is object, converting to string\n", i); - content_val = content_val.dump(); - } - - // If content is a string and this is the last user message with images/audio, combine them - if (content_val.is_string() && is_last_user_msg && has_images_or_audio) { - json content_array = json::array(); - // Add text first - content_array.push_back({{"type", "text"}, {"text", content_val.get()}}); - // Add images - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - // Add audios - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - } else { - // Use content as-is (already array or not last user message) - // Ensure null values are converted to empty string - if (content_val.is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d content_val was null, setting to empty string\n", i); - msg_json["content"] = ""; - } else { - msg_json["content"] = content_val; - SRV_INF("[CONTENT DEBUG] Predict: Message %d content set, type=%s\n", - i, content_val.is_string() ? "string" : - content_val.is_array() ? "array" : - content_val.is_object() ? "object" : "other"); - } - } - } else if (is_last_user_msg && has_images_or_audio) { - // If no content but this is the last user message with images/audio, create content array - json content_array = json::array(); - if (request->images_size() > 0) { - for (int j = 0; j < request->images_size(); j++) { - json image_chunk; - image_chunk["type"] = "image_url"; - json image_url; - image_url["url"] = "data:image/jpeg;base64," + request->images(j); - image_chunk["image_url"] = image_url; - content_array.push_back(image_chunk); - } - } - if (request->audios_size() > 0) { - for (int j = 0; j < request->audios_size(); j++) { - json audio_chunk; - audio_chunk["type"] = "input_audio"; - json input_audio; - input_audio["data"] = request->audios(j); - input_audio["format"] = "wav"; // default, could be made configurable - audio_chunk["input_audio"] = input_audio; - content_array.push_back(audio_chunk); - } - } - if (request->videos_size() > 0) { - for (int j = 0; j < request->videos_size(); j++) { - json video_chunk; - video_chunk["type"] = "input_video"; - json input_video; - input_video["data"] = request->videos(j); - video_chunk["input_video"] = input_video; - content_array.push_back(video_chunk); - } - } - msg_json["content"] = content_array; - SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i); - } else if (!msg.tool_calls().empty()) { - // Tool call messages may have null content, but templates expect string - // IMPORTANT: Set to space " " instead of empty string "", because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls, setting content to space (not empty string)\n", i); - msg_json["content"] = " "; - } else if (msg.role() == "tool") { - // Tool role messages must have content field set, even if empty - // Jinja templates expect content to be a string, not null or object - SRV_INF("[CONTENT DEBUG] Predict: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0); - if (msg.content().empty()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): empty content, set to empty string\n", i); - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): content exists: %s\n", - i, msg.content().substr(0, std::min(200, msg.content().size())).c_str()); - // Content exists, parse and ensure it's a string - json content_val; - try { - content_val = json::parse(msg.content()); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): parsed JSON, type=%s\n", - i, content_val.is_null() ? "null" : - content_val.is_object() ? "object" : - content_val.is_string() ? "string" : - content_val.is_array() ? "array" : "other"); - // Handle null values - Jinja templates expect content to be a string, not null - if (content_val.is_null()) { - msg_json["content"] = ""; - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): null content, converted to empty string\n", i); - } else if (content_val.is_object()) { - // If content is an object (e.g., from tool call failures/errors), convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): object content, converted to string: %s\n", - i, content_val.dump().substr(0, std::min(200, content_val.dump().size())).c_str()); - } else if (content_val.is_string()) { - msg_json["content"] = content_val.get(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): string content, using as-is\n", i); - } else { - // For arrays or other types, convert to string - msg_json["content"] = content_val.dump(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): %s content, converted to string\n", - i, content_val.is_array() ? "array" : "other type"); - } - } catch (const json::parse_error&) { - // Not JSON, treat as plain string - msg_json["content"] = msg.content(); - SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): not JSON, using as string\n", i); - } - } - } else { - // Ensure all messages have content set (fallback for any unhandled cases) - // Jinja templates expect content to be present, default to empty string if not set - if (!msg_json.contains("content")) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d (role=%s): no content field, adding empty string\n", - i, msg.role().c_str()); - msg_json["content"] = ""; - } - } - - // Add optional fields for OpenAI-compatible message format - if (!msg.name().empty()) { - msg_json["name"] = msg.name(); - } - if (!msg.tool_call_id().empty()) { - msg_json["tool_call_id"] = msg.tool_call_id(); - } - if (!msg.reasoning_content().empty()) { - msg_json["reasoning_content"] = msg.reasoning_content(); - } - if (!msg.tool_calls().empty()) { - // Parse tool_calls JSON string and add to message - try { - json tool_calls = json::parse(msg.tool_calls()); - msg_json["tool_calls"] = tool_calls; - SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str()); - // IMPORTANT: If message has tool_calls but content is empty or not set, - // set content to space " " instead of empty string "", because llama.cpp's - // common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312), - // which causes template errors when accessing message.content[:tool_start_length] - if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get().empty())) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls but empty content, setting to space\n", i); - msg_json["content"] = " "; - } - // Log each tool call with name and arguments - if (tool_calls.is_array()) { - for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) { - const auto& tc = tool_calls[tc_idx]; - std::string tool_name = "unknown"; - std::string tool_args = "{}"; - if (tc.contains("function")) { - const auto& func = tc["function"]; - if (func.contains("name")) { - tool_name = func["name"].get(); - } - if (func.contains("arguments")) { - tool_args = func["arguments"].is_string() ? - func["arguments"].get() : - func["arguments"].dump(); - } - } else if (tc.contains("name")) { - tool_name = tc["name"].get(); - if (tc.contains("arguments")) { - tool_args = tc["arguments"].is_string() ? - tc["arguments"].get() : - tc["arguments"].dump(); - } - } - SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d, tool_call %zu: name=%s, arguments=%s\n", - i, tc_idx, tool_name.c_str(), tool_args.c_str()); - } - } - } catch (const json::parse_error& e) { - SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what()); - } - } - - // Debug: Log final content state before adding to array - if (msg_json.contains("content")) { - if (msg_json["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i); - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content type=%s, has_value=%d\n", - i, msg_json["content"].is_string() ? "string" : - msg_json["content"].is_array() ? "array" : - msg_json["content"].is_object() ? "object" : "other", - msg_json["content"].is_null() ? 0 : 1); - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i); - } - - messages_json.push_back(msg_json); + messages_json.push_back(llama_grpc::build_reconstructed_message(rin)); } // Final safety check: Ensure no message has null content (Jinja templates require strings) @@ -2911,36 +2417,7 @@ public: if (body_json.contains("messages") && body_json["messages"].is_array()) { SRV_INF("[CONTENT DEBUG] Predict: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size()); for (size_t idx = 0; idx < body_json["messages"].size(); idx++) { - auto& msg = body_json["messages"][idx]; - std::string role_str = msg.contains("role") ? msg["role"].get() : "unknown"; - if (msg.contains("content")) { - if (msg["content"].is_null()) { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Fix null content - } else if (role_str == "tool" && msg["content"].is_array()) { - // Tool messages must have string content, not array - // oaicompat_chat_params_parse expects tool messages to have string content - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx); - msg["content"] = msg["content"].dump(); - } else if (!msg["content"].is_string() && !msg["content"].is_array()) { - // If content is object or other non-string type, convert to string for templates - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str()); - if (msg["content"].is_object()) { - msg["content"] = msg["content"].dump(); - } else { - msg["content"] = ""; - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n", - idx, role_str.c_str(), - msg["content"].is_string() ? "string" : - msg["content"].is_array() ? "array" : - msg["content"].is_object() ? "object" : "other"); - } - } else { - SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str()); - msg["content"] = ""; // Add missing content - } + llama_grpc::normalize_template_message(body_json["messages"][idx]); } } diff --git a/backend/cpp/llama-cpp/message_content.h b/backend/cpp/llama-cpp/message_content.h new file mode 100644 index 000000000..4c7317ecd --- /dev/null +++ b/backend/cpp/llama-cpp/message_content.h @@ -0,0 +1,192 @@ +#pragma once + +#include +#include + +#include + +namespace llama_grpc { + +// Normalizes a proto message's content string into the JSON value used when +// reconstructing OpenAI-format messages for the tokenizer (jinja) template. +// +// Shared by the streaming (PredictStream) and non-streaming (Predict) message +// reconstruction paths so the two cannot drift. +// +// LocalAI's Go layer (schema.Messages.ToProto) always sends content as a plain +// text string; multimodal media travels in separate proto fields, never inside +// content. So user/system/developer content is *only ever* opaque text and must +// NOT be JSON-sniffed: a prompt that merely looks like JSON (e.g. an ingredient +// list ["1/4 cup sugar", ...]) would otherwise be reinterpreted as structured +// content parts and rejected by oaicompat_chat_params_parse with +// "unsupported content[].type" (https://github.com/mudler/LocalAI/issues/10524). +// (developer is OpenAI's modern system alias - same "human-authored text" nature.) +// +// For assistant/tool messages we still collapse a literal JSON null/object +// (tool-call bookkeeping) to a string, but we never turn a plain string into an +// array/scalar. The array defense is therefore role-independent (arrays/scalars +// fall through for every role); the role gate only governs the null/object case. +inline nlohmann::ordered_json normalize_message_content(const std::string& role, + const std::string& content) { + nlohmann::ordered_json content_val = content; + if (role != "user" && role != "system" && role != "developer") { + try { + nlohmann::ordered_json parsed = nlohmann::ordered_json::parse(content); + if (parsed.is_null()) { + content_val = ""; + } else if (parsed.is_object()) { + content_val = parsed.dump(); + } + // arrays / scalars: keep the original plain-text string as-is + } catch (const nlohmann::ordered_json::parse_error&) { + // Not JSON, already the plain string + } + } + return content_val; +} + +// Final safety pass applied to each reconstructed OpenAI message right before it +// is handed to oaicompat_chat_params_parse (jinja templating). Jinja templates +// assume content is a string: a literal null breaks slicing such as +// message.content[:N] (#7324), and a tool message with array content is rejected +// (#7528). A multimodal user message legitimately carries a typed-part array +// ({type:text}, {type:image_url}, ...), which must be left intact. Shared by the +// streaming and non-streaming paths so this invariant cannot drift between them. +inline void normalize_template_message(nlohmann::ordered_json& msg) { + if (!msg.contains("content")) { + msg["content"] = ""; // templates expect the field to exist + return; + } + nlohmann::ordered_json& content = msg["content"]; + const std::string role = (msg.contains("role") && msg["role"].is_string()) + ? msg["role"].get() + : std::string(); + if (content.is_null()) { + content = ""; // #7324: null would crash content[:N] slicing + } else if (role == "tool" && content.is_array()) { + content = content.dump(); // #7528: tool messages must have string content + } else if (!content.is_string() && !content.is_array()) { + if (content.is_object()) { + content = content.dump(); // tool-call bookkeeping object -> string + } else { + content = ""; // other scalar (number/bool) -> empty + } + } + // string, or a non-tool (multimodal) typed-part array: leave untouched +} + +// One proto message's data, flattened to plain types so the reconstruction logic +// can be shared and unit-tested without protobuf. The streaming and non-streaming +// predict paths both populate this from proto::Message + the request's media. +struct ReconstructedMessageInput { + std::string role; + std::string content; // proto.Message.content (always a plain string) + std::string name; + std::string tool_call_id; + std::string reasoning_content; + std::string tool_calls; // tool_calls as a JSON string, or empty + bool is_last_user_msg = false; // attach request media to this message + std::vector images; // base64 (jpeg) + std::vector audios; // base64 (wav) + std::vector videos; // base64 +}; + +// Appends the request's media as OpenAI typed content parts. Imperative (not +// brace-init) to avoid nlohmann's object-vs-array initializer-list ambiguity. +inline void append_media_parts(nlohmann::ordered_json& content_array, + const std::vector& images, + const std::vector& audios, + const std::vector& videos) { + for (const auto& img : images) { + nlohmann::ordered_json image_chunk; + image_chunk["type"] = "image_url"; + nlohmann::ordered_json image_url; + image_url["url"] = "data:image/jpeg;base64," + img; + image_chunk["image_url"] = image_url; + content_array.push_back(image_chunk); + } + for (const auto& aud : audios) { + nlohmann::ordered_json audio_chunk; + audio_chunk["type"] = "input_audio"; + nlohmann::ordered_json input_audio; + input_audio["data"] = aud; + input_audio["format"] = "wav"; // default; could be made configurable + audio_chunk["input_audio"] = input_audio; + content_array.push_back(audio_chunk); + } + for (const auto& vid : videos) { + nlohmann::ordered_json video_chunk; + video_chunk["type"] = "input_video"; + nlohmann::ordered_json input_video; + input_video["data"] = vid; + video_chunk["input_video"] = input_video; + content_array.push_back(video_chunk); + } +} + +// Reconstructs a single OpenAI-format message (the object fed to +// oaicompat_chat_params_parse) from a proto message. Shared by PredictStream and +// Predict so the content/multimodal/tool_calls handling cannot drift between the +// two stream modes (it previously lived as two ~150-line copies with a redundant +// Predict-only tool_calls->" " branch). Guarantees content is always a string or +// a typed-part array, never null/missing. +inline nlohmann::ordered_json build_reconstructed_message(const ReconstructedMessageInput& in) { + nlohmann::ordered_json msg_json; + msg_json["role"] = in.role; + const bool has_media = !in.images.empty() || !in.audios.empty() || !in.videos.empty(); + + if (!in.content.empty()) { + nlohmann::ordered_json content_val = normalize_message_content(in.role, in.content); + if (content_val.is_string() && in.is_last_user_msg && has_media) { + // Last user message + media: build a typed-part array (text first). + nlohmann::ordered_json content_array = nlohmann::ordered_json::array(); + nlohmann::ordered_json text_part; + text_part["type"] = "text"; + text_part["text"] = content_val.get(); + content_array.push_back(text_part); + append_media_parts(content_array, in.images, in.audios, in.videos); + msg_json["content"] = content_array; + } else if (content_val.is_null()) { + msg_json["content"] = ""; + } else { + msg_json["content"] = content_val; + } + } else if (in.is_last_user_msg && has_media) { + // No text but media on the last user message: media-only typed array. + nlohmann::ordered_json content_array = nlohmann::ordered_json::array(); + append_media_parts(content_array, in.images, in.audios, in.videos); + msg_json["content"] = content_array; + } else { + // Empty content (any role, incl. tool/assistant): templates need a string. + msg_json["content"] = ""; + } + + if (!in.name.empty()) { + msg_json["name"] = in.name; + } + if (!in.tool_call_id.empty()) { + msg_json["tool_call_id"] = in.tool_call_id; + } + if (!in.reasoning_content.empty()) { + msg_json["reasoning_content"] = in.reasoning_content; + } + if (!in.tool_calls.empty()) { + try { + nlohmann::ordered_json tool_calls = nlohmann::ordered_json::parse(in.tool_calls); + msg_json["tool_calls"] = tool_calls; + // tool_calls + empty/blank content: use " " not "", because llama.cpp's + // common_chat_msgs_to_json_oaicompat turns "" into null, which breaks + // templates that slice message.content[:tool_start_length] (#7324). + if (!msg_json.contains("content") || + (msg_json["content"].is_string() && msg_json["content"].get().empty())) { + msg_json["content"] = " "; + } + } catch (const nlohmann::ordered_json::parse_error&) { + // Malformed tool_calls JSON: leave content as-is (prior behavior). + } + } + + return msg_json; +} + +} // namespace llama_grpc diff --git a/backend/cpp/llama-cpp/message_content_test.cpp b/backend/cpp/llama-cpp/message_content_test.cpp new file mode 100644 index 000000000..7e9a5383a --- /dev/null +++ b/backend/cpp/llama-cpp/message_content_test.cpp @@ -0,0 +1,234 @@ +// Unit tests for the shared message-reconstruction helpers (message_content.h). +// +// Build & run standalone (nlohmann/json single header on the include path): +// g++ -std=c++17 -I message_content_test.cpp -o t && ./t +// or via CMake: -DLLAMA_GRPC_BUILD_TESTS=ON then ctest. +// +// Regression coverage for: +// #10524 - a user/system prompt that is itself a JSON-array string must stay +// plain text, never be reinterpreted as OpenAI structured parts. +// #7324 - assistant/tool null content -> "" (templates slice content[:N]); +// assistant+tool_calls+empty content -> " " (not "", which becomes null). +// #7528 - tool message array content must reach the template as a string. +// multimodal - last user message text + media -> typed-part array, media kept. + +#include +#include +#include + +#include "message_content.h" + +using nlohmann::ordered_json; +using llama_grpc::normalize_message_content; +using llama_grpc::normalize_template_message; +using llama_grpc::build_reconstructed_message; +using llama_grpc::ReconstructedMessageInput; + +static int failures = 0; + +static void check(bool ok, const std::string& name, const std::string& detail = "") { + if (!ok) { + std::cerr << "FAIL " << name << (detail.empty() ? "" : ": " + detail) << "\n"; + failures++; + } +} + +// ---- normalize_message_content ------------------------------------------- + +static void expect_norm_string(const char* name, const std::string& role, + const std::string& content, const std::string& want) { + auto got = normalize_message_content(role, content); + if (!got.is_string()) { + check(false, name, "expected a JSON string, got " + + std::string(got.is_array() ? "array" : got.is_object() ? "object" : "other") + + " (" + got.dump() + ")"); + return; + } + check(got.get() == want, name, "expected \"" + want + "\", got \"" + got.get() + "\""); +} + +static void test_normalize() { + const std::string ingredients = R"(["1/4 cup brown sugar, packed","1 pound ground beef"])"; + + // #10524 - JSON-array text must stay a string. Role-INDEPENDENT array defense. + for (const char* role : {"user", "system", "developer", "function", "assistant", "tool"}) { + expect_norm_string((std::string("json_array_stays_text:") + role).c_str(), role, ingredients, ingredients); + } + + // #10524 - user/system/developer JSON-object text stays verbatim (NOT re-dumped). + expect_norm_string("user_json_object_verbatim", "user", R"({"a":1})", R"({"a":1})"); + expect_norm_string("system_json_object_verbatim", "system", R"({"a":1})", R"({"a":1})"); + expect_norm_string("developer_json_object_verbatim", "developer", R"({"a":1})", R"({"a":1})"); + + // Plain text unchanged for all roles. + expect_norm_string("user_plain_text", "user", "hello world", "hello world"); + expect_norm_string("assistant_non_json_text_kept", "assistant", "hi [unclosed", "hi [unclosed"); + + // #7324 boundary - user/system/developer literal "null" preserved (never parsed). + expect_norm_string("user_literal_null_stays", "user", "null", "null"); + expect_norm_string("system_literal_null_stays", "system", "null", "null"); + expect_norm_string("developer_literal_null_stays", "developer", "null", "null"); + + // #7324 - assistant/tool literal null collapses to empty string. + expect_norm_string("assistant_null_to_empty", "assistant", "null", ""); + expect_norm_string("tool_null_to_empty", "tool", "null", ""); + + // #7324/#7528 - assistant/tool object bookkeeping stringified (stays a string). + check(normalize_message_content("assistant", R"({"tool":"x"})").is_string(), "assistant_object_stringified"); + check(normalize_message_content("tool", R"({"error":"boom"})").is_string(), "tool_object_stringified"); + + // #10524-family - a bare scalar that parses as a JSON number stays the string. + expect_norm_string("assistant_scalar_number_stays_string", "assistant", "42", "42"); + + // baseline - empty content stays empty. + expect_norm_string("user_empty_stays_empty", "user", "", ""); +} + +// ---- normalize_template_message (BEFORE TEMPLATE sanitizer) --------------- + +static void test_template_sanitizer() { + // #7528 - a tool message with an ACTUAL array becomes a string. + { + ordered_json msg = {{"role", "tool"}, {"content", ordered_json::array({{{"type", "text"}, {"text", "r"}}})}}; + normalize_template_message(msg); + check(msg["content"].is_string(), "before_template_tool_array_to_string", "got " + msg["content"].dump()); + } + // #7324 - null content -> "" for any role. + { + ordered_json msg = {{"role", "assistant"}, {"content", nullptr}}; + normalize_template_message(msg); + check(msg["content"].is_string() && msg["content"] == "", "before_template_null_to_empty"); + } + // object content -> dumped string (would otherwise throw at the template). + { + ordered_json msg = {{"role", "assistant"}, {"content", {{"x", 1}}}}; + normalize_template_message(msg); + check(msg["content"].is_string(), "before_template_object_to_string", "got " + msg["content"].dump()); + } + // missing content field -> "". + { + ordered_json msg = {{"role", "user"}}; + normalize_template_message(msg); + check(msg.contains("content") && msg["content"] == "", "before_template_missing_to_empty"); + } + // multimodal: a well-typed user array must be left UNTOUCHED (role!=tool). + { + ordered_json parts = ordered_json::array(); + parts.push_back({{"type", "text"}, {"text", "x"}}); + ordered_json img; img["type"] = "image_url"; img["image_url"] = {{"url", "data:..."}}; + parts.push_back(img); + ordered_json msg = {{"role", "user"}, {"content", parts}}; + normalize_template_message(msg); + check(msg["content"].is_array() && msg["content"].size() == 2, "before_template_user_typed_array_preserved", + "got " + msg["content"].dump()); + } + // a plain string is left untouched. + { + ordered_json msg = {{"role", "user"}, {"content", "hello"}}; + normalize_template_message(msg); + check(msg["content"] == "hello", "before_template_string_untouched"); + } +} + +// ---- build_reconstructed_message ---------------------------------------- + +static void test_reconstruction() { + const std::string ingredients = R"(["1/4 cup brown sugar","1 pound ground beef"])"; + + // #10524 end-state - user JSON-array text, no media -> string content. + { + ReconstructedMessageInput in; + in.role = "user"; in.content = ingredients; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == ingredients, "recon_user_json_array_string", + "got " + m["content"].dump()); + } + // multimodal - user text + one image on last user msg -> typed array, image kept. + { + ReconstructedMessageInput in; + in.role = "user"; in.content = ingredients; in.is_last_user_msg = true; + in.images.push_back("BASE64IMG"); + auto m = build_reconstructed_message(in); + check(m["content"].is_array() && m["content"].size() == 2, "recon_multimodal_text_plus_image", + "got " + m["content"].dump()); + check(m["content"][0]["type"] == "text" && m["content"][0]["text"] == ingredients, "recon_multimodal_text_first"); + check(m["content"][1]["type"] == "image_url", "recon_multimodal_image_kept"); + } + // multimodal media-only - empty text + image on last user msg. + { + ReconstructedMessageInput in; + in.role = "user"; in.content = ""; in.is_last_user_msg = true; + in.images.push_back("BASE64IMG"); + auto m = build_reconstructed_message(in); + check(m["content"].is_array() && m["content"].size() == 1 && m["content"][0]["type"] == "image_url", + "recon_media_only", "got " + m["content"].dump()); + } + // #7528 - tool array-string content stays a string. + { + ReconstructedMessageInput in; + in.role = "tool"; in.content = R"(["a","b"])"; in.tool_call_id = "call_1"; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == R"(["a","b"])", "recon_tool_array_string", + "got " + m["content"].dump()); + check(m["tool_call_id"] == "call_1", "recon_tool_call_id_set"); + } + // tool empty content -> "". + { + ReconstructedMessageInput in; + in.role = "tool"; in.content = ""; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == "", "recon_tool_empty_to_string"); + } + // #7324 - assistant + tool_calls + empty content -> " " (single space, not ""). + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = ""; + in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])"; + auto m = build_reconstructed_message(in); + check(m["content"].is_string() && m["content"] == " ", "recon_toolcalls_empty_content_space", + "got " + m["content"].dump()); + check(m["tool_calls"].is_array() && m["tool_calls"].size() == 1, "recon_toolcalls_parsed"); + } + // assistant + tool_calls + real content keeps the content. + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "I'll call f"; + in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])"; + auto m = build_reconstructed_message(in); + check(m["content"] == "I'll call f", "recon_toolcalls_with_content_kept"); + } + // assistant null content -> "". + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "null"; + auto m = build_reconstructed_message(in); + check(m["content"] == "", "recon_assistant_null_to_empty"); + } + // malformed tool_calls JSON must not throw; content preserved. + { + ReconstructedMessageInput in; + in.role = "assistant"; in.content = "hi"; in.tool_calls = "{not json"; + auto m = build_reconstructed_message(in); + check(m["content"] == "hi" && !m.contains("tool_calls"), "recon_malformed_toolcalls_safe"); + } + // optional fields: name + reasoning carried through. + { + ReconstructedMessageInput in; + in.role = "tool"; in.content = "result"; in.name = "get_weather"; in.reasoning_content = "thinking"; + auto m = build_reconstructed_message(in); + check(m["name"] == "get_weather" && m["reasoning_content"] == "thinking", "recon_optional_fields"); + } +} + +int main() { + test_normalize(); + test_template_sanitizer(); + test_reconstruction(); + + if (failures == 0) { + std::cout << "OK: all message_content tests passed\n"; + return 0; + } + std::cerr << failures << " test(s) failed\n"; + return 1; +} diff --git a/backend/cpp/llama-cpp/prepare.sh b/backend/cpp/llama-cpp/prepare.sh index f9b7e3dd2..4da45ea9d 100644 --- a/backend/cpp/llama-cpp/prepare.sh +++ b/backend/cpp/llama-cpp/prepare.sh @@ -18,6 +18,10 @@ done cp -r CMakeLists.txt llama.cpp/tools/grpc-server/ cp -r grpc-server.cpp llama.cpp/tools/grpc-server/ +# Shared message-reconstruction helpers (included by grpc-server.cpp) and their +# unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON). +cp -r message_content.h llama.cpp/tools/grpc-server/ +cp -r message_content_test.cpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/ cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/ diff --git a/backend/cpp/run-unit-tests.sh b/backend/cpp/run-unit-tests.sh new file mode 100755 index 000000000..3f63faa40 --- /dev/null +++ b/backend/cpp/run-unit-tests.sh @@ -0,0 +1,71 @@ +#!/bin/bash +# +# Discovers and runs every standalone C++ unit test under backend/cpp/. +# +# A "standalone" unit test is a *_test.cpp that depends only on the C++ standard +# library and nlohmann/json (single header) - i.e. it exercises pure helpers and +# does not need the full llama.cpp + gRPC backend build. Tests that DO need the +# backend build use the CMake/ctest path (e.g. -DLLAMA_GRPC_BUILD_TESTS=ON) +# instead and are skipped here. +# +# This keeps CI generic: adding a new pure-C++ unit test file named *_test.cpp in +# an active backend source dir is picked up automatically, with no CI edits. +# +# Env: +# NLOHMANN_INCLUDE include dir that contains nlohmann/json.hpp. If unset, the +# nlohmann/json single header is fetched to a temp dir. +# CXX compiler (default: g++). +# JSON_VERSION nlohmann/json tag to fetch when NLOHMANN_INCLUDE is unset +# (default: v3.11.3). +set -uo pipefail + +ROOT="$(cd "$(dirname "$0")" && pwd)" +CXX="${CXX:-g++}" +JSON_VERSION="${JSON_VERSION:-v3.11.3}" + +JSON_INC="${NLOHMANN_INCLUDE:-}" +if [ -z "$JSON_INC" ]; then + JSON_INC="$(mktemp -d)" + mkdir -p "$JSON_INC/nlohmann" + echo "Fetching nlohmann/json ${JSON_VERSION} single header..." + if ! curl -L -sf \ + "https://raw.githubusercontent.com/nlohmann/json/${JSON_VERSION}/single_include/nlohmann/json.hpp" \ + -o "$JSON_INC/nlohmann/json.hpp"; then + echo "ERROR: failed to fetch nlohmann/json header" >&2 + exit 1 + fi +fi + +# Active source dirs only - exclude per-variant build copies, dev snapshots and +# the vendored upstream llama.cpp tree. +mapfile -t tests < <(find "$ROOT" -name '*_test.cpp' \ + -not -path '*/llama.cpp/*' \ + -not -path '*-build/*' \ + -not -path '*-dev/*' \ + -not -path '*fallback*' | sort) + +if [ "${#tests[@]}" -eq 0 ]; then + echo "No standalone C++ unit tests found under $ROOT" + exit 0 +fi + +fail=0 +for test_src in "${tests[@]}"; do + name="$(basename "$test_src" .cpp)" + bin="$(mktemp -d)/$name" + echo "==> $test_src" + if ! "$CXX" -std=c++17 -Wall -Wextra \ + -I"$JSON_INC" -I"$(dirname "$test_src")" \ + "$test_src" -o "$bin"; then + echo "COMPILE FAILED: $test_src" >&2 + fail=1 + continue + fi + if ! "$bin"; then + echo "TEST FAILED: $test_src" >&2 + fail=1 + fi +done + +echo "Ran ${#tests[@]} standalone C++ unit test file(s)" +exit "$fail" diff --git a/core/schema/message_test.go b/core/schema/message_test.go index da11d9d20..eb471b57b 100644 --- a/core/schema/message_test.go +++ b/core/schema/message_test.go @@ -68,6 +68,32 @@ var _ = Describe("LLM tests", func() { Expect(protoMessages[0].Content).To(Equal("Hello World")) }) + // Regression for mudler/LocalAI#10524: a text part whose inner text is + // itself a JSON-array string (mealie sends an ingredient list) must + // flatten to that exact string verbatim. ToProto must NOT escape or + // restructure it - the C++ backend then treats it as opaque text. This + // pins the precise Go-side input that produced the "unsupported + // content[].type" gRPC error before the backend stopped re-parsing it. + It("flattens a JSON-array-looking text part to the verbatim string (#10524)", func() { + ingredients := `["1/4 cup brown sugar, packed","1 pound ground beef"]` + messages := Messages{ + { + Role: "user", + Content: []any{ + map[string]any{ + "type": "text", + "text": ingredients, + }, + }, + }, + } + + protoMessages := messages.ToProto() + + Expect(protoMessages).To(HaveLen(1)) + Expect(protoMessages[0].Content).To(Equal(ingredients)) + }) + It("should convert message with tool_calls", func() { messages := Messages{ {