mirror of
https://github.com/mudler/LocalAI.git
synced 2026-06-27 01:47:18 -04:00
The llama-cpp gRPC backend reconstructs OpenAI messages from proto for the tokenizer-template path and blindly json::parse'd each message's content string. LocalAI's Go layer always flattens content to a plain string, so a user prompt that merely looks like JSON (e.g. mealie's ingredient array ["1/4 cup brown sugar", ...]) was reinterpreted as structured content parts and rejected by oaicompat_chat_params_parse with "unsupported content[].type". Normalize content per role instead: user/system/developer content is opaque text and is never JSON-sniffed; assistant/tool content still collapses a literal JSON null/object (tool-call bookkeeping) to a string, but a plain string is never turned into an array/scalar. The array defense is role-independent, so the role gate only governs the benign null/object case. While here, extract the duplicated per-message reconstruction and the pre-template content sanitization into shared, unit-tested helpers (message_content.h) so the streaming (PredictStream) and non-streaming (Predict) paths cannot drift. This removes ~490 lines of copy-pasted defensive code, the dead tool-role parse branches, and the redundant Predict-only tool_calls branch, while preserving the prior #7324 (null content -> "") and #7528 (tool array content -> string) fixes. Tests: - backend/cpp/llama-cpp/message_content_test.cpp: standalone C++ unit tests for all three helpers (#10524, #7324, #7528, multimodal), discovered and run by `make test-backend-cpp` and a new generic tests-backend-cpp CI job. Also wired as an opt-in CMake/ctest target (-DLLAMA_GRPC_BUILD_TESTS=ON). - core/schema/message_test.go: Go regression pinning that ToProto flattens a JSON-array-looking text part to the verbatim string. - prepare.sh now copies message_content.h into the build tree. Assisted-by: Claude:claude-opus-4-8 [Claude Code] Signed-off-by: Ettore Di Giacinto <mudler@localai.io> Co-authored-by: Ettore Di Giacinto <mudler@localai.io>
This commit is contained in:
16
.github/workflows/test.yml
vendored
16
.github/workflows/test.yml
vendored
@@ -121,3 +121,19 @@ jobs:
|
||||
detached: true
|
||||
connect-timeout-seconds: 180
|
||||
limit-access-to-actor: true
|
||||
|
||||
# Fast standalone unit tests for the backends' pure C++ helpers - currently the
|
||||
# llama-cpp message reconstruction (backend/cpp/llama-cpp/message_content.h),
|
||||
# which guards the OpenAI chat content normalization (mudler/LocalAI#10524,
|
||||
# #7324, #7528). The runner discovers every *_test.cpp under backend/cpp/, so
|
||||
# new pure-C++ unit tests are picked up with no CI changes. These need only the
|
||||
# C++ stdlib + nlohmann/json, so they run on every PR without the full
|
||||
# llama.cpp + gRPC backend build. (The same suite is also wired as an opt-in
|
||||
# CMake/ctest target, -DLLAMA_GRPC_BUILD_TESTS=ON, for in-backend-build runs.)
|
||||
tests-backend-cpp:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Clone
|
||||
uses: actions/checkout@v7
|
||||
- name: Run backend C++ unit tests
|
||||
run: make test-backend-cpp
|
||||
|
||||
9
Makefile
9
Makefile
@@ -103,7 +103,7 @@ COVERAGE_E2E_LABELS?=!real-models
|
||||
COVERAGE_EXCLUDE_RE?=grpc/proto/.*[.]pb[.]go
|
||||
|
||||
|
||||
.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all
|
||||
.PHONY: all test test-coverage test-coverage-baseline test-coverage-check test-backend-cpp test-ui test-ui-coverage-baseline test-ui-coverage-check install-hooks build vendor lint lint-all
|
||||
|
||||
all: help
|
||||
|
||||
@@ -201,6 +201,13 @@ test: prepare-test
|
||||
OPUS_SHIM_LIBRARY=$(abspath ./pkg/opus/shim/libopusshim.so) \
|
||||
$(GOCMD) run github.com/onsi/ginkgo/v2/ginkgo --flake-attempts $(TEST_FLAKES) --fail-fast -v -r $(TEST_PATHS)
|
||||
|
||||
## Compiles and runs the standalone C++ unit tests for the backends (pure
|
||||
## helpers that depend only on the stdlib + nlohmann/json, no full backend
|
||||
## build). Discovers every *_test.cpp under backend/cpp/ - see
|
||||
## backend/cpp/run-unit-tests.sh. Set NLOHMANN_INCLUDE to skip the header fetch.
|
||||
test-backend-cpp:
|
||||
bash backend/cpp/run-unit-tests.sh
|
||||
|
||||
## Runs the core suite ($(TEST_PATHS)) with statement-coverage instrumentation
|
||||
## and writes a merged profile to $(COVERAGE_PROFILE). Deliberately omits
|
||||
## --fail-fast so a single failure doesn't truncate the coverage number, and
|
||||
|
||||
@@ -87,3 +87,18 @@ target_compile_features(${TARGET} PRIVATE cxx_std_11)
|
||||
if(TARGET BUILD_INFO)
|
||||
add_dependencies(${TARGET} BUILD_INFO)
|
||||
endif()
|
||||
|
||||
# Unit test for the message-content normalization helper (message_content.h).
|
||||
# Off by default so the normal backend build is untouched; enable with
|
||||
# -DLLAMA_GRPC_BUILD_TESTS=ON and run via ctest. It reuses llama.cpp's vendored
|
||||
# <nlohmann/json.hpp> (propagated by the common helpers library) so it has no
|
||||
# extra dependency beyond what the backend already builds against.
|
||||
option(LLAMA_GRPC_BUILD_TESTS "Build grpc-server unit tests" OFF)
|
||||
if(LLAMA_GRPC_BUILD_TESTS)
|
||||
enable_testing()
|
||||
add_executable(message_content_test message_content_test.cpp message_content.h)
|
||||
target_include_directories(message_content_test PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})
|
||||
target_link_libraries(message_content_test PRIVATE ${_LLAMA_COMMON_TARGET})
|
||||
target_compile_features(message_content_test PRIVATE cxx_std_17)
|
||||
add_test(NAME message_content_test COMMAND message_content_test)
|
||||
endif()
|
||||
|
||||
@@ -39,6 +39,7 @@
|
||||
#include "common.h"
|
||||
#include "arg.h"
|
||||
#include "chat-auto-parser.h"
|
||||
#include "message_content.h"
|
||||
#include <getopt.h>
|
||||
#include <grpcpp/ext/proto_server_reflection_plugin.h>
|
||||
#include <grpcpp/grpcpp.h>
|
||||
@@ -1616,242 +1617,20 @@ public:
|
||||
|
||||
for (int i = 0; i < request->messages_size(); i++) {
|
||||
const auto& msg = request->messages(i);
|
||||
json msg_json;
|
||||
msg_json["role"] = msg.role();
|
||||
|
||||
bool is_last_user_msg = (i == last_user_msg_idx);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
|
||||
|
||||
// Handle content - can be string, null, or array
|
||||
// For multimodal content, we'll embed images/audio from separate fields
|
||||
if (!msg.content().empty()) {
|
||||
// Try to parse content as JSON to see if it's already an array
|
||||
json content_val;
|
||||
try {
|
||||
content_val = json::parse(msg.content());
|
||||
// Handle null values - convert to empty string to avoid template errors
|
||||
if (content_val.is_null()) {
|
||||
content_val = "";
|
||||
}
|
||||
} catch (const json::parse_error&) {
|
||||
// Not JSON, treat as plain string
|
||||
content_val = msg.content();
|
||||
}
|
||||
|
||||
// If content is an object (e.g., from tool call failures), convert to string
|
||||
if (content_val.is_object()) {
|
||||
content_val = content_val.dump();
|
||||
}
|
||||
|
||||
// If content is a string and this is the last user message with images/audio, combine them
|
||||
if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
|
||||
json content_array = json::array();
|
||||
// Add text first
|
||||
content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
|
||||
// Add images
|
||||
if (request->images_size() > 0) {
|
||||
for (int j = 0; j < request->images_size(); j++) {
|
||||
json image_chunk;
|
||||
image_chunk["type"] = "image_url";
|
||||
json image_url;
|
||||
image_url["url"] = "data:image/jpeg;base64," + request->images(j);
|
||||
image_chunk["image_url"] = image_url;
|
||||
content_array.push_back(image_chunk);
|
||||
}
|
||||
}
|
||||
// Add audios
|
||||
if (request->audios_size() > 0) {
|
||||
for (int j = 0; j < request->audios_size(); j++) {
|
||||
json audio_chunk;
|
||||
audio_chunk["type"] = "input_audio";
|
||||
json input_audio;
|
||||
input_audio["data"] = request->audios(j);
|
||||
input_audio["format"] = "wav"; // default, could be made configurable
|
||||
audio_chunk["input_audio"] = input_audio;
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else {
|
||||
// Use content as-is (already array or not last user message)
|
||||
// Ensure null values are converted to empty string
|
||||
if (content_val.is_null()) {
|
||||
msg_json["content"] = "";
|
||||
} else {
|
||||
msg_json["content"] = content_val;
|
||||
}
|
||||
}
|
||||
} else if (is_last_user_msg && has_images_or_audio) {
|
||||
// If no content but this is the last user message with images/audio, create content array
|
||||
json content_array = json::array();
|
||||
if (request->images_size() > 0) {
|
||||
for (int j = 0; j < request->images_size(); j++) {
|
||||
json image_chunk;
|
||||
image_chunk["type"] = "image_url";
|
||||
json image_url;
|
||||
image_url["url"] = "data:image/jpeg;base64," + request->images(j);
|
||||
image_chunk["image_url"] = image_url;
|
||||
content_array.push_back(image_chunk);
|
||||
}
|
||||
}
|
||||
if (request->audios_size() > 0) {
|
||||
for (int j = 0; j < request->audios_size(); j++) {
|
||||
json audio_chunk;
|
||||
audio_chunk["type"] = "input_audio";
|
||||
json input_audio;
|
||||
input_audio["data"] = request->audios(j);
|
||||
input_audio["format"] = "wav"; // default, could be made configurable
|
||||
audio_chunk["input_audio"] = input_audio;
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else if (msg.role() == "tool") {
|
||||
// Tool role messages must have content field set, even if empty
|
||||
// Jinja templates expect content to be a string, not null or object
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0);
|
||||
if (msg.content().empty()) {
|
||||
msg_json["content"] = "";
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): empty content, set to empty string\n", i);
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): content exists: %s\n",
|
||||
i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
|
||||
// Content exists, parse and ensure it's a string
|
||||
json content_val;
|
||||
try {
|
||||
content_val = json::parse(msg.content());
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): parsed JSON, type=%s\n",
|
||||
i, content_val.is_null() ? "null" :
|
||||
content_val.is_object() ? "object" :
|
||||
content_val.is_string() ? "string" :
|
||||
content_val.is_array() ? "array" : "other");
|
||||
// Handle null values - Jinja templates expect content to be a string, not null
|
||||
if (content_val.is_null()) {
|
||||
msg_json["content"] = "";
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): null content, converted to empty string\n", i);
|
||||
} else if (content_val.is_object()) {
|
||||
// If content is an object (e.g., from tool call failures/errors), convert to string
|
||||
msg_json["content"] = content_val.dump();
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): object content, converted to string: %s\n",
|
||||
i, content_val.dump().substr(0, std::min<size_t>(200, content_val.dump().size())).c_str());
|
||||
} else if (content_val.is_string()) {
|
||||
msg_json["content"] = content_val.get<std::string>();
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): string content, using as-is\n", i);
|
||||
} else {
|
||||
// For arrays or other types, convert to string
|
||||
msg_json["content"] = content_val.dump();
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): %s content, converted to string\n",
|
||||
i, content_val.is_array() ? "array" : "other type");
|
||||
}
|
||||
} catch (const json::parse_error&) {
|
||||
// Not JSON, treat as plain string
|
||||
msg_json["content"] = msg.content();
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (tool): not JSON, using as string\n", i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Ensure all messages have content set (fallback for any unhandled cases)
|
||||
// Jinja templates expect content to be present, default to empty string if not set
|
||||
if (!msg_json.contains("content")) {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d (role=%s): no content field, adding empty string\n",
|
||||
i, msg.role().c_str());
|
||||
msg_json["content"] = "";
|
||||
}
|
||||
llama_grpc::ReconstructedMessageInput rin;
|
||||
rin.role = msg.role();
|
||||
rin.content = msg.content();
|
||||
rin.name = msg.name();
|
||||
rin.tool_call_id = msg.tool_call_id();
|
||||
rin.reasoning_content = msg.reasoning_content();
|
||||
rin.tool_calls = msg.tool_calls();
|
||||
rin.is_last_user_msg = (i == last_user_msg_idx);
|
||||
if (rin.is_last_user_msg) {
|
||||
for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j));
|
||||
for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j));
|
||||
for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j));
|
||||
}
|
||||
|
||||
// Add optional fields for OpenAI-compatible message format
|
||||
if (!msg.name().empty()) {
|
||||
msg_json["name"] = msg.name();
|
||||
}
|
||||
if (!msg.tool_call_id().empty()) {
|
||||
msg_json["tool_call_id"] = msg.tool_call_id();
|
||||
}
|
||||
if (!msg.reasoning_content().empty()) {
|
||||
msg_json["reasoning_content"] = msg.reasoning_content();
|
||||
}
|
||||
if (!msg.tool_calls().empty()) {
|
||||
// Parse tool_calls JSON string and add to message
|
||||
try {
|
||||
json tool_calls = json::parse(msg.tool_calls());
|
||||
msg_json["tool_calls"] = tool_calls;
|
||||
SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str());
|
||||
// IMPORTANT: If message has tool_calls but content is empty or not set,
|
||||
// set content to space " " instead of empty string "", because llama.cpp's
|
||||
// common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
|
||||
// which causes template errors when accessing message.content[:tool_start_length]
|
||||
if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d has tool_calls but empty content, setting to space\n", i);
|
||||
msg_json["content"] = " ";
|
||||
}
|
||||
// Log each tool call with name and arguments
|
||||
if (tool_calls.is_array()) {
|
||||
for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) {
|
||||
const auto& tc = tool_calls[tc_idx];
|
||||
std::string tool_name = "unknown";
|
||||
std::string tool_args = "{}";
|
||||
if (tc.contains("function")) {
|
||||
const auto& func = tc["function"];
|
||||
if (func.contains("name")) {
|
||||
tool_name = func["name"].get<std::string>();
|
||||
}
|
||||
if (func.contains("arguments")) {
|
||||
tool_args = func["arguments"].is_string() ?
|
||||
func["arguments"].get<std::string>() :
|
||||
func["arguments"].dump();
|
||||
}
|
||||
} else if (tc.contains("name")) {
|
||||
tool_name = tc["name"].get<std::string>();
|
||||
if (tc.contains("arguments")) {
|
||||
tool_args = tc["arguments"].is_string() ?
|
||||
tc["arguments"].get<std::string>() :
|
||||
tc["arguments"].dump();
|
||||
}
|
||||
}
|
||||
SRV_INF("[TOOL CALLS DEBUG] PredictStream: Message %d, tool_call %zu: name=%s, arguments=%s\n",
|
||||
i, tc_idx, tool_name.c_str(), tool_args.c_str());
|
||||
}
|
||||
}
|
||||
} catch (const json::parse_error& e) {
|
||||
SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
// Debug: Log final content state before adding to array
|
||||
if (msg_json.contains("content")) {
|
||||
if (msg_json["content"].is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i);
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: content type=%s, has_value=%d\n",
|
||||
i, msg_json["content"].is_string() ? "string" :
|
||||
msg_json["content"].is_array() ? "array" :
|
||||
msg_json["content"].is_object() ? "object" : "other",
|
||||
msg_json["content"].is_null() ? 0 : 1);
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i);
|
||||
}
|
||||
|
||||
messages_json.push_back(msg_json);
|
||||
messages_json.push_back(llama_grpc::build_reconstructed_message(rin));
|
||||
}
|
||||
|
||||
// Final safety check: Ensure no message has null content (Jinja templates require strings)
|
||||
@@ -2072,36 +1851,7 @@ public:
|
||||
if (body_json.contains("messages") && body_json["messages"].is_array()) {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size());
|
||||
for (size_t idx = 0; idx < body_json["messages"].size(); idx++) {
|
||||
auto& msg = body_json["messages"][idx];
|
||||
std::string role_str = msg.contains("role") ? msg["role"].get<std::string>() : "unknown";
|
||||
if (msg.contains("content")) {
|
||||
if (msg["content"].is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str());
|
||||
msg["content"] = ""; // Fix null content
|
||||
} else if (role_str == "tool" && msg["content"].is_array()) {
|
||||
// Tool messages must have string content, not array
|
||||
// oaicompat_chat_params_parse expects tool messages to have string content
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx);
|
||||
msg["content"] = msg["content"].dump();
|
||||
} else if (!msg["content"].is_string() && !msg["content"].is_array()) {
|
||||
// If content is object or other non-string type, convert to string for templates
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str());
|
||||
if (msg["content"].is_object()) {
|
||||
msg["content"] = msg["content"].dump();
|
||||
} else {
|
||||
msg["content"] = "";
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n",
|
||||
idx, role_str.c_str(),
|
||||
msg["content"].is_string() ? "string" :
|
||||
msg["content"].is_array() ? "array" :
|
||||
msg["content"].is_object() ? "object" : "other");
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] PredictStream: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str());
|
||||
msg["content"] = ""; // Add missing content
|
||||
}
|
||||
llama_grpc::normalize_template_message(body_json["messages"][idx]);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -2433,264 +2183,20 @@ public:
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Processing %d messages\n", request->messages_size());
|
||||
for (int i = 0; i < request->messages_size(); i++) {
|
||||
const auto& msg = request->messages(i);
|
||||
json msg_json;
|
||||
msg_json["role"] = msg.role();
|
||||
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d: role=%s, content_empty=%d, content_length=%zu\n",
|
||||
i, msg.role().c_str(), msg.content().empty() ? 1 : 0, msg.content().size());
|
||||
if (!msg.content().empty()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d content (first 200 chars): %s\n",
|
||||
i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
|
||||
llama_grpc::ReconstructedMessageInput rin;
|
||||
rin.role = msg.role();
|
||||
rin.content = msg.content();
|
||||
rin.name = msg.name();
|
||||
rin.tool_call_id = msg.tool_call_id();
|
||||
rin.reasoning_content = msg.reasoning_content();
|
||||
rin.tool_calls = msg.tool_calls();
|
||||
rin.is_last_user_msg = (i == last_user_msg_idx);
|
||||
if (rin.is_last_user_msg) {
|
||||
for (int j = 0; j < request->images_size(); j++) rin.images.push_back(request->images(j));
|
||||
for (int j = 0; j < request->audios_size(); j++) rin.audios.push_back(request->audios(j));
|
||||
for (int j = 0; j < request->videos_size(); j++) rin.videos.push_back(request->videos(j));
|
||||
}
|
||||
|
||||
bool is_last_user_msg = (i == last_user_msg_idx);
|
||||
bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0);
|
||||
|
||||
// Handle content - can be string, null, or array
|
||||
// For multimodal content, we'll embed images/audio from separate fields
|
||||
if (!msg.content().empty()) {
|
||||
// Try to parse content as JSON to see if it's already an array
|
||||
json content_val;
|
||||
try {
|
||||
content_val = json::parse(msg.content());
|
||||
// Handle null values - convert to empty string to avoid template errors
|
||||
if (content_val.is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d parsed JSON is null, converting to empty string\n", i);
|
||||
content_val = "";
|
||||
}
|
||||
} catch (const json::parse_error&) {
|
||||
// Not JSON, treat as plain string
|
||||
content_val = msg.content();
|
||||
}
|
||||
|
||||
// If content is an object (e.g., from tool call failures), convert to string
|
||||
if (content_val.is_object()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d content is object, converting to string\n", i);
|
||||
content_val = content_val.dump();
|
||||
}
|
||||
|
||||
// If content is a string and this is the last user message with images/audio, combine them
|
||||
if (content_val.is_string() && is_last_user_msg && has_images_or_audio) {
|
||||
json content_array = json::array();
|
||||
// Add text first
|
||||
content_array.push_back({{"type", "text"}, {"text", content_val.get<std::string>()}});
|
||||
// Add images
|
||||
if (request->images_size() > 0) {
|
||||
for (int j = 0; j < request->images_size(); j++) {
|
||||
json image_chunk;
|
||||
image_chunk["type"] = "image_url";
|
||||
json image_url;
|
||||
image_url["url"] = "data:image/jpeg;base64," + request->images(j);
|
||||
image_chunk["image_url"] = image_url;
|
||||
content_array.push_back(image_chunk);
|
||||
}
|
||||
}
|
||||
// Add audios
|
||||
if (request->audios_size() > 0) {
|
||||
for (int j = 0; j < request->audios_size(); j++) {
|
||||
json audio_chunk;
|
||||
audio_chunk["type"] = "input_audio";
|
||||
json input_audio;
|
||||
input_audio["data"] = request->audios(j);
|
||||
input_audio["format"] = "wav"; // default, could be made configurable
|
||||
audio_chunk["input_audio"] = input_audio;
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
} else {
|
||||
// Use content as-is (already array or not last user message)
|
||||
// Ensure null values are converted to empty string
|
||||
if (content_val.is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d content_val was null, setting to empty string\n", i);
|
||||
msg_json["content"] = "";
|
||||
} else {
|
||||
msg_json["content"] = content_val;
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d content set, type=%s\n",
|
||||
i, content_val.is_string() ? "string" :
|
||||
content_val.is_array() ? "array" :
|
||||
content_val.is_object() ? "object" : "other");
|
||||
}
|
||||
}
|
||||
} else if (is_last_user_msg && has_images_or_audio) {
|
||||
// If no content but this is the last user message with images/audio, create content array
|
||||
json content_array = json::array();
|
||||
if (request->images_size() > 0) {
|
||||
for (int j = 0; j < request->images_size(); j++) {
|
||||
json image_chunk;
|
||||
image_chunk["type"] = "image_url";
|
||||
json image_url;
|
||||
image_url["url"] = "data:image/jpeg;base64," + request->images(j);
|
||||
image_chunk["image_url"] = image_url;
|
||||
content_array.push_back(image_chunk);
|
||||
}
|
||||
}
|
||||
if (request->audios_size() > 0) {
|
||||
for (int j = 0; j < request->audios_size(); j++) {
|
||||
json audio_chunk;
|
||||
audio_chunk["type"] = "input_audio";
|
||||
json input_audio;
|
||||
input_audio["data"] = request->audios(j);
|
||||
input_audio["format"] = "wav"; // default, could be made configurable
|
||||
audio_chunk["input_audio"] = input_audio;
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
}
|
||||
if (request->videos_size() > 0) {
|
||||
for (int j = 0; j < request->videos_size(); j++) {
|
||||
json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
json input_video;
|
||||
input_video["data"] = request->videos(j);
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
msg_json["content"] = content_array;
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i);
|
||||
} else if (!msg.tool_calls().empty()) {
|
||||
// Tool call messages may have null content, but templates expect string
|
||||
// IMPORTANT: Set to space " " instead of empty string "", because llama.cpp's
|
||||
// common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
|
||||
// which causes template errors when accessing message.content[:tool_start_length]
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls, setting content to space (not empty string)\n", i);
|
||||
msg_json["content"] = " ";
|
||||
} else if (msg.role() == "tool") {
|
||||
// Tool role messages must have content field set, even if empty
|
||||
// Jinja templates expect content to be a string, not null or object
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d is tool role, content_empty=%d\n", i, msg.content().empty() ? 1 : 0);
|
||||
if (msg.content().empty()) {
|
||||
msg_json["content"] = "";
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): empty content, set to empty string\n", i);
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): content exists: %s\n",
|
||||
i, msg.content().substr(0, std::min<size_t>(200, msg.content().size())).c_str());
|
||||
// Content exists, parse and ensure it's a string
|
||||
json content_val;
|
||||
try {
|
||||
content_val = json::parse(msg.content());
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): parsed JSON, type=%s\n",
|
||||
i, content_val.is_null() ? "null" :
|
||||
content_val.is_object() ? "object" :
|
||||
content_val.is_string() ? "string" :
|
||||
content_val.is_array() ? "array" : "other");
|
||||
// Handle null values - Jinja templates expect content to be a string, not null
|
||||
if (content_val.is_null()) {
|
||||
msg_json["content"] = "";
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): null content, converted to empty string\n", i);
|
||||
} else if (content_val.is_object()) {
|
||||
// If content is an object (e.g., from tool call failures/errors), convert to string
|
||||
msg_json["content"] = content_val.dump();
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): object content, converted to string: %s\n",
|
||||
i, content_val.dump().substr(0, std::min<size_t>(200, content_val.dump().size())).c_str());
|
||||
} else if (content_val.is_string()) {
|
||||
msg_json["content"] = content_val.get<std::string>();
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): string content, using as-is\n", i);
|
||||
} else {
|
||||
// For arrays or other types, convert to string
|
||||
msg_json["content"] = content_val.dump();
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): %s content, converted to string\n",
|
||||
i, content_val.is_array() ? "array" : "other type");
|
||||
}
|
||||
} catch (const json::parse_error&) {
|
||||
// Not JSON, treat as plain string
|
||||
msg_json["content"] = msg.content();
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (tool): not JSON, using as string\n", i);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Ensure all messages have content set (fallback for any unhandled cases)
|
||||
// Jinja templates expect content to be present, default to empty string if not set
|
||||
if (!msg_json.contains("content")) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d (role=%s): no content field, adding empty string\n",
|
||||
i, msg.role().c_str());
|
||||
msg_json["content"] = "";
|
||||
}
|
||||
}
|
||||
|
||||
// Add optional fields for OpenAI-compatible message format
|
||||
if (!msg.name().empty()) {
|
||||
msg_json["name"] = msg.name();
|
||||
}
|
||||
if (!msg.tool_call_id().empty()) {
|
||||
msg_json["tool_call_id"] = msg.tool_call_id();
|
||||
}
|
||||
if (!msg.reasoning_content().empty()) {
|
||||
msg_json["reasoning_content"] = msg.reasoning_content();
|
||||
}
|
||||
if (!msg.tool_calls().empty()) {
|
||||
// Parse tool_calls JSON string and add to message
|
||||
try {
|
||||
json tool_calls = json::parse(msg.tool_calls());
|
||||
msg_json["tool_calls"] = tool_calls;
|
||||
SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d has tool_calls: %s\n", i, tool_calls.dump().c_str());
|
||||
// IMPORTANT: If message has tool_calls but content is empty or not set,
|
||||
// set content to space " " instead of empty string "", because llama.cpp's
|
||||
// common_chat_msgs_to_json_oaicompat converts empty strings to null (line 312),
|
||||
// which causes template errors when accessing message.content[:tool_start_length]
|
||||
if (!msg_json.contains("content") || (msg_json.contains("content") && msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d has tool_calls but empty content, setting to space\n", i);
|
||||
msg_json["content"] = " ";
|
||||
}
|
||||
// Log each tool call with name and arguments
|
||||
if (tool_calls.is_array()) {
|
||||
for (size_t tc_idx = 0; tc_idx < tool_calls.size(); tc_idx++) {
|
||||
const auto& tc = tool_calls[tc_idx];
|
||||
std::string tool_name = "unknown";
|
||||
std::string tool_args = "{}";
|
||||
if (tc.contains("function")) {
|
||||
const auto& func = tc["function"];
|
||||
if (func.contains("name")) {
|
||||
tool_name = func["name"].get<std::string>();
|
||||
}
|
||||
if (func.contains("arguments")) {
|
||||
tool_args = func["arguments"].is_string() ?
|
||||
func["arguments"].get<std::string>() :
|
||||
func["arguments"].dump();
|
||||
}
|
||||
} else if (tc.contains("name")) {
|
||||
tool_name = tc["name"].get<std::string>();
|
||||
if (tc.contains("arguments")) {
|
||||
tool_args = tc["arguments"].is_string() ?
|
||||
tc["arguments"].get<std::string>() :
|
||||
tc["arguments"].dump();
|
||||
}
|
||||
}
|
||||
SRV_INF("[TOOL CALLS DEBUG] Predict: Message %d, tool_call %zu: name=%s, arguments=%s\n",
|
||||
i, tc_idx, tool_name.c_str(), tool_args.c_str());
|
||||
}
|
||||
}
|
||||
} catch (const json::parse_error& e) {
|
||||
SRV_WRN("Failed to parse tool_calls JSON: %s\n", e.what());
|
||||
}
|
||||
}
|
||||
|
||||
// Debug: Log final content state before adding to array
|
||||
if (msg_json.contains("content")) {
|
||||
if (msg_json["content"].is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content is NULL - THIS WILL CAUSE ERROR!\n", i);
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: content type=%s, has_value=%d\n",
|
||||
i, msg_json["content"].is_string() ? "string" :
|
||||
msg_json["content"].is_array() ? "array" :
|
||||
msg_json["content"].is_object() ? "object" : "other",
|
||||
msg_json["content"].is_null() ? 0 : 1);
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Message %d FINAL STATE: NO CONTENT FIELD - THIS WILL CAUSE ERROR!\n", i);
|
||||
}
|
||||
|
||||
messages_json.push_back(msg_json);
|
||||
messages_json.push_back(llama_grpc::build_reconstructed_message(rin));
|
||||
}
|
||||
|
||||
// Final safety check: Ensure no message has null content (Jinja templates require strings)
|
||||
@@ -2911,36 +2417,7 @@ public:
|
||||
if (body_json.contains("messages") && body_json["messages"].is_array()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: Before oaicompat_chat_params_parse - checking %zu messages\n", body_json["messages"].size());
|
||||
for (size_t idx = 0; idx < body_json["messages"].size(); idx++) {
|
||||
auto& msg = body_json["messages"][idx];
|
||||
std::string role_str = msg.contains("role") ? msg["role"].get<std::string>() : "unknown";
|
||||
if (msg.contains("content")) {
|
||||
if (msg["content"].is_null()) {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) has NULL content - FIXING!\n", idx, role_str.c_str());
|
||||
msg["content"] = ""; // Fix null content
|
||||
} else if (role_str == "tool" && msg["content"].is_array()) {
|
||||
// Tool messages must have string content, not array
|
||||
// oaicompat_chat_params_parse expects tool messages to have string content
|
||||
SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=tool) has array content, converting to string\n", idx);
|
||||
msg["content"] = msg["content"].dump();
|
||||
} else if (!msg["content"].is_string() && !msg["content"].is_array()) {
|
||||
// If content is object or other non-string type, convert to string for templates
|
||||
SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) content is not string/array, converting\n", idx, role_str.c_str());
|
||||
if (msg["content"].is_object()) {
|
||||
msg["content"] = msg["content"].dump();
|
||||
} else {
|
||||
msg["content"] = "";
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s): content type=%s\n",
|
||||
idx, role_str.c_str(),
|
||||
msg["content"].is_string() ? "string" :
|
||||
msg["content"].is_array() ? "array" :
|
||||
msg["content"].is_object() ? "object" : "other");
|
||||
}
|
||||
} else {
|
||||
SRV_INF("[CONTENT DEBUG] Predict: BEFORE TEMPLATE - Message %zu (role=%s) MISSING content field - ADDING!\n", idx, role_str.c_str());
|
||||
msg["content"] = ""; // Add missing content
|
||||
}
|
||||
llama_grpc::normalize_template_message(body_json["messages"][idx]);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
192
backend/cpp/llama-cpp/message_content.h
Normal file
192
backend/cpp/llama-cpp/message_content.h
Normal file
@@ -0,0 +1,192 @@
|
||||
#pragma once
|
||||
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
namespace llama_grpc {
|
||||
|
||||
// Normalizes a proto message's content string into the JSON value used when
|
||||
// reconstructing OpenAI-format messages for the tokenizer (jinja) template.
|
||||
//
|
||||
// Shared by the streaming (PredictStream) and non-streaming (Predict) message
|
||||
// reconstruction paths so the two cannot drift.
|
||||
//
|
||||
// LocalAI's Go layer (schema.Messages.ToProto) always sends content as a plain
|
||||
// text string; multimodal media travels in separate proto fields, never inside
|
||||
// content. So user/system/developer content is *only ever* opaque text and must
|
||||
// NOT be JSON-sniffed: a prompt that merely looks like JSON (e.g. an ingredient
|
||||
// list ["1/4 cup sugar", ...]) would otherwise be reinterpreted as structured
|
||||
// content parts and rejected by oaicompat_chat_params_parse with
|
||||
// "unsupported content[].type" (https://github.com/mudler/LocalAI/issues/10524).
|
||||
// (developer is OpenAI's modern system alias - same "human-authored text" nature.)
|
||||
//
|
||||
// For assistant/tool messages we still collapse a literal JSON null/object
|
||||
// (tool-call bookkeeping) to a string, but we never turn a plain string into an
|
||||
// array/scalar. The array defense is therefore role-independent (arrays/scalars
|
||||
// fall through for every role); the role gate only governs the null/object case.
|
||||
inline nlohmann::ordered_json normalize_message_content(const std::string& role,
|
||||
const std::string& content) {
|
||||
nlohmann::ordered_json content_val = content;
|
||||
if (role != "user" && role != "system" && role != "developer") {
|
||||
try {
|
||||
nlohmann::ordered_json parsed = nlohmann::ordered_json::parse(content);
|
||||
if (parsed.is_null()) {
|
||||
content_val = "";
|
||||
} else if (parsed.is_object()) {
|
||||
content_val = parsed.dump();
|
||||
}
|
||||
// arrays / scalars: keep the original plain-text string as-is
|
||||
} catch (const nlohmann::ordered_json::parse_error&) {
|
||||
// Not JSON, already the plain string
|
||||
}
|
||||
}
|
||||
return content_val;
|
||||
}
|
||||
|
||||
// Final safety pass applied to each reconstructed OpenAI message right before it
|
||||
// is handed to oaicompat_chat_params_parse (jinja templating). Jinja templates
|
||||
// assume content is a string: a literal null breaks slicing such as
|
||||
// message.content[:N] (#7324), and a tool message with array content is rejected
|
||||
// (#7528). A multimodal user message legitimately carries a typed-part array
|
||||
// ({type:text}, {type:image_url}, ...), which must be left intact. Shared by the
|
||||
// streaming and non-streaming paths so this invariant cannot drift between them.
|
||||
inline void normalize_template_message(nlohmann::ordered_json& msg) {
|
||||
if (!msg.contains("content")) {
|
||||
msg["content"] = ""; // templates expect the field to exist
|
||||
return;
|
||||
}
|
||||
nlohmann::ordered_json& content = msg["content"];
|
||||
const std::string role = (msg.contains("role") && msg["role"].is_string())
|
||||
? msg["role"].get<std::string>()
|
||||
: std::string();
|
||||
if (content.is_null()) {
|
||||
content = ""; // #7324: null would crash content[:N] slicing
|
||||
} else if (role == "tool" && content.is_array()) {
|
||||
content = content.dump(); // #7528: tool messages must have string content
|
||||
} else if (!content.is_string() && !content.is_array()) {
|
||||
if (content.is_object()) {
|
||||
content = content.dump(); // tool-call bookkeeping object -> string
|
||||
} else {
|
||||
content = ""; // other scalar (number/bool) -> empty
|
||||
}
|
||||
}
|
||||
// string, or a non-tool (multimodal) typed-part array: leave untouched
|
||||
}
|
||||
|
||||
// One proto message's data, flattened to plain types so the reconstruction logic
|
||||
// can be shared and unit-tested without protobuf. The streaming and non-streaming
|
||||
// predict paths both populate this from proto::Message + the request's media.
|
||||
struct ReconstructedMessageInput {
|
||||
std::string role;
|
||||
std::string content; // proto.Message.content (always a plain string)
|
||||
std::string name;
|
||||
std::string tool_call_id;
|
||||
std::string reasoning_content;
|
||||
std::string tool_calls; // tool_calls as a JSON string, or empty
|
||||
bool is_last_user_msg = false; // attach request media to this message
|
||||
std::vector<std::string> images; // base64 (jpeg)
|
||||
std::vector<std::string> audios; // base64 (wav)
|
||||
std::vector<std::string> videos; // base64
|
||||
};
|
||||
|
||||
// Appends the request's media as OpenAI typed content parts. Imperative (not
|
||||
// brace-init) to avoid nlohmann's object-vs-array initializer-list ambiguity.
|
||||
inline void append_media_parts(nlohmann::ordered_json& content_array,
|
||||
const std::vector<std::string>& images,
|
||||
const std::vector<std::string>& audios,
|
||||
const std::vector<std::string>& videos) {
|
||||
for (const auto& img : images) {
|
||||
nlohmann::ordered_json image_chunk;
|
||||
image_chunk["type"] = "image_url";
|
||||
nlohmann::ordered_json image_url;
|
||||
image_url["url"] = "data:image/jpeg;base64," + img;
|
||||
image_chunk["image_url"] = image_url;
|
||||
content_array.push_back(image_chunk);
|
||||
}
|
||||
for (const auto& aud : audios) {
|
||||
nlohmann::ordered_json audio_chunk;
|
||||
audio_chunk["type"] = "input_audio";
|
||||
nlohmann::ordered_json input_audio;
|
||||
input_audio["data"] = aud;
|
||||
input_audio["format"] = "wav"; // default; could be made configurable
|
||||
audio_chunk["input_audio"] = input_audio;
|
||||
content_array.push_back(audio_chunk);
|
||||
}
|
||||
for (const auto& vid : videos) {
|
||||
nlohmann::ordered_json video_chunk;
|
||||
video_chunk["type"] = "input_video";
|
||||
nlohmann::ordered_json input_video;
|
||||
input_video["data"] = vid;
|
||||
video_chunk["input_video"] = input_video;
|
||||
content_array.push_back(video_chunk);
|
||||
}
|
||||
}
|
||||
|
||||
// Reconstructs a single OpenAI-format message (the object fed to
|
||||
// oaicompat_chat_params_parse) from a proto message. Shared by PredictStream and
|
||||
// Predict so the content/multimodal/tool_calls handling cannot drift between the
|
||||
// two stream modes (it previously lived as two ~150-line copies with a redundant
|
||||
// Predict-only tool_calls->" " branch). Guarantees content is always a string or
|
||||
// a typed-part array, never null/missing.
|
||||
inline nlohmann::ordered_json build_reconstructed_message(const ReconstructedMessageInput& in) {
|
||||
nlohmann::ordered_json msg_json;
|
||||
msg_json["role"] = in.role;
|
||||
const bool has_media = !in.images.empty() || !in.audios.empty() || !in.videos.empty();
|
||||
|
||||
if (!in.content.empty()) {
|
||||
nlohmann::ordered_json content_val = normalize_message_content(in.role, in.content);
|
||||
if (content_val.is_string() && in.is_last_user_msg && has_media) {
|
||||
// Last user message + media: build a typed-part array (text first).
|
||||
nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
|
||||
nlohmann::ordered_json text_part;
|
||||
text_part["type"] = "text";
|
||||
text_part["text"] = content_val.get<std::string>();
|
||||
content_array.push_back(text_part);
|
||||
append_media_parts(content_array, in.images, in.audios, in.videos);
|
||||
msg_json["content"] = content_array;
|
||||
} else if (content_val.is_null()) {
|
||||
msg_json["content"] = "";
|
||||
} else {
|
||||
msg_json["content"] = content_val;
|
||||
}
|
||||
} else if (in.is_last_user_msg && has_media) {
|
||||
// No text but media on the last user message: media-only typed array.
|
||||
nlohmann::ordered_json content_array = nlohmann::ordered_json::array();
|
||||
append_media_parts(content_array, in.images, in.audios, in.videos);
|
||||
msg_json["content"] = content_array;
|
||||
} else {
|
||||
// Empty content (any role, incl. tool/assistant): templates need a string.
|
||||
msg_json["content"] = "";
|
||||
}
|
||||
|
||||
if (!in.name.empty()) {
|
||||
msg_json["name"] = in.name;
|
||||
}
|
||||
if (!in.tool_call_id.empty()) {
|
||||
msg_json["tool_call_id"] = in.tool_call_id;
|
||||
}
|
||||
if (!in.reasoning_content.empty()) {
|
||||
msg_json["reasoning_content"] = in.reasoning_content;
|
||||
}
|
||||
if (!in.tool_calls.empty()) {
|
||||
try {
|
||||
nlohmann::ordered_json tool_calls = nlohmann::ordered_json::parse(in.tool_calls);
|
||||
msg_json["tool_calls"] = tool_calls;
|
||||
// tool_calls + empty/blank content: use " " not "", because llama.cpp's
|
||||
// common_chat_msgs_to_json_oaicompat turns "" into null, which breaks
|
||||
// templates that slice message.content[:tool_start_length] (#7324).
|
||||
if (!msg_json.contains("content") ||
|
||||
(msg_json["content"].is_string() && msg_json["content"].get<std::string>().empty())) {
|
||||
msg_json["content"] = " ";
|
||||
}
|
||||
} catch (const nlohmann::ordered_json::parse_error&) {
|
||||
// Malformed tool_calls JSON: leave content as-is (prior behavior).
|
||||
}
|
||||
}
|
||||
|
||||
return msg_json;
|
||||
}
|
||||
|
||||
} // namespace llama_grpc
|
||||
234
backend/cpp/llama-cpp/message_content_test.cpp
Normal file
234
backend/cpp/llama-cpp/message_content_test.cpp
Normal file
@@ -0,0 +1,234 @@
|
||||
// Unit tests for the shared message-reconstruction helpers (message_content.h).
|
||||
//
|
||||
// Build & run standalone (nlohmann/json single header on the include path):
|
||||
// g++ -std=c++17 -I<dir-with-nlohmann> message_content_test.cpp -o t && ./t
|
||||
// or via CMake: -DLLAMA_GRPC_BUILD_TESTS=ON then ctest.
|
||||
//
|
||||
// Regression coverage for:
|
||||
// #10524 - a user/system prompt that is itself a JSON-array string must stay
|
||||
// plain text, never be reinterpreted as OpenAI structured parts.
|
||||
// #7324 - assistant/tool null content -> "" (templates slice content[:N]);
|
||||
// assistant+tool_calls+empty content -> " " (not "", which becomes null).
|
||||
// #7528 - tool message array content must reach the template as a string.
|
||||
// multimodal - last user message text + media -> typed-part array, media kept.
|
||||
|
||||
#include <cassert>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
|
||||
#include "message_content.h"
|
||||
|
||||
using nlohmann::ordered_json;
|
||||
using llama_grpc::normalize_message_content;
|
||||
using llama_grpc::normalize_template_message;
|
||||
using llama_grpc::build_reconstructed_message;
|
||||
using llama_grpc::ReconstructedMessageInput;
|
||||
|
||||
static int failures = 0;
|
||||
|
||||
static void check(bool ok, const std::string& name, const std::string& detail = "") {
|
||||
if (!ok) {
|
||||
std::cerr << "FAIL " << name << (detail.empty() ? "" : ": " + detail) << "\n";
|
||||
failures++;
|
||||
}
|
||||
}
|
||||
|
||||
// ---- normalize_message_content -------------------------------------------
|
||||
|
||||
static void expect_norm_string(const char* name, const std::string& role,
|
||||
const std::string& content, const std::string& want) {
|
||||
auto got = normalize_message_content(role, content);
|
||||
if (!got.is_string()) {
|
||||
check(false, name, "expected a JSON string, got " +
|
||||
std::string(got.is_array() ? "array" : got.is_object() ? "object" : "other") +
|
||||
" (" + got.dump() + ")");
|
||||
return;
|
||||
}
|
||||
check(got.get<std::string>() == want, name, "expected \"" + want + "\", got \"" + got.get<std::string>() + "\"");
|
||||
}
|
||||
|
||||
static void test_normalize() {
|
||||
const std::string ingredients = R"(["1/4 cup brown sugar, packed","1 pound ground beef"])";
|
||||
|
||||
// #10524 - JSON-array text must stay a string. Role-INDEPENDENT array defense.
|
||||
for (const char* role : {"user", "system", "developer", "function", "assistant", "tool"}) {
|
||||
expect_norm_string((std::string("json_array_stays_text:") + role).c_str(), role, ingredients, ingredients);
|
||||
}
|
||||
|
||||
// #10524 - user/system/developer JSON-object text stays verbatim (NOT re-dumped).
|
||||
expect_norm_string("user_json_object_verbatim", "user", R"({"a":1})", R"({"a":1})");
|
||||
expect_norm_string("system_json_object_verbatim", "system", R"({"a":1})", R"({"a":1})");
|
||||
expect_norm_string("developer_json_object_verbatim", "developer", R"({"a":1})", R"({"a":1})");
|
||||
|
||||
// Plain text unchanged for all roles.
|
||||
expect_norm_string("user_plain_text", "user", "hello world", "hello world");
|
||||
expect_norm_string("assistant_non_json_text_kept", "assistant", "hi [unclosed", "hi [unclosed");
|
||||
|
||||
// #7324 boundary - user/system/developer literal "null" preserved (never parsed).
|
||||
expect_norm_string("user_literal_null_stays", "user", "null", "null");
|
||||
expect_norm_string("system_literal_null_stays", "system", "null", "null");
|
||||
expect_norm_string("developer_literal_null_stays", "developer", "null", "null");
|
||||
|
||||
// #7324 - assistant/tool literal null collapses to empty string.
|
||||
expect_norm_string("assistant_null_to_empty", "assistant", "null", "");
|
||||
expect_norm_string("tool_null_to_empty", "tool", "null", "");
|
||||
|
||||
// #7324/#7528 - assistant/tool object bookkeeping stringified (stays a string).
|
||||
check(normalize_message_content("assistant", R"({"tool":"x"})").is_string(), "assistant_object_stringified");
|
||||
check(normalize_message_content("tool", R"({"error":"boom"})").is_string(), "tool_object_stringified");
|
||||
|
||||
// #10524-family - a bare scalar that parses as a JSON number stays the string.
|
||||
expect_norm_string("assistant_scalar_number_stays_string", "assistant", "42", "42");
|
||||
|
||||
// baseline - empty content stays empty.
|
||||
expect_norm_string("user_empty_stays_empty", "user", "", "");
|
||||
}
|
||||
|
||||
// ---- normalize_template_message (BEFORE TEMPLATE sanitizer) ---------------
|
||||
|
||||
static void test_template_sanitizer() {
|
||||
// #7528 - a tool message with an ACTUAL array becomes a string.
|
||||
{
|
||||
ordered_json msg = {{"role", "tool"}, {"content", ordered_json::array({{{"type", "text"}, {"text", "r"}}})}};
|
||||
normalize_template_message(msg);
|
||||
check(msg["content"].is_string(), "before_template_tool_array_to_string", "got " + msg["content"].dump());
|
||||
}
|
||||
// #7324 - null content -> "" for any role.
|
||||
{
|
||||
ordered_json msg = {{"role", "assistant"}, {"content", nullptr}};
|
||||
normalize_template_message(msg);
|
||||
check(msg["content"].is_string() && msg["content"] == "", "before_template_null_to_empty");
|
||||
}
|
||||
// object content -> dumped string (would otherwise throw at the template).
|
||||
{
|
||||
ordered_json msg = {{"role", "assistant"}, {"content", {{"x", 1}}}};
|
||||
normalize_template_message(msg);
|
||||
check(msg["content"].is_string(), "before_template_object_to_string", "got " + msg["content"].dump());
|
||||
}
|
||||
// missing content field -> "".
|
||||
{
|
||||
ordered_json msg = {{"role", "user"}};
|
||||
normalize_template_message(msg);
|
||||
check(msg.contains("content") && msg["content"] == "", "before_template_missing_to_empty");
|
||||
}
|
||||
// multimodal: a well-typed user array must be left UNTOUCHED (role!=tool).
|
||||
{
|
||||
ordered_json parts = ordered_json::array();
|
||||
parts.push_back({{"type", "text"}, {"text", "x"}});
|
||||
ordered_json img; img["type"] = "image_url"; img["image_url"] = {{"url", "data:..."}};
|
||||
parts.push_back(img);
|
||||
ordered_json msg = {{"role", "user"}, {"content", parts}};
|
||||
normalize_template_message(msg);
|
||||
check(msg["content"].is_array() && msg["content"].size() == 2, "before_template_user_typed_array_preserved",
|
||||
"got " + msg["content"].dump());
|
||||
}
|
||||
// a plain string is left untouched.
|
||||
{
|
||||
ordered_json msg = {{"role", "user"}, {"content", "hello"}};
|
||||
normalize_template_message(msg);
|
||||
check(msg["content"] == "hello", "before_template_string_untouched");
|
||||
}
|
||||
}
|
||||
|
||||
// ---- build_reconstructed_message ----------------------------------------
|
||||
|
||||
static void test_reconstruction() {
|
||||
const std::string ingredients = R"(["1/4 cup brown sugar","1 pound ground beef"])";
|
||||
|
||||
// #10524 end-state - user JSON-array text, no media -> string content.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "user"; in.content = ingredients;
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_string() && m["content"] == ingredients, "recon_user_json_array_string",
|
||||
"got " + m["content"].dump());
|
||||
}
|
||||
// multimodal - user text + one image on last user msg -> typed array, image kept.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "user"; in.content = ingredients; in.is_last_user_msg = true;
|
||||
in.images.push_back("BASE64IMG");
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_array() && m["content"].size() == 2, "recon_multimodal_text_plus_image",
|
||||
"got " + m["content"].dump());
|
||||
check(m["content"][0]["type"] == "text" && m["content"][0]["text"] == ingredients, "recon_multimodal_text_first");
|
||||
check(m["content"][1]["type"] == "image_url", "recon_multimodal_image_kept");
|
||||
}
|
||||
// multimodal media-only - empty text + image on last user msg.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "user"; in.content = ""; in.is_last_user_msg = true;
|
||||
in.images.push_back("BASE64IMG");
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_array() && m["content"].size() == 1 && m["content"][0]["type"] == "image_url",
|
||||
"recon_media_only", "got " + m["content"].dump());
|
||||
}
|
||||
// #7528 - tool array-string content stays a string.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "tool"; in.content = R"(["a","b"])"; in.tool_call_id = "call_1";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_string() && m["content"] == R"(["a","b"])", "recon_tool_array_string",
|
||||
"got " + m["content"].dump());
|
||||
check(m["tool_call_id"] == "call_1", "recon_tool_call_id_set");
|
||||
}
|
||||
// tool empty content -> "".
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "tool"; in.content = "";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_string() && m["content"] == "", "recon_tool_empty_to_string");
|
||||
}
|
||||
// #7324 - assistant + tool_calls + empty content -> " " (single space, not "").
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "assistant"; in.content = "";
|
||||
in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"].is_string() && m["content"] == " ", "recon_toolcalls_empty_content_space",
|
||||
"got " + m["content"].dump());
|
||||
check(m["tool_calls"].is_array() && m["tool_calls"].size() == 1, "recon_toolcalls_parsed");
|
||||
}
|
||||
// assistant + tool_calls + real content keeps the content.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "assistant"; in.content = "I'll call f";
|
||||
in.tool_calls = R"([{"id":"c1","type":"function","function":{"name":"f","arguments":"{}"}}])";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"] == "I'll call f", "recon_toolcalls_with_content_kept");
|
||||
}
|
||||
// assistant null content -> "".
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "assistant"; in.content = "null";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"] == "", "recon_assistant_null_to_empty");
|
||||
}
|
||||
// malformed tool_calls JSON must not throw; content preserved.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "assistant"; in.content = "hi"; in.tool_calls = "{not json";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["content"] == "hi" && !m.contains("tool_calls"), "recon_malformed_toolcalls_safe");
|
||||
}
|
||||
// optional fields: name + reasoning carried through.
|
||||
{
|
||||
ReconstructedMessageInput in;
|
||||
in.role = "tool"; in.content = "result"; in.name = "get_weather"; in.reasoning_content = "thinking";
|
||||
auto m = build_reconstructed_message(in);
|
||||
check(m["name"] == "get_weather" && m["reasoning_content"] == "thinking", "recon_optional_fields");
|
||||
}
|
||||
}
|
||||
|
||||
int main() {
|
||||
test_normalize();
|
||||
test_template_sanitizer();
|
||||
test_reconstruction();
|
||||
|
||||
if (failures == 0) {
|
||||
std::cout << "OK: all message_content tests passed\n";
|
||||
return 0;
|
||||
}
|
||||
std::cerr << failures << " test(s) failed\n";
|
||||
return 1;
|
||||
}
|
||||
@@ -18,6 +18,10 @@ done
|
||||
|
||||
cp -r CMakeLists.txt llama.cpp/tools/grpc-server/
|
||||
cp -r grpc-server.cpp llama.cpp/tools/grpc-server/
|
||||
# Shared message-reconstruction helpers (included by grpc-server.cpp) and their
|
||||
# unit test (compiled only when -DLLAMA_GRPC_BUILD_TESTS=ON).
|
||||
cp -r message_content.h llama.cpp/tools/grpc-server/
|
||||
cp -r message_content_test.cpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv llama.cpp/vendor/nlohmann/json.hpp llama.cpp/tools/grpc-server/
|
||||
cp -rfv llama.cpp/vendor/cpp-httplib/httplib.h llama.cpp/tools/grpc-server/
|
||||
|
||||
|
||||
71
backend/cpp/run-unit-tests.sh
Executable file
71
backend/cpp/run-unit-tests.sh
Executable file
@@ -0,0 +1,71 @@
|
||||
#!/bin/bash
|
||||
#
|
||||
# Discovers and runs every standalone C++ unit test under backend/cpp/.
|
||||
#
|
||||
# A "standalone" unit test is a *_test.cpp that depends only on the C++ standard
|
||||
# library and nlohmann/json (single header) - i.e. it exercises pure helpers and
|
||||
# does not need the full llama.cpp + gRPC backend build. Tests that DO need the
|
||||
# backend build use the CMake/ctest path (e.g. -DLLAMA_GRPC_BUILD_TESTS=ON)
|
||||
# instead and are skipped here.
|
||||
#
|
||||
# This keeps CI generic: adding a new pure-C++ unit test file named *_test.cpp in
|
||||
# an active backend source dir is picked up automatically, with no CI edits.
|
||||
#
|
||||
# Env:
|
||||
# NLOHMANN_INCLUDE include dir that contains nlohmann/json.hpp. If unset, the
|
||||
# nlohmann/json single header is fetched to a temp dir.
|
||||
# CXX compiler (default: g++).
|
||||
# JSON_VERSION nlohmann/json tag to fetch when NLOHMANN_INCLUDE is unset
|
||||
# (default: v3.11.3).
|
||||
set -uo pipefail
|
||||
|
||||
ROOT="$(cd "$(dirname "$0")" && pwd)"
|
||||
CXX="${CXX:-g++}"
|
||||
JSON_VERSION="${JSON_VERSION:-v3.11.3}"
|
||||
|
||||
JSON_INC="${NLOHMANN_INCLUDE:-}"
|
||||
if [ -z "$JSON_INC" ]; then
|
||||
JSON_INC="$(mktemp -d)"
|
||||
mkdir -p "$JSON_INC/nlohmann"
|
||||
echo "Fetching nlohmann/json ${JSON_VERSION} single header..."
|
||||
if ! curl -L -sf \
|
||||
"https://raw.githubusercontent.com/nlohmann/json/${JSON_VERSION}/single_include/nlohmann/json.hpp" \
|
||||
-o "$JSON_INC/nlohmann/json.hpp"; then
|
||||
echo "ERROR: failed to fetch nlohmann/json header" >&2
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
|
||||
# Active source dirs only - exclude per-variant build copies, dev snapshots and
|
||||
# the vendored upstream llama.cpp tree.
|
||||
mapfile -t tests < <(find "$ROOT" -name '*_test.cpp' \
|
||||
-not -path '*/llama.cpp/*' \
|
||||
-not -path '*-build/*' \
|
||||
-not -path '*-dev/*' \
|
||||
-not -path '*fallback*' | sort)
|
||||
|
||||
if [ "${#tests[@]}" -eq 0 ]; then
|
||||
echo "No standalone C++ unit tests found under $ROOT"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
fail=0
|
||||
for test_src in "${tests[@]}"; do
|
||||
name="$(basename "$test_src" .cpp)"
|
||||
bin="$(mktemp -d)/$name"
|
||||
echo "==> $test_src"
|
||||
if ! "$CXX" -std=c++17 -Wall -Wextra \
|
||||
-I"$JSON_INC" -I"$(dirname "$test_src")" \
|
||||
"$test_src" -o "$bin"; then
|
||||
echo "COMPILE FAILED: $test_src" >&2
|
||||
fail=1
|
||||
continue
|
||||
fi
|
||||
if ! "$bin"; then
|
||||
echo "TEST FAILED: $test_src" >&2
|
||||
fail=1
|
||||
fi
|
||||
done
|
||||
|
||||
echo "Ran ${#tests[@]} standalone C++ unit test file(s)"
|
||||
exit "$fail"
|
||||
@@ -68,6 +68,32 @@ var _ = Describe("LLM tests", func() {
|
||||
Expect(protoMessages[0].Content).To(Equal("Hello World"))
|
||||
})
|
||||
|
||||
// Regression for mudler/LocalAI#10524: a text part whose inner text is
|
||||
// itself a JSON-array string (mealie sends an ingredient list) must
|
||||
// flatten to that exact string verbatim. ToProto must NOT escape or
|
||||
// restructure it - the C++ backend then treats it as opaque text. This
|
||||
// pins the precise Go-side input that produced the "unsupported
|
||||
// content[].type" gRPC error before the backend stopped re-parsing it.
|
||||
It("flattens a JSON-array-looking text part to the verbatim string (#10524)", func() {
|
||||
ingredients := `["1/4 cup brown sugar, packed","1 pound ground beef"]`
|
||||
messages := Messages{
|
||||
{
|
||||
Role: "user",
|
||||
Content: []any{
|
||||
map[string]any{
|
||||
"type": "text",
|
||||
"text": ingredients,
|
||||
},
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
protoMessages := messages.ToProto()
|
||||
|
||||
Expect(protoMessages).To(HaveLen(1))
|
||||
Expect(protoMessages[0].Content).To(Equal(ingredients))
|
||||
})
|
||||
|
||||
It("should convert message with tool_calls", func() {
|
||||
messages := Messages{
|
||||
{
|
||||
|
||||
Reference in New Issue
Block a user