diff --git a/backend/cpp/llama-cpp/Makefile b/backend/cpp/llama-cpp/Makefile index a33f1ceaa..13f0de3ee 100644 --- a/backend/cpp/llama-cpp/Makefile +++ b/backend/cpp/llama-cpp/Makefile @@ -1,5 +1,5 @@ -LLAMA_VERSION?=9e3b928fd8c9d14dbf15a8768b9fdd7e5c721d66 +LLAMA_VERSION?=28ca1e600c5dac1854fb7e09611914013430b037 LLAMA_REPO?=https://github.com/ggerganov/llama.cpp CMAKE_ARGS?= diff --git a/backend/cpp/llama-cpp/grpc-server.cpp b/backend/cpp/llama-cpp/grpc-server.cpp index 90a5477a9..9ec806cd1 100644 --- a/backend/cpp/llama-cpp/grpc-server.cpp +++ b/backend/cpp/llama-cpp/grpc-server.cpp @@ -381,6 +381,15 @@ json parse_options(bool streaming, const backend::PredictOptions* predict, const }); } + // for each video in the request, add the video data + for (int i = 0; i < predict->videos_size(); i++) { + data["video_data"].push_back(json + { + {"id", i}, + {"data", predict->videos(i)}, + }); + } + data["stop"] = predict->stopprompts(); // data["n_probs"] = predict->nprobs(); //TODO: images, @@ -1503,7 +1512,7 @@ public: msg_json["role"] = msg.role(); bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0); + bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); // Handle content - can be string, null, or array // For multimodal content, we'll embed images/audio from separate fields @@ -1554,6 +1563,16 @@ public: content_array.push_back(audio_chunk); } } + if (request->videos_size() > 0) { + for (int j = 0; j < request->videos_size(); j++) { + json video_chunk; + video_chunk["type"] = "input_video"; + json input_video; + input_video["data"] = request->videos(j); + video_chunk["input_video"] = input_video; + content_array.push_back(video_chunk); + } + } msg_json["content"] = content_array; } else { // Use content as-is (already array or not last user message) @@ -1588,6 +1607,16 @@ public: content_array.push_back(audio_chunk); } } 
+ if (request->videos_size() > 0) { + for (int j = 0; j < request->videos_size(); j++) { + json video_chunk; + video_chunk["type"] = "input_video"; + json input_video; + input_video["data"] = request->videos(j); + video_chunk["input_video"] = input_video; + content_array.push_back(video_chunk); + } + } msg_json["content"] = content_array; } else if (msg.role() == "tool") { // Tool role messages must have content field set, even if empty @@ -2039,6 +2068,16 @@ public: files.push_back(decoded_data); } } + + const auto &video_data = data.find("video_data"); + if (video_data != data.end() && video_data->is_array()) + { + for (const auto &video : *video_data) + { + auto decoded_data = base64_decode(video["data"].get<std::string>()); + files.push_back(decoded_data); + } + } } const bool has_mtmd = ctx_server.impl->mctx != nullptr; @@ -2291,7 +2330,7 @@ public: } bool is_last_user_msg = (i == last_user_msg_idx); - bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0); + bool has_images_or_audio = (request->images_size() > 0 || request->audios_size() > 0 || request->videos_size() > 0); // Handle content - can be string, null, or array // For multimodal content, we'll embed images/audio from separate fields @@ -2344,6 +2383,16 @@ public: content_array.push_back(audio_chunk); } } + if (request->videos_size() > 0) { + for (int j = 0; j < request->videos_size(); j++) { + json video_chunk; + video_chunk["type"] = "input_video"; + json input_video; + input_video["data"] = request->videos(j); + video_chunk["input_video"] = input_video; + content_array.push_back(video_chunk); + } + } msg_json["content"] = content_array; } else { // Use content as-is (already array or not last user message) @@ -2383,6 +2432,16 @@ public: content_array.push_back(audio_chunk); } } + if (request->videos_size() > 0) { + for (int j = 0; j < request->videos_size(); j++) { + json video_chunk; + video_chunk["type"] = "input_video"; + json input_video; + input_video["data"] = 
request->videos(j); + video_chunk["input_video"] = input_video; + content_array.push_back(video_chunk); + } + } msg_json["content"] = content_array; SRV_INF("[CONTENT DEBUG] Predict: Message %d created content array with media\n", i); } else if (!msg.tool_calls().empty()) { @@ -2845,6 +2904,16 @@ public: files.push_back(decoded_data); } } + + const auto &video_data = data.find("video_data"); + if (video_data != data.end() && video_data->is_array()) + { + for (const auto &video : *video_data) + { + auto decoded_data = base64_decode(video["data"].get<std::string>()); + files.push_back(decoded_data); + } + } } // process files diff --git a/core/http/react-ui/src/hooks/useChat.js b/core/http/react-ui/src/hooks/useChat.js index 539190b99..5bd491e77 100644 --- a/core/http/react-ui/src/hooks/useChat.js +++ b/core/http/react-ui/src/hooks/useChat.js @@ -216,6 +216,12 @@ export function useChat(initialModel = '') { audio_url: { url: `data:${file.type};base64,${file.base64}` }, }) userFiles.push({ name: file.name, type: 'audio' }) + } else if (file.type?.startsWith('video/')) { + messageContent.push({ + type: 'video_url', + video_url: { url: `data:${file.type};base64,${file.base64}` }, + }) + userFiles.push({ name: file.name, type: 'video' }) } else { // Text/PDF files - append to content if (file.textContent) { diff --git a/core/http/react-ui/src/pages/Chat.jsx b/core/http/react-ui/src/pages/Chat.jsx index a638aa3a8..2c51fe942 100644 --- a/core/http/react-ui/src/pages/Chat.jsx +++ b/core/http/react-ui/src/pages/Chat.jsx @@ -265,7 +265,7 @@ function UserMessageContent({ content, files }) {
{files.map((f, i) => ( - + {f.name} ))} @@ -274,6 +274,9 @@ function UserMessageContent({ content, files }) { {Array.isArray(content) && content.filter(c => c.type === 'image_url').map((img, i) => ( attached ))} + {Array.isArray(content) && content.filter(c => c.type === 'video_url').map((vid, i) => ( +