feat(llama.cpp): upgrade and use libmtmd (#5379)

* WIP

* wip

* wip

* Make it compile

* Update json.hpp

* this shouldn't be private for now

* Add logs

* Reset auto detected template

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Re-enable grammars

* This seems to be broken - 360a9c98e1 (diff-a18a8e64e12a01167d8e98fc)[…]cccf0d4eed09d76d879L2998-L3207

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Placeholder

* Simplify image loading

* use completion type

* disable streaming

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* correctly return timings

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Remove some debug logging

* Adapt tests

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Keep header

* embedding: do not use oai type

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Sync from server.cpp

* Use utils and json directly from llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Sync with upstream

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: copy json.hpp from the correct location

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* fix: add httplib

* sync llama.cpp

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* Embeddings: set OAICOMPAT_TYPE_EMBEDDING

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* feat: sync with server.cpp by including it

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

* make it darwin-compatible

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>

---------

Signed-off-by: Ettore Di Giacinto <mudler@localai.io>
Ettore Di Giacinto
2025-05-17 16:02:53 +02:00
committed by GitHub
parent 6ef383033b
commit 6d5bde860b
8 changed files with 648 additions and 27490 deletions


@@ -21,7 +21,8 @@ type MultimodalContent struct {
 ID int
 }
-const DefaultMultiModalTemplate = "{{ range .Audio }}[audio-{{.ID}}]{{end}}{{ range .Images }}[img-{{.ID}}]{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
+// https://github.com/ggml-org/llama.cpp/blob/be1d4a13db26750fac702ceb3af88ae4f39dc9f4/tools/mtmd/mtmd.h#L42
+const DefaultMultiModalTemplate = "{{ range .Audio }}[audio-{{.ID}}]{{end}}{{ range .Images }}<__image__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
 func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
 if templateString == "" {
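
The substantive change in this hunk: with libmtmd, image placeholders in the default multimodal template are no longer indexed per image ([img-N]) but use the fixed <__image__> media marker defined in mtmd.h (linked in the comment above); each marker occurrence stands in for the next image passed alongside the prompt, so per-image IDs are no longer needed in the text. Audio and video placeholders are unchanged. Below is a minimal, self-contained sketch of how the new template renders, using Go's text/template directly; promptData is a hypothetical stand-in for the data the real code executes the template with.

package main

import (
	"os"
	"text/template"
)

// MultimodalContent is the element type the template ranges over (see the hunk above).
type MultimodalContent struct {
	ID int
}

// promptData is a hypothetical stand-in for the options the real code passes to the template.
type promptData struct {
	Audio  []MultimodalContent
	Images []MultimodalContent
	Video  []MultimodalContent
	Text   string
}

const defaultMultiModalTemplate = "{{ range .Audio }}[audio-{{.ID}}]{{end}}{{ range .Images }}<__image__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"

func main() {
	tmpl := template.Must(template.New("multimodal").Parse(defaultMultiModalTemplate))
	data := promptData{
		Audio:  []MultimodalContent{{ID: 0}},
		Images: []MultimodalContent{{ID: 2}, {ID: 3}}, // image IDs no longer appear in the output
		Text:   "bar",
	}
	// Prints: [audio-0]<__image__><__image__>bar
	if err := tmpl.Execute(os.Stdout, data); err != nil {
		panic(err)
	}
}

This matches the expectations in the updated tests below: the rendered image marker is identical regardless of the image's ID.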


@@ -20,7 +20,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[img-0]bar"))
+Expect(result).To(Equal("<__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
@@ -33,7 +33,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[img-0][img-1]bar"))
+Expect(result).To(Equal("<__image__><__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -45,7 +45,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0][img-2][img-3]bar"))
+Expect(result).To(Equal("[audio-0]<__image__><__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{
@@ -57,7 +57,7 @@ var _ = Describe("EvaluateTemplate", func() {
 VideosInMessage: 0,
 }, "bar")
 Expect(err).NotTo(HaveOccurred())
-Expect(result).To(Equal("[audio-0][img-2]bar"))
+Expect(result).To(Equal("[audio-0]<__image__>bar"))
 })
 It("should handle messages with more images correctly", func() {
 result, err := TemplateMultiModal("", MultiModalOptions{