Files
LocalAI/core/templates/multimodal.go
Ettore Di Giacinto 7809c5f5d0 fix(vision): propagate mtmd media marker from backend via ModelMetadata (#9412)
Upstream llama.cpp (PR #21962) switched the server-side mtmd media
marker to a random per-server string and removed the legacy
"<__media__>" backward-compat replacement in mtmd_tokenizer. The
Go layer still emitted the hardcoded "<__media__>", so on the
non-tokenizer-template path the prompt arrived with a marker mtmd
did not recognize and tokenization failed with "number of bitmaps
(1) does not match number of markers (0)".

Report the active media marker via ModelMetadataResponse.media_marker
and substitute the sentinel "<__media__>" with it right before the
gRPC call, after the backend has been loaded and probed. Also skip
the Go-side multimodal templating entirely when UseTokenizerTemplate
is true — llama.cpp's oaicompat_chat_params_parse already injects its
own marker and StringContent is unused in that path. Backends that do
not expose the field keep the legacy "<__media__>" behavior.
2026-04-18 20:30:13 +02:00

82 lines
2.4 KiB
Go

package templates
import (
"bytes"
"text/template"
"github.com/Masterminds/sprig/v3"
)
type MultiModalOptions struct {
TotalImages int
TotalAudios int
TotalVideos int
ImagesInMessage int
AudiosInMessage int
VideosInMessage int
}
type MultimodalContent struct {
ID int
}
// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
// prompt for each image/audio item. It matches llama.cpp's historical
// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker;
// callers substitute this sentinel with the backend-reported marker right before
// the gRPC call (core/backend/llm.go).
const DefaultMultiMediaMarker = "<__media__>"
// DefaultMultiModalTemplate renders a per-message media-marker prefix followed
// by the text content. The sentinel marker is substituted late, so this
// template does not need to know the backend-specific marker.
//
// References:
// - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
// - https://github.com/ggml-org/llama.cpp/pull/21962
const DefaultMultiModalTemplate = "{{ range .Audio }}<__media__>{{end}}{{ range .Images }}<__media__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"
func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
if templateString == "" {
templateString = DefaultMultiModalTemplate
}
// compile the template
tmpl, err := template.New("template").Funcs(sprig.FuncMap()).Parse(templateString)
if err != nil {
return "", err
}
videos := []MultimodalContent{}
for i := range opts.VideosInMessage {
videos = append(videos, MultimodalContent{ID: i + (opts.TotalVideos - opts.VideosInMessage)})
}
audios := []MultimodalContent{}
for i := range opts.AudiosInMessage {
audios = append(audios, MultimodalContent{ID: i + (opts.TotalAudios - opts.AudiosInMessage)})
}
images := []MultimodalContent{}
for i := range opts.ImagesInMessage {
images = append(images, MultimodalContent{ID: i + (opts.TotalImages - opts.ImagesInMessage)})
}
result := bytes.NewBuffer(nil)
// execute the template
err = tmpl.Execute(result, struct {
Audio []MultimodalContent
Images []MultimodalContent
Video []MultimodalContent
Text string
}{
Audio: audios,
Images: images,
Video: videos,
Text: text,
})
return result.String(), err
}