Upstream llama.cpp (PR #21962) switched the server-side mtmd media marker to a random per-server string and removed the legacy "<__media__>" backward-compat replacement in mtmd_tokenizer. The Go layer still emitted the hardcoded "<__media__>", so on the non-tokenizer-template path the prompt arrived with a marker mtmd did not recognize and tokenization failed with "number of bitmaps (1) does not match number of markers (0)".

Report the active media marker via ModelMetadataResponse.media_marker and substitute the sentinel "<__media__>" with it right before the gRPC call, after the backend has been loaded and probed. Also skip the Go-side multimodal templating entirely when UseTokenizerTemplate is true: llama.cpp's oaicompat_chat_params_parse already injects its own marker, and StringContent is unused in that path.

Backends that do not expose the field keep the legacy "<__media__>" behavior.
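A minimal sketch of the late substitution described above (illustrative only: the real wiring lives in core/backend/llm.go, and everything here except DefaultMultiMediaMarker and the ModelMetadataResponse.media_marker field name is an assumption):

package main

import (
	"fmt"
	"strings"
)

// DefaultMultiMediaMarker mirrors the sentinel constant from the templates package.
const DefaultMultiMediaMarker = "<__media__>"

// substituteMediaMarker swaps the Go-side sentinel for the marker the backend
// reported via ModelMetadataResponse.media_marker. An empty marker means the
// backend does not expose the field, so the legacy "<__media__>" stays in place.
func substituteMediaMarker(prompt, backendMarker string) string {
	if backendMarker == "" {
		return prompt
	}
	return strings.ReplaceAll(prompt, DefaultMultiMediaMarker, backendMarker)
}

func main() {
	// In LocalAI the marker would come from the probed backend's metadata;
	// "<__media_4f2a__>" here stands in for the random per-server marker.
	fmt.Println(substituteMediaMarker("<__media__>describe this image", "<__media_4f2a__>"))
	// Output: <__media_4f2a__>describe this image
}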
package templates

import (
	"bytes"
	"text/template"

	"github.com/Masterminds/sprig/v3"
)

// MultiModalOptions carries the media counts used to render a message:
// totals for the whole conversation so far, plus counts for the current
// message, so each item can be assigned a conversation-global ID.
type MultiModalOptions struct {
	TotalImages int
	TotalAudios int
	TotalVideos int

	ImagesInMessage int
	AudiosInMessage int
	VideosInMessage int
}

// MultimodalContent is the per-item datum handed to the template; ID is the
// item's conversation-global index.
type MultimodalContent struct {
	ID int
}

// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
// prompt for each image/audio item. It matches llama.cpp's historical
// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
// per-server marker (see PR #21962) and reports it via ModelMetadataResponse.media_marker;
// callers substitute this sentinel with the backend-reported marker right before
// the gRPC call (core/backend/llm.go).
const DefaultMultiMediaMarker = "<__media__>"

// DefaultMultiModalTemplate renders a per-message media-marker prefix followed
// by the text content. The sentinel marker is substituted late, so this
// template does not need to know the backend-specific marker.
//
// References:
// - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
// - https://github.com/ggml-org/llama.cpp/pull/21962
const DefaultMultiModalTemplate = "{{ range .Audio }}<__media__>{{end}}{{ range .Images }}<__media__>{{end}}{{ range .Video }}[vid-{{.ID}}]{{end}}{{.Text}}"

// TemplateMultiModal renders templateString (or DefaultMultiModalTemplate when
// empty) against the media counts in opts and the message text.
func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
	if templateString == "" {
		templateString = DefaultMultiModalTemplate
	}

	// compile the template
	tmpl, err := template.New("template").Funcs(sprig.FuncMap()).Parse(templateString)
	if err != nil {
		return "", err
	}

	// IDs are conversation-global: items from earlier messages occupy
	// IDs [0, Total-InMessage), so this message's items start right after them.
	videos := []MultimodalContent{}
	for i := range opts.VideosInMessage {
		videos = append(videos, MultimodalContent{ID: i + (opts.TotalVideos - opts.VideosInMessage)})
	}

	audios := []MultimodalContent{}
	for i := range opts.AudiosInMessage {
		audios = append(audios, MultimodalContent{ID: i + (opts.TotalAudios - opts.AudiosInMessage)})
	}

	images := []MultimodalContent{}
	for i := range opts.ImagesInMessage {
		images = append(images, MultimodalContent{ID: i + (opts.TotalImages - opts.ImagesInMessage)})
	}

	result := bytes.NewBuffer(nil)
	// execute the template
	err = tmpl.Execute(result, struct {
		Audio  []MultimodalContent
		Images []MultimodalContent
		Video  []MultimodalContent
		Text   string
	}{
		Audio:  audios,
		Images: images,
		Video:  videos,
		Text:   text,
	})
	return result.String(), err
}
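For reference, a usage sketch (the counts are made up and this is not a test from the repository; it assumes a _test.go file alongside this package). With the default template, a message carrying two images and one video in a conversation that already contained one image renders one sentinel per image plus an indexed video tag:

package templates

import "testing"

func TestTemplateMultiModalDefault(t *testing.T) {
	out, err := TemplateMultiModal("", MultiModalOptions{
		TotalImages:     3, // one image appeared in an earlier message
		ImagesInMessage: 2,
		TotalVideos:     1,
		VideosInMessage: 1,
	}, "describe these")
	if err != nil {
		t.Fatal(err)
	}
	// Images render as bare sentinels; videos carry their conversation-global ID.
	if out != "<__media__><__media__>[vid-0]describe these" {
		t.Fatalf("unexpected render: %q", out)
	}
}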