LocalAI/core/templates/multimodal.go

package templates

import (
	"bytes"
	"text/template"

	"github.com/Masterminds/sprig/v3"
)

// MultiModalOptions carries the media counters TemplateMultiModal needs to
// number the items attached to a single message.
//
// For each media kind, TemplateMultiModal assigns the message's items the IDs
// Total - InMessage, ..., Total - 1, so the Total* value is expected to already
// include the items of the message being rendered.
type MultiModalOptions struct {
	// Running per-kind totals (see the numbering rule above).
	TotalImages, TotalAudios, TotalVideos int

	// Number of items of each kind attached to the message being rendered.
	ImagesInMessage, AudiosInMessage, VideosInMessage int
}

// MultimodalContent is the per-item value handed to a multimodal template for
// every audio, image and video entry of a message. ID is the item's index as
// computed by TemplateMultiModal from the MultiModalOptions counters.
type MultimodalContent struct{ ID int }

// DefaultMultiMediaMarker is the sentinel marker LocalAI emits in the rendered
// prompt for each image/audio item. It matches llama.cpp's historical
// mtmd_default_marker() ("<__media__>"). llama.cpp's server now picks a random
// per-server marker (see PR #21962) and reports it via
// ModelMetadataResponse.media_marker; callers substitute this sentinel with the
// backend-reported marker right before the gRPC call (core/backend/llm.go).
const DefaultMultiMediaMarker = "<__media__>"

// DefaultMultiModalTemplate renders a per-message media prefix followed by the
// text content: one DefaultMultiMediaMarker per audio item, then one per image,
// then a "[vid-<ID>]" tag per video, and finally the text itself.
//
// The marker is spliced in from DefaultMultiMediaMarker rather than repeated as
// a literal so the two constants cannot drift apart. The sentinel marker is
// substituted late, so this template does not need to know the
// backend-specific marker.
//
// References:
//   - https://github.com/ggml-org/llama.cpp/blob/79c137f77677b3c8ee3c60a7da033721b938399a/tools/mtmd/mtmd.cpp#L83
//   - https://github.com/ggml-org/llama.cpp/pull/21962
const DefaultMultiModalTemplate = "{{ range .Audio }}" + DefaultMultiMediaMarker + "{{end}}" +
	"{{ range .Images }}" + DefaultMultiMediaMarker + "{{end}}" +
	"{{ range .Video }}[vid-{{.ID}}]{{end}}" +
	"{{.Text}}"

// TemplateMultiModal renders the multimodal prefix and text for one message.
//
// templateString is a text/template source with the sprig function map
// available; when it is empty, DefaultMultiModalTemplate is used. The template
// is executed against a value exposing:
//
//   - .Audio, .Images, .Video: one MultimodalContent per item of that kind in
//     the message, with IDs running from Total-InMessage up to Total-1 as given
//     by opts
//   - .Text: the text argument, unchanged
//
// A parse failure returns an empty string and the parse error. An execution
// failure returns the error together with whatever output had been written
// before the failure, so callers must check the error before trusting the
// string.
func TemplateMultiModal(templateString string, opts MultiModalOptions, text string) (string, error) {
	if templateString == "" {
		templateString = DefaultMultiModalTemplate
	}

	tmpl, err := template.New("template").Funcs(sprig.FuncMap()).Parse(templateString)
	if err != nil {
		return "", err
	}

	data := struct {
		Audio  []MultimodalContent
		Images []MultimodalContent
		Video  []MultimodalContent
		Text   string
	}{
		Audio:  multimodalContentsForMessage(opts.TotalAudios, opts.AudiosInMessage),
		Images: multimodalContentsForMessage(opts.TotalImages, opts.ImagesInMessage),
		Video:  multimodalContentsForMessage(opts.TotalVideos, opts.VideosInMessage),
		Text:   text,
	}

	var out bytes.Buffer
	err = tmpl.Execute(&out, data)
	// The partially rendered output is intentionally returned even when
	// execution fails; this preserves the function's existing contract.
	return out.String(), err
}

// multimodalContentsForMessage builds the template entries for one media kind:
// inMessage items whose IDs start at total-inMessage. A zero or negative
// inMessage yields an empty, non-nil slice.
func multimodalContentsForMessage(total, inMessage int) []MultimodalContent {
	// max guards make against a negative capacity, which would panic.
	items := make([]MultimodalContent, 0, max(inMessage, 0))
	firstID := total - inMessage
	for i := range inMessage {
		items = append(items, MultimodalContent{ID: firstID + i})
	}
	return items
}