Compare commits


1 Commit

Author           SHA1        Message                                         Date
Bruce MacDonald  5a2cd7b48a  runner: add test for unicode token processing   2025-05-14 11:29:11 -07:00
23 changed files with 434 additions and 343 deletions

View File

@@ -139,8 +139,7 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
}
for _, t := range ts {
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
if !p.skipRepack {
t.SetRepacker(p.repack)
}
@@ -182,9 +181,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
}
var heads uint32
if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
if strings.HasSuffix(name, "attn_q.weight") {
heads = p.NumAttentionHeads
} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
} else if strings.HasSuffix(name, "attn_k.weight") {
heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
} else {
return nil, fmt.Errorf("unknown tensor for repack: %s", name)

View File

@@ -6,6 +6,7 @@ import (
"fmt"
"io"
"log/slog"
"math"
"slices"
"strings"
@@ -652,15 +653,24 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
numPatches*numPatches*headCount)
case "qwen25vl":
maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
temporalPatchSize := uint64(2)
numPatches := maxPixels / (patchSize * patchSize)
// Calculate max possible patches based on max_pixels
maxHeight := uint64(math.Sqrt(float64(maxPixels)))
maxWidth := maxPixels / maxHeight
maxGridHeight := maxHeight / patchSize
maxGridWidth := maxWidth / patchSize
// Account for merged patches (2x2 grid)
numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
// Calculate graph size based on typical operations in ProcessImage and createPatches
graphSize = 4 * (maxPixels*numChannels + // Original image storage
// Normalized pixels
maxPixels*numChannels +
// Patches storage (numPatches * channels * patchSize^2)
numPatches*numChannels*patchSize*patchSize +
// Self-attention calculations
// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
// Self-attention calculations (similar to other architectures)
numPatches*numPatches*headCount +
// Additional buffer for processing
embeddingLength*numPatches)
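
As a side note on the arithmetic above, a minimal standalone sketch of the new qwen25vl patch estimate. The max_pixels and spatial_merge_size defaults mirror the hunk; the patch size of 14 is an assumption, since it is read from metadata elsewhere and not visible here.

package main

import (
    "fmt"
    "math"
)

func main() {
    // Defaults mirroring the hunk above; a real model reads these from its KV metadata.
    maxPixels := uint64(28 * 28 * 1280) // vision.max_pixels default
    mergeSize := uint64(2)              // vision.spatial_merge_size default
    patchSize := uint64(14)             // assumed; not shown in this hunk

    // Treat max_pixels as a roughly square image to bound the patch grid.
    maxHeight := uint64(math.Sqrt(float64(maxPixels)))
    maxWidth := maxPixels / maxHeight
    maxGridHeight := maxHeight / patchSize
    maxGridWidth := maxWidth / patchSize

    // The 2x2 spatial merge collapses each group of four patches into one.
    numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
    fmt.Println("estimated patches after merge:", numPatches)
}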

View File

@@ -415,7 +415,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
}
defer file.Close()
ggml, _, err := ggml.Decode(file, 0)
ggml, _, err := ggml.Decode(file, 1024)
if err != nil {
return 0
}

View File

@@ -915,8 +915,6 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
if len(shape) != 4 {
panic("expected 4 dimensions")
} else if shape[3] != 0 {
panic("cuda does not support 4d tensors")
}
return &Tensor{

View File

@@ -2,30 +2,16 @@ package input
import "github.com/ollama/ollama/ml"
// Multimodal is a multimodal embedding or a component of one.
// For example, it could be a row of an image that can be processed
// independently.
type Multimodal struct {
// Tensor is the embedding data. Implementations may choose what to
// store here or it may be nil if not needed. However, any ml.Tensor
// objects must be stored here and not in Data.
Tensor ml.Tensor
// Data is implementation-specific opaque data, such as metadata on how
// to layout Tensor. It may be nil if not needed. It may also store larger
// objects such as complete images if they are to be processed later.
Data any
}
// Input represents one token in the input stream
type Input struct {
// Token is a single element of text.
Token int32
// Multimodal represents a non-text element such as an
// image (or part of one if the image can be processed in pieces).
// It may be used either together with Token or on its own.
Multimodal []Multimodal
// Multimodal is opaque data representing a non-text
// element such as an image (or part of one if the image
// can be processed in pieces). It may be used either together
// with Token or on its own.
Multimodal any
// MultimodalHash is a unique representation of the data
// stored in Multimodal, used for caching and comparing
@@ -46,7 +32,7 @@ type Input struct {
// Positions slice.
type MultimodalIndex struct {
Index int
Multimodal []Multimodal
Multimodal any
}
// Batch contains the inputs for a model forward pass

View File

@@ -40,13 +40,12 @@ type MultimodalProcessor interface {
// EncodeMultimodal processes a single input (such as an image) and
// generates an output (typically an embedding) that can be used by the model.
//
// The return value is one or more tensors, each with optional model-specific
// opaque metadata. Typically, the tensors might be views into an embedding
// with each view representing a chunk of data that can be processed independently
// in different batches.
// The return value is most typically an ml.Tensor, however, different
// types are possible, such as an object containing a tensor plus
// additional metadata, a slice of tensors or even just the original input.
//
// The result may be cached by the runner.
EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)
EncodeMultimodal(ml.Context, []byte) (any, error)
// PostTokenize is called after tokenization to allow the model to edit the
// input stream to correctly arrange multimodal elements.
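
For orientation, a minimal self-contained sketch of the pattern this contract implies once Multimodal is opaque: EncodeMultimodal hands back whatever value the model likes, and PostTokenize attaches it to the first placeholder token so the whole image stays in one batch. The Input stand-in and the token id 1000 are illustrative only; the real types live in github.com/ollama/ollama/model/input.

package main

import "fmt"

// Input is a stand-in for the runner's input.Input, reduced to the fields used here.
type Input struct {
    Token          int32
    Multimodal     any // opaque, model-specific value
    MultimodalHash uint64
    SameBatch      int
}

// expandImage mimics a PostTokenize implementation: the opaque value rides on the
// first placeholder token and the remaining positions are plain placeholders.
func expandImage(inp Input, patches int) []Input {
    out := []Input{{
        Token:          1000, // hypothetical image-placeholder token
        Multimodal:     inp.Multimodal,
        MultimodalHash: inp.MultimodalHash,
        SameBatch:      patches,
    }}
    for i := 1; i < patches; i++ {
        out = append(out, Input{Token: 1000})
    }
    return out
}

func main() {
    embedding := []float32{0.1, 0.2} // whatever EncodeMultimodal returned
    expanded := expandImage(Input{Multimodal: embedding, MultimodalHash: 42}, 4)
    fmt.Println(len(expanded), "inputs; data on the first:", expanded[0].Multimodal != nil)
}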

View File

@@ -82,7 +82,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -108,22 +108,22 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
return visionOutputs, nil
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
} else {
inputMultimodal := inp.Multimodal[0].Tensor
inputMultimodal := inp.Multimodal.(ml.Tensor)
result = append(result,
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>""
input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
input.Input{Token: 255999}, // "<start_of_image>""
input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
)
// add image token placeholders

View File

@@ -165,7 +165,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// set image embeddings
var except []int
for _, image := range batch.Multimodal {
visionOutputs := image.Multimodal[0].Tensor
visionOutputs := image.Multimodal.(ml.Tensor)
ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))
for i := range visionOutputs.Dim(1) {

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -62,7 +63,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) < 1 {
return nil, model.ErrNoVisionModel
}
@@ -102,79 +103,70 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
var multimodal []input.Multimodal
aspectRatio := image.Point{ratioW, ratioH}
var offset int
patchesPerChunk := projectedOutputs.Dim(1)
if aspectRatio.Y*aspectRatio.X > 1 {
patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
for range aspectRatio.Y {
for x := range aspectRatio.X {
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
var separator separator
if x < aspectRatio.X-1 {
separator.x = true // <|tile_x_separator|>
} else {
separator.y = true // <|tile_y_separator|>
}
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
offset += patchesPerChunk
}
}
}
view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
projectedOutputs.Dim(0), projectedOutputs.Stride(1),
patchesPerChunk)
multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
return multimodal, nil
return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
}
type separator struct {
x bool
y bool
type chunks struct {
*Model
ml.Tensor
aspectRatio image.Point
dataOnce sync.Once
data []float32
}
type chunk struct {
*chunks
s, n int
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
continue
}
t := inp.Multimodal.(*chunks)
var imageInputs []input.Input
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>
for i, mm := range inp.Multimodal {
patchesPerChunk := mm.Tensor.Dim(1)
var offset int
patchesPerChunk := t.Dim(1)
if t.aspectRatio.Y*t.aspectRatio.X > 1 {
patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
if i < len(inp.Multimodal)-1 {
separator := mm.Data.(*separator)
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if separator.x {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
for range t.aspectRatio.Y {
for x := range t.aspectRatio.X {
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
if x < t.aspectRatio.X-1 {
imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
}
offset += patchesPerChunk
}
if separator.y {
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
} else {
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
}
}
imageInputs = append(imageInputs, input.Input{Token: 200090}) // <|image|>
imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
result = append(result, imageInputs...)
}
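
The chunks/chunk pair above is the recurring pattern in this change: the projected image tensor is computed once, on first use, in a throwaway context, and each chunk then slices the cached floats. A self-contained sketch of that lazy materialization, assuming a plain float slice in place of an ml.Tensor:

package main

import (
    "fmt"
    "sync"
)

// chunks stands in for the full projected image output plus its lazily computed data.
type chunks struct {
    vals     []float32 // pretend tensor contents
    dim0     int       // elements per patch
    dataOnce sync.Once
    data     []float32
}

// chunk is a window of n patches starting at patch s, resolved lazily from its parent.
type chunk struct {
    *chunks
    s, n int
}

func (c *chunk) floats() []float32 {
    c.dataOnce.Do(func() {
        // The real code opens Backend().NewContext(), runs Forward(...).Compute(...)
        // and calls Floats(); here we just copy the values once.
        c.data = append([]float32(nil), c.vals...)
    })
    return c.data[c.s*c.dim0 : (c.s+c.n)*c.dim0]
}

func main() {
    parent := &chunks{vals: []float32{1, 2, 3, 4, 5, 6}, dim0: 2}
    a := &chunk{chunks: parent, s: 0, n: 1}
    b := &chunk{chunks: parent, s: 1, n: 2}
    fmt.Println(a.floats(), b.floats()) // [1 2] [3 4 5 6]; the parent is computed once
}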

View File

@@ -210,7 +210,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}

View File

@@ -4,6 +4,7 @@ import (
"bytes"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -104,7 +105,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
}
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -128,14 +129,37 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)
// split into patches to be sent to the text transformer
rows := make([]input.Multimodal, size.Y)
parent := imageFeatures{tensor: features}
rows := make([]*imageRow, size.Y)
for i := range rows {
rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
}
return rows, nil
}
type imageFeatures struct {
tensor ml.Tensor
dataOnce sync.Once
data []float32
}
type imageRow struct {
parent *imageFeatures
s int
shape []int
}
func (r *imageRow) data() []float32 {
n := 1
for _, s := range r.shape {
n *= s
}
return r.parent.data[r.s*n : (r.s+1)*n]
}
// PostTokenize arranges Mistral 3's inputs for the forward pass
// In Mistral 3 and Pixtral, the input patches are arranged as follows:
// [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@@ -144,14 +168,15 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
var result []input.Input
for _, inp := range inputs {
if len(inp.Multimodal) == 0 {
if inp.Multimodal == nil {
result = append(result, inp)
} else {
for i, row := range inp.Multimodal {
inputMultimodal := inp.Multimodal.([]*imageRow)
for i, row := range inputMultimodal {
// [IMG]
result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
if i == len(inp.Multimodal)-1 {
result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
if i == len(inputMultimodal)-1 {
// [IMG_END]
result = append(result, input.Input{Token: 13})
} else {
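
The hunk is cut off just before the row-separator branch, but the comment earlier in this file describes the layout: runs of [IMG] per row, [IMG_BREAK] between rows, [IMG_END] after the last. A small sketch of that arrangement; the [IMG] (10) and [IMG_END] (13) ids come from the hunk, while the [IMG_BREAK] id used here is only an assumption since it is not visible above.

package main

import "fmt"

const (
    imgToken = 10 // [IMG], from the hunk
    imgBreak = 12 // [IMG_BREAK], assumed; not shown above
    imgEnd   = 13 // [IMG_END], from the hunk
)

// layoutImage produces the Pixtral-style stream described in the PostTokenize comment:
// [IMG]...[IMG][IMG_BREAK] ... [IMG]...[IMG][IMG_END]
func layoutImage(rows, patchesPerRow int) []int32 {
    var out []int32
    for r := 0; r < rows; r++ {
        for p := 0; p < patchesPerRow; p++ {
            out = append(out, imgToken)
        }
        if r == rows-1 {
            out = append(out, imgEnd)
        } else {
            out = append(out, imgBreak)
        }
    }
    return out
}

func main() {
    fmt.Println(layoutImage(2, 3)) // [10 10 10 12 10 10 10 13]
}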

View File

@@ -9,6 +9,7 @@ import (
"github.com/ollama/ollama/kvcache"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/ml/nn"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
)
@@ -19,6 +20,8 @@ type TextOptions struct {
}
type TextModel struct {
model.Base
TokenEmbedding *nn.Embedding `gguf:"token_embd"`
Layers []Layer `gguf:"blk"`
OutputNorm *nn.RMSNorm `gguf:"output_norm"`
@@ -106,7 +109,20 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
// image embeddings
for _, image := range batch.Multimodal {
imageFeature := image.Multimodal[0].Tensor
row := image.Multimodal.(*imageRow)
row.parent.dataOnce.Do(func() {
// use a new, throwaway context so the image tensor is not added to the graph
temp := m.Backend().NewContext()
temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
row.parent.data = row.parent.tensor.Floats()
temp.Close()
})
imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
if err != nil {
panic(err)
}
ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
}

View File

@@ -3,7 +3,6 @@ package mllama
import (
"bytes"
"image"
"slices"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -59,7 +58,7 @@ func New(c fs.Config) (model.Model, error) {
return &m, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -74,17 +73,13 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
return nil, err
}
if ratio.numTiles() < m.maxNumTiles {
// Pad tiles to maxNumTiles
f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
}
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
if err != nil {
return nil, err
}
pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
if err != nil {
return nil, err
@@ -92,9 +87,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
return []input.Multimodal{{Tensor: projectedOutputs}}, nil
return m.Projector.Forward(ctx, crossAttentionStates), nil
}
func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
@@ -110,7 +103,7 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
var crossAttentionStates ml.Tensor
if len(batch.Multimodal) > 0 {
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
}
positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))

View File

@@ -5,6 +5,7 @@ import (
"fmt"
"image"
"slices"
"sync"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/kvcache"
@@ -76,7 +77,7 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
return pixelValues, grid, nil
}
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
if len(m.VisionModel.Layers) == 0 {
return nil, model.ErrNoVisionModel
}
@@ -87,7 +88,31 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input
}
visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
return []input.Multimodal{{Tensor: visionOutputs}}, nil
return &chunks{Model: m, Tensor: visionOutputs}, nil
}
type chunks struct {
*Model
ml.Tensor
dataOnce sync.Once
data []float32
}
type chunk struct {
*chunks
s, n int
}
func (r *chunk) floats() []float32 {
r.dataOnce.Do(func() {
temp := r.Backend().NewContext()
defer temp.Close()
temp.Forward(r.Tensor).Compute(r.Tensor)
r.data = r.Floats()
})
return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
}
// PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
@@ -117,16 +142,20 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
result = append(result, input.Input{Token: pre[i]})
}
patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)
// This is an image token with multimodal data
chunksData := inp.Multimodal.(*chunks)
patchesPerChunk := chunksData.Dim(1)
// First add the vision start token
result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 1})
result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
// Add the image token with the multimodal tensor data at the first position
// Create a chunk with proper s and n values
result = append(result, input.Input{
Token: imageToken,
Multimodal: inp.Multimodal,
Multimodal: &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
MultimodalHash: inp.MultimodalHash,
SameBatch: patchesPerChunk,
})
// Add the placeholder tokens for the remaining positions (tokensPerGrid-1)

View File

@@ -129,7 +129,12 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)
for _, mi := range batch.Multimodal {
img := mi.Multimodal[0].Tensor
f32s := mi.Multimodal.(*chunk).floats()
img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
if err != nil {
panic(err)
}
ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
}

View File

@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
"used", numPast, "remaining", len(prompt)-numPast)
slot.Inputs = prompt[:numPast]
prompt = prompt[numPast:]
slot.Inputs = slot.Inputs[:numPast]
return slot, prompt, nil
}

View File

@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
"used", numPast, "remaining", int32(len(prompt))-numPast)
slot.Inputs = prompt[:numPast]
prompt = prompt[numPast:]
slot.Inputs = slot.Inputs[:numPast]
return slot, prompt, nil
}

View File

@@ -3,6 +3,7 @@ package ollamarunner
import (
"errors"
"fmt"
"image"
"testing"
"time"
@@ -11,6 +12,10 @@ import (
)
func TestCountCommon(t *testing.T) {
imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
tests := []struct {
name string
t1 []input.Input
@@ -31,20 +36,20 @@ func TestCountCommon(t *testing.T) {
},
{
name: "Image Prefix",
t1: []input.Input{{MultimodalHash: 1}},
t2: []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
t1: []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
t2: []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
expected: 1,
},
{
name: "Mixed",
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
t2: []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
t2: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
expected: 2,
},
{
name: "Mixed, Same Length",
t1: []input.Input{{Token: 1}, {MultimodalHash: 1}},
t2: []input.Input{{Token: 1}, {MultimodalHash: 2}},
t1: []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
t2: []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
expected: 1,
},
{

View File

@@ -1,116 +0,0 @@
package ollamarunner
import (
"errors"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model/input"
)
// Tensors can't be used across multiple compute graphs. This is a problem
// if a single embedding is split across batches using views since all of
// the views will have the same source tensor. We also don't want to
// recompute the entire embedding for each batch.
//
// To avoid this, we compute all of the tensors for the embedding on the
// first use and then store the result in system memory. When we need
// additional tensors, we recreate them from the stored data.
// multimodalEntry represents the embeddings of a single object (such
// as an image).
type multimodalEntry struct {
// mm is the original set of tensors created by EncodeMultimodal
mm []input.Multimodal
// data is the computed result of mm. Nil if not yet computed
data [][]float32
}
// multimodalStore maps from an individual tensor (of which there
// may be many in a single multimodal object) to its parent embedding
type multimodalStore map[ml.Tensor]*multimodalEntry
func newMultimodalStore() multimodalStore {
return make(multimodalStore)
}
// addMultimodal stores an embedding for later use in a compute graph
func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
entry := &multimodalEntry{mm: embedding}
for _, e := range embedding {
if e.Tensor != nil {
m[e.Tensor] = entry
}
}
}
// getMultimodal takes a source set of tensors (which may contain a whole or
// parts of one or more images) and returns the equivalent that can be used in
// the current context
func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
out := make([]input.Multimodal, len(in))
for i := range out {
if in[i].Tensor != nil {
var err error
out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
if err != nil {
return nil, err
}
}
out[i].Data = in[i].Data
}
return out, nil
}
func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
entry := m[in]
if entry.data == nil {
computeCtx := backend.NewContext()
defer computeCtx.Close()
var tensors []ml.Tensor
for _, t := range entry.mm {
if t.Tensor != nil {
tensors = append(tensors, t.Tensor)
}
}
if len(tensors) == 0 {
return nil, nil
}
computeCtx.Forward(tensors...)
entry.data = make([][]float32, len(entry.mm))
if !reserve {
computeCtx.Compute(tensors...)
for i, t := range entry.mm {
if t.Tensor != nil {
entry.data[i] = t.Tensor.Floats()
}
}
} else {
err := computeCtx.Reserve()
if err != nil {
return nil, err
}
}
}
for i, t := range entry.mm {
if in == t.Tensor {
if !reserve {
return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...)
} else {
return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
}
}
}
return nil, errors.New("multimodal tensor not found")
}
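
The file deleted above existed for the reason its header comment gives: a tensor cannot be reused across compute graphs, so the store computed each embedding once, kept the floats in system memory, and rebuilt tensors from that data for later batches. A toy sketch of that compute-once-then-replay idea, with a plain struct standing in for ml.Tensor:

package main

import "fmt"

// fakeTensor stands in for an ml.Tensor that is only valid in the graph that created it.
type fakeTensor struct{ vals []float32 }

// store caches the computed floats per tensor so later graphs can rebuild from data
// instead of reusing the original tensor.
type store map[*fakeTensor][]float32

func (s store) get(t *fakeTensor) []float32 {
    if data, ok := s[t]; ok {
        return data // replayed from system memory
    }
    data := append([]float32(nil), t.vals...) // "compute" on first use
    s[t] = data
    return data
}

func main() {
    s := store{}
    t := &fakeTensor{vals: []float32{1, 2, 3}}
    fmt.Println(s.get(t)) // computed and cached
    fmt.Println(s.get(t)) // served from the cache
}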

View File

@@ -1,14 +1,12 @@
package ollamarunner
import (
"bytes"
"context"
"encoding/json"
"errors"
"flag"
"fmt"
"hash/maphash"
"image"
"log"
"log/slog"
"net"
@@ -22,7 +20,6 @@ import (
"time"
"unicode/utf8"
"golang.org/x/image/bmp"
"golang.org/x/sync/semaphore"
"github.com/ollama/ollama/api"
@@ -43,9 +40,6 @@ type Sequence struct {
// multimodal embeddings
ctxs []ml.Context
// mmStore holds multimodal embeddings to manage memory and enable splitting across batches
mmStore multimodalStore
// batch index
iBatch int
@@ -107,7 +101,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
startTime := time.Now()
inputs, ctxs, mmStore, err := s.inputs(prompt, images)
inputs, ctxs, err := s.inputs(prompt, images)
if err != nil {
return nil, fmt.Errorf("failed to process inputs: %w", err)
} else if len(inputs) == 0 {
@@ -162,7 +156,6 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
return &Sequence{
ctxs: ctxs,
mmStore: mmStore,
inputs: inputs,
numPromptInputs: len(inputs),
startProcessingTime: startTime,
@@ -181,10 +174,9 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
// inputs processes the prompt and images into a list of inputs
// by splitting the prompt on [img-<n>] tags, tokenizing text and
// decoding images
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) {
func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
var inputs []input.Input
var ctxs []ml.Context
var mmStore multimodalStore
var parts []string
var matches [][]string
@@ -195,7 +187,6 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
re := regexp.MustCompile(`\[img-(\d+)\]`)
parts = re.Split(prompt, -1)
matches = re.FindAllStringSubmatch(prompt, -1)
mmStore = newMultimodalStore()
} else {
parts = []string{prompt}
}
@@ -205,7 +196,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
// text - tokenize
tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
if err != nil {
return nil, nil, nil, err
return nil, nil, err
}
for _, t := range tokens {
@@ -225,7 +216,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
}
if imageIndex < 0 {
return nil, nil, nil, fmt.Errorf("invalid image index: %d", n)
return nil, nil, fmt.Errorf("invalid image index: %d", n)
}
ctx := s.model.Backend().NewContext()
@@ -233,15 +224,13 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
ctxs = append(ctxs, ctx)
imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
if err != nil {
return nil, nil, nil, err
return nil, nil, err
}
s.multimodalHash.Reset()
_, _ = s.multimodalHash.Write(images[imageIndex].Data)
imageHash := s.multimodalHash.Sum64()
mmStore.addMultimodal(imageEmbeddings)
inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
postTokenize = true
}
@@ -251,11 +240,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
var err error
inputs, err = multimodalProcessor.PostTokenize(inputs)
if err != nil {
return nil, nil, nil, err
return nil, nil, err
}
}
return inputs, ctxs, mmStore, nil
return inputs, ctxs, nil
}
type Server struct {
@@ -374,9 +363,6 @@ func (s *Server) processBatch() error {
}
defer s.mu.Unlock()
ctx := s.model.Backend().NewContext()
defer ctx.Close()
var batchInputs []int32
var batch input.Batch
@@ -447,11 +433,7 @@ func (s *Server) processBatch() error {
batchInputs = append(batchInputs, inp.Token)
if inp.Multimodal != nil {
mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false)
if err != nil {
return err
}
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm})
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
}
batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
@@ -477,6 +459,9 @@ func (s *Server) processBatch() error {
return nil
}
ctx := s.model.Backend().NewContext()
defer ctx.Close()
modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
if err != nil {
return fmt.Errorf("failed to decode batch: %w", err)
@@ -735,71 +720,12 @@ func (s *Server) reserveWorstCaseGraph() error {
ctx := s.model.Backend().NewContext()
defer ctx.Close()
var err error
inputs := make([]input.Input, s.batchSize)
mmStore := newMultimodalStore()
// Multimodal strategy:
// - Encode a 2048x2048 image. This assumes that a single image of this
// size is sufficient to trigger the worst case. This is currently true
// because for existing models, only a single image fits in a batch.
// - Add the embedding to a full batch of tokens - this is necessary because
// the model may be looking for non-image data, such as <image> tags.
// - Run PostTokenize to execute any transformations between generated
// embeddings and what the forward pass expects.
// - The result may now be larger than a batch (images may not fit in a
// single batch), so trim based on what will fit and must be grouped together.
// - Fill out the rest of the space with text tokens.
if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
mmCtx := s.model.Backend().NewContext()
defer mmCtx.Close()
img := image.NewGray(image.Rect(0, 0, 2048, 2048))
var buf bytes.Buffer
bmp.Encode(&buf, img)
if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
mmStore.addMultimodal(inputs[0].Multimodal)
inputs, err = multimodalProcessor.PostTokenize(inputs)
if err != nil {
return err
}
for i, inp := range inputs {
minBatch := 1 + inp.SameBatch
if minBatch > s.batchSize {
inputs = inputs[i:min(i+minBatch, len(inputs))]
break
} else if i+minBatch > s.batchSize {
inputs = inputs[:i]
break
}
}
if len(inputs) < s.batchSize {
newInputs := make([]input.Input, s.batchSize)
copy(newInputs, inputs)
inputs = newInputs
}
}
}
var batch input.Batch
batchInputs := make([]int32, len(inputs))
inputs := make([]int32, s.batchSize)
batch.Positions = make([]int32, len(inputs))
batch.Sequences = make([]int, len(inputs))
for i, inp := range inputs {
batchInputs[i] = inp.Token
if inp.Multimodal != nil {
mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
if err != nil {
return err
}
batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
}
for i := range inputs {
batch.Positions[i] = int32(i)
}
@@ -808,7 +734,8 @@ func (s *Server) reserveWorstCaseGraph() error {
batch.Outputs[i] = int32(i)
}
batch.Inputs, err = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))
var err error
batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
if err != nil {
return err
}
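
For context on the reservation code removed above, the trimming loop it contained enforced a simple rule: a group of inputs that must share a batch (1 + SameBatch entries) is either kept whole or dropped. A self-contained sketch of that rule, with the Input stand-in reduced to the fields involved:

package main

import "fmt"

type Input struct {
    Token     int32
    SameBatch int
}

// trimToBatch mirrors the removed loop: walk the post-tokenized inputs and cut them
// so no group that must stay together straddles the batch boundary. A group larger
// than a whole batch is kept on its own.
func trimToBatch(inputs []Input, batchSize int) []Input {
    for i, inp := range inputs {
        minBatch := 1 + inp.SameBatch
        if minBatch > batchSize {
            return inputs[i:min(i+minBatch, len(inputs))]
        } else if i+minBatch > batchSize {
            return inputs[:i]
        }
    }
    return inputs
}

func main() {
    in := []Input{{Token: 1}, {Token: 2, SameBatch: 3}, {Token: 3}, {Token: 4}, {Token: 5}}
    fmt.Println(len(trimToBatch(in, 4))) // prints 1: the image group at index 1 does not fit after the first token
}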

View File

@@ -0,0 +1,218 @@
package ollamarunner
import (
"context"
"sync"
"testing"
"github.com/ollama/ollama/fs"
"github.com/ollama/ollama/ml"
"github.com/ollama/ollama/model"
"github.com/ollama/ollama/model/input"
"github.com/ollama/ollama/sample"
"golang.org/x/sync/semaphore"
)
// testBackend implements ml.Backend with minimal functionality required for tests.
type testBackend struct{}
func (b *testBackend) Config() fs.Config { return testConfig{} }
func (b *testBackend) Get(string) ml.Tensor { return nil }
func (b *testBackend) NewContext() ml.Context { return &testContext{} }
func (b *testBackend) NewContextSize(int) ml.Context { return &testContext{} }
// testConfig is a stub implementation of fs.Config used by testBackend.
type testConfig struct{}
func (testConfig) Architecture() string { return "" }
func (testConfig) String(string, ...string) string { return "" }
func (testConfig) Uint(string, ...uint32) uint32 { return 0 }
func (testConfig) Float(string, ...float32) float32 { return 0 }
func (testConfig) Bool(string, ...bool) bool { return false }
func (testConfig) Strings(string, ...[]string) []string { return nil }
func (testConfig) Ints(string, ...[]int32) []int32 { return nil }
func (testConfig) Floats(string, ...[]float32) []float32 { return nil }
type testContext struct{}
func (c *testContext) Empty(dtype ml.DType, shape ...int) ml.Tensor {
sz := 1
for _, s := range shape {
sz *= s
}
return &testTensor{dtype: dtype, data: make([]float32, sz), shape: shape}
}
func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor { return c.Empty(dtype, shape...) }
func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
t := c.Empty(ml.DTypeF32, shape...).(*testTensor)
copy(t.data, s)
return t, nil
}
func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
f := make([]float32, len(s))
for i, v := range s {
f[i] = float32(v)
}
out, _ := c.FromFloatSlice(f, shape...)
out.(*testTensor).dtype = ml.DTypeI32
return out, nil
}
func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
return c.Empty(dtype, int((stop-start)/step))
}
func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }
func (c *testContext) Compute(...ml.Tensor) {}
func (c *testContext) Reserve() error { return nil }
func (c *testContext) MaxGraphNodes() int { return 0 }
func (c *testContext) Close() {}
func (c *testContext) Input() ml.Context { return c }
func (c *testContext) Layer(int) ml.Context { return c }
type testTensor struct {
ml.Tensor
dtype ml.DType
data []float32
shape []int
}
func (t *testTensor) Dim(n int) int { return t.shape[n] }
func (t *testTensor) Stride(n int) int { return 0 }
func (t *testTensor) Shape() []int { return t.shape }
func (t *testTensor) DType() ml.DType { return t.dtype }
func (t *testTensor) Bytes() []byte { return nil }
func (t *testTensor) Floats() []float32 {
out := make([]float32, len(t.data))
copy(out, t.data)
return out
}
func (t *testTensor) Neg(ctx ml.Context) ml.Tensor { return nil }
func (t *testTensor) Add(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
out, _ := ctx.(*testContext).FromFloatSlice(nil, len(t.data))
return out
}
func (t *testTensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
func (t *testTensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
func (t *testTensor) MulmatFullPrec(ctx ml.Context, t2 ml.Tensor) ml.Tensor { return nil }
func (t *testTensor) MulmatID(ctx ml.Context, t2, ids ml.Tensor) ml.Tensor { return nil }
func (t *testTensor) Softmax(ctx ml.Context) ml.Tensor { return nil }
func (t *testTensor) LayerNorm(ctx ml.Context, w, b ml.Tensor, e float32) ml.Tensor {
return nil
}
func (t *testTensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
return ctx.(*testContext).Empty(t.dtype, shape...)
}
func (t *testTensor) Copy(ctx ml.Context, dest ml.Tensor) ml.Tensor {
copy(dest.(*testTensor).data, t.data)
return nil
}
// fakeModel implements model.Model and model.TextProcessor.
type fakeModel struct {
model.Base
decode map[int32]string
logits [][]float32
call int
backend ml.Backend
}
func (f *fakeModel) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
idx := f.call
if idx >= len(f.logits) {
idx = len(f.logits) - 1
}
f.call++
return ctx.FromFloatSlice(f.logits[idx], len(f.logits[idx]))
}
func (f *fakeModel) Backend() ml.Backend {
if f.backend == nil {
f.backend = &testBackend{}
}
return f.backend
}
func (f *fakeModel) Encode(string, bool) ([]int32, error) { return nil, nil }
func (f *fakeModel) Decode(ids []int32) (string, error) {
var s string
for _, id := range ids {
s += f.decode[id]
}
return s, nil
}
func (f *fakeModel) Is(id int32, sp model.Special) bool { return false }
func (f *fakeModel) Vocabulary() *model.Vocabulary { return &model.Vocabulary{} }
var _ model.Model = (*fakeModel)(nil)
var _ model.TextProcessor = (*fakeModel)(nil)
func TestProcessBatchUnicode(t *testing.T) {
tests := []struct {
name string
decode map[int32]string
logits [][]float32
want string
}{
{
name: "emoji",
decode: map[int32]string{0: "A", 1: "😀", 2: "👍", 3: "!"},
logits: [][]float32{{10, 0, 0, 0}, {0, 10, 0, 0}, {0, 0, 10, 0}, {0, 0, 0, 10}},
want: "A😀👍!",
},
{
name: "ascii",
decode: map[int32]string{0: "H", 1: "e", 2: "y"},
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
want: "Hey",
},
{
name: "multibyte",
decode: map[int32]string{0: "世", 1: "界", 2: "😊"},
logits: [][]float32{{10, 0, 0}, {0, 10, 0}, {0, 0, 10}},
want: "世界😊",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
m := &fakeModel{decode: tt.decode, logits: tt.logits}
s := &Server{model: m, batchSize: 1, parallel: 1}
s.cache = &InputCache{enabled: true, slots: []InputCacheSlot{{Id: 0}}, numCtx: 10}
s.seqs = make([]*Sequence, 1)
s.seqsSem = semaphore.NewWeighted(1)
if err := s.seqsSem.Acquire(context.Background(), 1); err != nil {
t.Fatal(err)
}
s.cond = sync.NewCond(&s.mu)
seq := &Sequence{
inputs: []input.Input{{Token: 0}},
cache: &s.cache.slots[0],
responses: make(chan string, 10),
quit: make(chan bool, 1),
numPredict: len(tt.logits),
sampler: sample.NewSampler(0, 0, 0, 0, 0, nil),
embedding: make(chan []float32, 1),
}
s.seqs[0] = seq
for {
if err := s.processBatch(); err != nil {
t.Fatal(err)
}
if s.seqs[0] == nil {
break
}
}
var result string
for r := range seq.responses {
result += r
}
if result != tt.want {
t.Fatalf("got %q want %q", result, tt.want)
}
})
}
}

View File

@@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
fnWrap := func(n uint64) {
done := doneBytes.Add(n)
progress := float32(done) / float32(totalBytes)
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
}
ftype, err := ggml.ParseFileType(quantizeType)
if err != nil {
@@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
return nil, err
}
f, _, err := ggml.Decode(temp, 0)
f, _, err := ggml.Decode(temp, 1024)
if err != nil {
slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
return nil, err

View File

@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
if err == nil {
defer r.Close()
f, _, err := ggml.Decode(r, 0)
f, _, err := ggml.Decode(r, 1024)
if err == nil {
if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
capabilities = append(capabilities, model.CapabilityEmbedding)