llama: add minimum memory for grammar (#10820 )

ml: Panic rather than return error on tensor allocation failure
FromFloatSlice and FromIntSlice return an error if the shape doesn't match the passed data or if memory can't be allocated. Since these are inputs, the memory being allocated is system memory rather than VRAM. In many cases, the caller can't really handle the error and panics. Empty and Zeros directly panic if they can't allocate memory. This makes things consistent by panicing for the first two cases, removing a fair amount of error handling code. This is also consistent with how Go typically handles these situations.
2026-01-05 14:11:04 -05:00 · 2025-05-22 18:53:31 -07:00 · 2025-05-22 14:38:09 -07:00 · 2025-05-22 14:38:09 -07:00 · 2025-05-22 14:38:09 -07:00 · 2025-05-22 14:31:36 -07:00
69 changed files with 2026 additions and 1023 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,6 +51,8 @@ include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/include
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src/ggml-cpu/amx)

+add_compile_definitions(NDEBUG)
+
 set(GGML_CPU ON)
 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/ml/backend/ggml/ggml/src)
 set_property(TARGET ggml PROPERTY EXCLUDE_FROM_ALL TRUE)
--- a/README.md
+++ b/README.md
@@ -405,6 +405,7 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Writeopia](https://github.com/Writeopia/Writeopia) (Text editor with integration with Ollama)
 - [AppFlowy](https://github.com/AppFlowy-IO/AppFlowy) (AI collaborative workspace with Ollama, cross-platform and self-hostable)
 - [Lumina](https://github.com/cushydigit/lumina.git) (A lightweight, minimal React.js frontend for interacting with Ollama servers)
+- [Tiny Notepad](https://pypi.org/project/tiny-notepad) (A lightweight, notepad-like interface to chat with ollama available on PyPI)

 ### Cloud

--- a/cmd/cmd.go
+++ b/cmd/cmd.go
@@ -747,11 +747,38 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {
 				case float64:
 					v = fmt.Sprintf("%g", vData)
 				case []any:
-					n := 3
-					if len(vData) < n {
-						n = len(vData)
+					targetWidth := 10 // Small width where we are displaying the data in a column
+
+					var itemsToShow int
+					totalWidth := 1 // Start with 1 for opening bracket
+
+					// Find how many we can fit
+					for i := range vData {
+						itemStr := fmt.Sprintf("%v", vData[i])
+						width := runewidth.StringWidth(itemStr)
+
+						// Add separator width (", ") for all items except the first
+						if i > 0 {
+							width += 2
+						}
+
+						// Check if adding this item would exceed our width limit
+						if totalWidth+width > targetWidth && i > 0 {
+							break
+						}
+
+						totalWidth += width
+						itemsToShow++
+					}
+
+					// Format the output
+					if itemsToShow < len(vData) {
+						v = fmt.Sprintf("%v", vData[:itemsToShow])
+						v = strings.TrimSuffix(v, "]")
+						v += fmt.Sprintf(" ...+%d more]", len(vData)-itemsToShow)
+					} else {
+						v = fmt.Sprintf("%v", vData)
 					}
-					v = fmt.Sprintf("%v", vData[:n])
 				default:
 					v = fmt.Sprintf("%T", vData)
 				}
@@ -772,10 +799,19 @@ func showInfo(resp *api.ShowResponse, verbose bool, w io.Writer) error {

 	head := func(s string, n int) (rows [][]string) {
 		scanner := bufio.NewScanner(strings.NewReader(s))
-		for scanner.Scan() && (len(rows) < n || n < 0) {
-			if text := scanner.Text(); text != "" {
-				rows = append(rows, []string{"", strings.TrimSpace(text)})
+		count := 0
+		for scanner.Scan() {
+			text := strings.TrimSpace(scanner.Text())
+			if text == "" {
+				continue
 			}
+			count++
+			if n < 0 || count <= n {
+				rows = append(rows, []string{"", text})
+			}
+		}
+		if n >= 0 && count > n {
+			rows = append(rows, []string{"", "..."})
 		}
 		return
 	}
@@ -1200,11 +1236,11 @@ func checkServerHeartbeat(cmd *cobra.Command, _ []string) error {
 		return err
 	}
 	if err := client.Heartbeat(cmd.Context()); err != nil {
-		if !strings.Contains(err.Error(), " refused") {
+		if !(strings.Contains(err.Error(), " refused") || strings.Contains(err.Error(), "could not connect")) {
 			return err
 		}
 		if err := startApp(cmd.Context(), client); err != nil {
-			return errors.New("could not connect to ollama app, is it running?")
+			return fmt.Errorf("ollama server not responding - %w", err)
 		}
 	}
 	return nil
@@ -1282,7 +1318,7 @@ func NewCLI() *cobra.Command {
 	}

 	createCmd.Flags().StringP("file", "f", "", "Name of the Modelfile (default \"Modelfile\"")
-	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_0)")
+	createCmd.Flags().StringP("quantize", "q", "", "Quantize model to this level (e.g. q4_K_M)")

 	showCmd := &cobra.Command{
 		Use:     "show MODEL",
--- a/cmd/cmd_test.go
+++ b/cmd/cmd_test.go
@@ -225,6 +225,7 @@ Weigh anchor!
  System
    You are a pirate!    
    Ahoy, matey!         
+    ...                  

 `
 		if diff := cmp.Diff(expect, b.String()); diff != "" {
--- a/cmd/start_windows.go
+++ b/cmd/start_windows.go
@@ -4,17 +4,27 @@ import (
 	"context"
 	"errors"
 	"fmt"
+	"log/slog"
 	"os"
 	"os/exec"
+	"path"
 	"path/filepath"
 	"strings"
 	"syscall"
+	"unsafe"

 	"github.com/ollama/ollama/api"
+	"golang.org/x/sys/windows"
+)
+
+const (
+	Installer = "OllamaSetup.exe"
 )

 func startApp(ctx context.Context, client *api.Client) error {
-	// log.Printf("XXX Attempting to find and start ollama app")
+	if len(isProcRunning(Installer)) > 0 {
+		return fmt.Errorf("upgrade in progress...")
+	}
 	AppName := "ollama app.exe"
 	exe, err := os.Executable()
 	if err != nil {
@@ -56,3 +66,41 @@ func startApp(ctx context.Context, client *api.Client) error {
 	}
 	return waitForServer(ctx, client)
 }
+
+func isProcRunning(procName string) []uint32 {
+	pids := make([]uint32, 2048)
+	var ret uint32
+	if err := windows.EnumProcesses(pids, &ret); err != nil || ret == 0 {
+		slog.Debug("failed to check for running installers", "error", err)
+		return nil
+	}
+	pids = pids[:ret]
+	var matches []uint32
+	for _, pid := range pids {
+		if pid == 0 {
+			continue
+		}
+		hProcess, err := windows.OpenProcess(windows.PROCESS_QUERY_INFORMATION|windows.PROCESS_VM_READ, false, pid)
+		if err != nil {
+			continue
+		}
+		defer windows.CloseHandle(hProcess)
+		var module windows.Handle
+		var cbNeeded uint32
+		cb := (uint32)(unsafe.Sizeof(module))
+		if err := windows.EnumProcessModules(hProcess, &module, cb, &cbNeeded); err != nil {
+			continue
+		}
+		var sz uint32 = 1024 * 8
+		moduleName := make([]uint16, sz)
+		cb = uint32(len(moduleName)) * (uint32)(unsafe.Sizeof(uint16(0)))
+		if err := windows.GetModuleBaseName(hProcess, module, &moduleName[0], cb); err != nil && err != syscall.ERROR_INSUFFICIENT_BUFFER {
+			continue
+		}
+		exeFile := path.Base(strings.ToLower(syscall.UTF16ToString(moduleName)))
+		if strings.EqualFold(exeFile, procName) {
+			matches = append(matches, pid)
+		}
+	}
+	return matches
+}
--- a/convert/convert.go
+++ b/convert/convert.go
@@ -53,8 +53,11 @@ func (ModelParameters) KV(t *Tokenizer) ggml.KV {
 	}

 	for _, sv := range t.SpecialVocabulary {
-		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
 		kv[fmt.Sprintf("tokenizer.ggml.add_%s_token", sv.Key())] = sv.AddToken
+		kv[fmt.Sprintf("tokenizer.ggml.%s_token_id", sv.Key())] = uint32(sv.ID)
+		if len(sv.IDs) > 0 {
+			kv[fmt.Sprintf("tokenizer.ggml.%s_token_ids", sv.Key())] = sv.IDs
+		}
 	}

 	return kv
--- a/convert/convert_llama.go
+++ b/convert/convert_llama.go
@@ -139,7 +139,8 @@ func (p *llamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	}

 	for _, t := range ts {
-		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+		if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") ||
+			strings.HasSuffix(t.Name(), "attn_q_proj.weight") || strings.HasSuffix(t.Name(), "attn_k_proj.weight") {
 			if !p.skipRepack {
 				t.SetRepacker(p.repack)
 			}
@@ -181,9 +182,9 @@ func (p *llamaModel) repack(name string, data []float32, shape []uint64) ([]floa
 	}

 	var heads uint32
-	if strings.HasSuffix(name, "attn_q.weight") {
+	if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_q_proj.weight") {
 		heads = p.NumAttentionHeads
-	} else if strings.HasSuffix(name, "attn_k.weight") {
+	} else if strings.HasSuffix(name, "attn_k.weight") || strings.HasSuffix(name, "attn_k_proj.weight") {
 		heads = cmp.Or(p.NumKeyValueHeads, p.NumAttentionHeads)
 	} else {
 		return nil, fmt.Errorf("unknown tensor for repack: %s", name)
--- a/convert/convert_mllama.go
+++ b/convert/convert_mllama.go
@@ -94,7 +94,9 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 	var out []*ggml.Tensor
 	var text []Tensor
 	for _, t := range ts {
-		if t.Name() == "v.position_embd.gate" {
+		if !strings.HasPrefix(t.Name(), "v.") && !strings.HasPrefix(t.Name(), "mm.") {
+			text = append(text, t)
+		} else if t.Name() == "v.position_embd.gate" {
 			for _, name := range []string{"v.position_embd.gate", "v.tile_position_embd.gate"} {
 				tt := t.Clone()
 				tt.SetRepacker(m.repack(name))
@@ -105,23 +107,21 @@ func (m *mllamaModel) Tensors(ts []Tensor) []*ggml.Tensor {
 					WriterTo: tt,
 				})
 			}
-		} else if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
-			t.SetRepacker(m.repack(t.Name()))
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
-		} else if strings.HasPrefix(t.Name(), "v.") || strings.HasPrefix(t.Name(), "mm.") {
-			out = append(out, &ggml.Tensor{
-				Name:     t.Name(),
-				Kind:     t.Kind(),
-				Shape:    t.Shape(),
-				WriterTo: t,
-			})
 		} else {
-			text = append(text, t)
+			if t.Name() == "v.pre_tile_position_embd.gate" || t.Name() == "v.post_tile_position_embd.gate" {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_q.weight") || strings.HasSuffix(t.Name(), "attn_k.weight") {
+				t.SetRepacker(m.repack(t.Name()))
+			} else if strings.HasSuffix(t.Name(), "attn_gate") || strings.HasSuffix(t.Name(), "ffn_gate") {
+				t.SetRepacker(m.repack(t.Name()))
+			}
+
+			out = append(out, &ggml.Tensor{
+				Name:     t.Name(),
+				Kind:     t.Kind(),
+				Shape:    t.Shape(),
+				WriterTo: t,
+			})
 		}
 	}

@@ -137,16 +137,35 @@ func (m *mllamaModel) repack(name string) Repacker {

 		var t tensor.Tensor = tensor.New(tensor.WithShape(dims...), tensor.WithBacking(data))

-		t, err = tensor.Tanh(t)
-		if err != nil {
-			return nil, err
-		}
+		if strings.HasSuffix(name, "attn_q.weight") || strings.HasSuffix(name, "attn_k.weight") {
+			heads := m.VisionModel.AttentionHeads
+			if err := t.Reshape(append([]int{int(heads), 2, dims[0] / int(heads) / 2}, dims[1:]...)...); err != nil {
+				return nil, err
+			}

-		if name == "v.position_embd.gate" {
-			t, err = tensor.Sub(float32(1), t)
+			if err := t.T(0, 2, 1, 3); err != nil {
+				return nil, err
+			}
+
+			if err := t.Reshape(dims...); err != nil {
+				return nil, err
+			}
+
+			if err := t.Transpose(); err != nil {
+				return nil, err
+			}
+		} else {
+			t, err = tensor.Tanh(t)
 			if err != nil {
 				return nil, err
 			}
+
+			if name == "v.position_embd.gate" {
+				t, err = tensor.Sub(float32(1), t)
+				if err != nil {
+					return nil, err
+				}
+			}
 		}

 		t = tensor.Materialize(t)
--- a/convert/convert_test.go
+++ b/convert/convert_test.go
@@ -47,7 +47,7 @@ func convertFull(t *testing.T, fsys fs.FS) (*os.File, ggml.KV, ggml.Tensors) {
 	}
 	t.Cleanup(func() { r.Close() })

-	m, _, err := ggml.Decode(r, -1)
+	m, err := ggml.Decode(r, -1)
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -332,7 +332,7 @@ func TestConvertAdapter(t *testing.T) {
 			}
 			defer r.Close()

-			m, _, err := ggml.Decode(r, -1)
+			m, err := ggml.Decode(r, -1)
 			if err != nil {
 				t.Fatal(err)
 			}
--- a/convert/tokenizer.go
+++ b/convert/tokenizer.go
@@ -110,6 +110,7 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 	}

 	if f, err := fsys.Open("tokenizer_config.json"); errors.Is(err, os.ErrNotExist) {
+		// noop
 	} else if err != nil {
 		return nil, err
 	} else {
@@ -171,6 +172,34 @@ func parseTokenizer(fsys fs.FS, specialTokenTypes []string) (*Tokenizer, error)
 		}
 	}

+	if f, err := fsys.Open("generation_config.json"); errors.Is(err, os.ErrNotExist) {
+	} else if err != nil {
+		return nil, err
+	} else {
+		defer f.Close()
+
+		var p map[string]json.RawMessage
+		if err := json.NewDecoder(f).Decode(&p); err != nil {
+			return nil, err
+		}
+
+		for _, st := range specialTokenTypes {
+			if bts, ok := p[fmt.Sprintf("%s_token_id", st)]; ok {
+				var ids []int32
+				if err := json.Unmarshal(bts, &ids); err != nil {
+					// value is not a list so the existing ID is used
+					continue
+				}
+
+				if i := slices.IndexFunc(t.SpecialVocabulary, func(sv *SpecialVocabulary) bool {
+					return sv.Type == st
+				}); i >= 0 {
+					t.SpecialVocabulary[i].IDs = ids
+				}
+			}
+		}
+	}
+
 	return t, nil
 }

@@ -280,6 +309,9 @@ type SpecialVocabulary struct {
 	ID       int
 	Content  string
 	AddToken bool
+
+	// IDs is populated by generation_config.json
+	IDs []int32
 }

 func (sv SpecialVocabulary) Key() string {
--- a/convert/tokenizer_test.go
+++ b/convert/tokenizer_test.go
@@ -247,6 +247,67 @@ func TestParseTokenizer(t *testing.T) {
 				Pre: "default",
 			},
 		},
+		{
+			name: "generation config eos token ids",
+			fsys: createTokenizerFS(t, t.TempDir(), map[string]io.Reader{
+				"tokenizer.json": strings.NewReader(`{
+					"added_tokens": [
+						{
+							"id": 0,
+							"content": "<bos>",
+							"special": true
+						},
+						{
+							"id": 1,
+							"content": "<eos>",
+							"special": true
+						},
+						{
+							"id": 2,
+							"content": "<eot>",
+							"special": true
+						},
+						{
+							"id": 3,
+							"content": "<eom>",
+							"special": true
+						}
+					],
+					"model": {
+						"vocab": {
+							"<bos>": 0,
+							"<eos>": 1,
+							"<eot>": 2,
+							"<eom>": 3
+						}
+					}
+				}`),
+				"tokenizer_config.json": strings.NewReader(`{
+					"add_bos_token": true,
+					"add_eos_token": false,
+					"bos_token": "<bos>",
+					"eos_token": "<eos>"
+				}`),
+				"generation_config.json": strings.NewReader(`{
+					"bos_token_id": 0,
+					"eos_token_id": [1, 2, 3]
+				}`),
+			}),
+			specialTokenTypes: []string{"pad", "eos", "bos", "unk"},
+			want: &Tokenizer{
+				Vocabulary: &Vocabulary{
+					Model:  "gpt2",
+					Tokens: []string{"<bos>", "<eos>", "<eot>", "<eom>"},
+					Scores: []float32{0, 1, 2, 3},
+					Types:  []int32{3, 3, 3, 3},
+				},
+				SpecialVocabulary: []*SpecialVocabulary{
+					{Type: "eos", Content: "<eos>", ID: 1, IDs: []int32{1, 2, 3}, AddToken: false},
+					{Type: "bos", Content: "<bos>", ID: 0, AddToken: true},
+				},
+				Pre: "default",
+			},
+		},
 	}

 	for _, tt := range cases {
--- a/fs/ggml/ggml.go
+++ b/fs/ggml/ggml.go
@@ -6,7 +6,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"math"
 	"slices"
 	"strings"

@@ -16,6 +15,7 @@ import (
 type GGML struct {
 	container
 	model
+	Length int64
 }

 type model interface {
@@ -387,12 +387,12 @@ func DetectContentType(b []byte) string {
 //
 // It collects array values for arrays with a size less than or equal to
 // maxArraySize. If the maxArraySize is negative, all arrays are collected.
-func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
+func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, error) {
 	rs = bufioutil.NewBufferedSeeker(rs, 32<<10)

 	var magic uint32
 	if err := binary.Read(rs, binary.LittleEndian, &magic); err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	var c container
@@ -402,24 +402,25 @@ func Decode(rs io.ReadSeeker, maxArraySize int) (*GGML, int64, error) {
 	case FILE_MAGIC_GGUF_BE:
 		c = &containerGGUF{ByteOrder: binary.BigEndian, maxArraySize: maxArraySize}
 	default:
-		return nil, 0, errors.New("invalid file magic")
+		return nil, errors.New("invalid file magic")
 	}

 	model, err := c.Decode(rs)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	offset, err := rs.Seek(0, io.SeekCurrent)
 	if err != nil {
-		return nil, 0, err
+		return nil, err
 	}

 	// final model type
 	return &GGML{
 		container: c,
 		model:     model,
-	}, offset, nil
+		Length:    offset,
+	}, nil
 }

 func (f GGML) GraphSize(context, batch uint64, numParallel int, kvCacheType string) (kv []uint64, partialOffload, fullOffload uint64) {
@@ -653,24 +654,15 @@ func (llm GGML) VisionGraphSize() (weights, graphSize uint64) {
 			numPatches*numPatches*headCount)
 	case "qwen25vl":
 		maxPixels := uint64(llm.KV().Uint("vision.max_pixels", 28*28*1280))
-		mergeSize := uint64(llm.KV().Uint("vision.spatial_merge_size", 2))
-		temporalPatchSize := uint64(2)

-		// Calculate max possible patches based on max_pixels
-		maxHeight := uint64(math.Sqrt(float64(maxPixels)))
-		maxWidth := maxPixels / maxHeight
-		maxGridHeight := maxHeight / patchSize
-		maxGridWidth := maxWidth / patchSize
-		// Account for merged patches (2x2 grid)
-		numPatches := (maxGridHeight * maxGridWidth) / (mergeSize * mergeSize)
+		numPatches := maxPixels / (patchSize * patchSize)

-		// Calculate graph size based on typical operations in ProcessImage and createPatches
 		graphSize = 4 * (maxPixels*numChannels + // Original image storage
 			// Normalized pixels
 			maxPixels*numChannels +
-			// Patches storage (numPatches * channels * temporalPatchSize * patchSize^2)
-			numPatches*numChannels*temporalPatchSize*patchSize*patchSize +
-			// Self-attention calculations (similar to other architectures)
+			// Patches storage (numPatches * channels * patchSize^2)
+			numPatches*numChannels*patchSize*patchSize +
+			// Self-attention calculations
 			numPatches*numPatches*headCount +
 			// Additional buffer for processing
 			embeddingLength*numPatches)
--- a/fs/ggml/gguf_test.go
+++ b/fs/ggml/gguf_test.go
@@ -35,7 +35,7 @@ func TestWriteGGUF(t *testing.T) {
 	}
 	defer r.Close()

-	ff, _, err := Decode(r, 0)
+	ff, err := Decode(r, 0)
 	if err != nil {
 		t.Fatal(err)
 	}
--- a/integration/llm_image_test.go
+++ b/integration/llm_image_test.go
@@ -19,7 +19,7 @@ func TestVisionModels(t *testing.T) {
 	}
 	testCases := []testCase{
 		{
-			model: "llava:7b",
+			model: "qwen2.5vl",
 		},
 		{
 			model: "llama3.2-vision",
@@ -60,6 +60,7 @@ func TestVisionModels(t *testing.T) {
 }

 func TestIntegrationSplitBatch(t *testing.T) {
+	skipUnderMinVRAM(t, 6)
 	image, err := base64.StdEncoding.DecodeString(imageEncoding)
 	require.NoError(t, err)
 	req := api.GenerateRequest{
--- a/kvcache/causal.go
+++ b/kvcache/causal.go
@@ -211,10 +211,9 @@ func (c *Causal) StartForward(ctx ml.Context, batch input.Batch, reserve bool) e
 		c.curCellRange.max = len(c.cells) - 1
 	}

-	var err error
-	c.curMask, err = c.buildMask(ctx)
+	c.curMask = c.buildMask(ctx)

-	return err
+	return nil
 }

 func newRange() cellRange {
@@ -297,7 +296,7 @@ func roundUp(length, pad int) int {
 // Builds a mask of history x batch indicating whether for each token in the batch the
 // token in the history should apply. This is based on both the sequence and causality (the
 // position of the history is not ahead of the token in the batch).
-func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
+func (c *Causal) buildMask(ctx ml.Context) ml.Tensor {
 	// Align and pad the two dimensions as required by the backend
 	batchSize := roundUp(c.curBatchSize, c.config.MaskBatchPadding)

@@ -325,10 +324,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		mask[i] = float32(math.Inf(-1))
 	}

-	maskTensor, err := ctx.Input().FromFloatSlice(mask, length, batchSize)
-	if err != nil {
-		return nil, err
-	}
+	maskTensor := ctx.Input().FromFloatSlice(mask, length, batchSize)

 	if c.config.MaskDType != ml.DTypeF32 {
 		out := ctx.Input().Empty(c.config.MaskDType, maskTensor.Shape()...)
@@ -336,7 +332,7 @@ func (c *Causal) buildMask(ctx ml.Context) (ml.Tensor, error) {
 		maskTensor = out
 	}

-	return maskTensor, nil
+	return maskTensor
 }

 func (c *Causal) moveCells(ctx ml.Context, src, dst, length int) {
@@ -491,12 +487,7 @@ func (c *Causal) SetCausal(ctx ml.Context, opts CausalOptions) {
 	if !slices.Equal(c.opts.Except, opts.Except) {
 		c.opts = opts
 		if ctx != nil {
-			var err error
-			c.curMask, err = c.buildMask(ctx)
-			if err != nil {
-				// This error should never occur because we have previously built a mask with the same shape
-				panic(fmt.Errorf("SetCausal: %w", err))
-			}
+			c.curMask = c.buildMask(ctx)
 		}
 	}
 }
@@ -652,10 +643,7 @@ func (c *Causal) shift(seq int, beginIndex, offset int32) error {
 		}
 	}

-	kShift, err := ctx.Input().FromIntSlice(offsets, len(offsets))
-	if err != nil {
-		return err
-	}
+	kShift := ctx.Input().FromIntSlice(offsets, len(offsets))

 	for i, key := range c.keys {
 		if key == nil {
--- a/kvcache/causal_test.go
+++ b/kvcache/causal_test.go
@@ -344,7 +344,7 @@ func testCache(t *testing.T, backend ml.Backend, cache Cache, tests []testCase)
 			}

 			cache.SetLayer(0)
-			tensor, _ := context.FromFloatSlice(test.in, test.inShape...)
+			tensor := context.FromFloatSlice(test.in, test.inShape...)
 			cache.Put(context, tensor, tensor)

 			out, _, mask := cache.Get(context)
@@ -386,7 +386,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor, _ := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
+	tensor := context.FromFloatSlice([]float32{1, 2, 3, 4}, 1, 1, 4)
 	cache.Put(context, tensor, tensor)

 	// with window size 4, nothing has slid out of the window yet
@@ -413,7 +413,7 @@ func TestCanResume(t *testing.T) {
 	}

 	cache.SetLayer(0)
-	tensor, _ = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
+	tensor = context.FromFloatSlice([]float32{5, 6}, 1, 1, 2)
 	cache.Put(context, tensor, tensor)

 	// only the latest position has overlapping windows
@@ -470,24 +470,24 @@ func (c *testContext) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
 	return c.Empty(dtype, shape...)
 }

-func (c *testContext) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
 	t := c.Empty(ml.DTypeF32, shape...).(*testTensor)

 	copy(t.data, s)

-	return t, nil
+	return t
 }

-func (c *testContext) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
+func (c *testContext) FromIntSlice(s []int32, shape ...int) ml.Tensor {
 	f := make([]float32, len(s))
 	for i := range f {
 		f[i] = float32(s[i])
 	}

-	out, _ := c.FromFloatSlice(f, shape...)
+	out := c.FromFloatSlice(f, shape...)
 	out.(*testTensor).dtype = ml.DTypeI32

-	return out, nil
+	return out
 }

 func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -496,7 +496,7 @@ func (c *testContext) Arange(start, stop, step float32, dtype ml.DType) ml.Tenso
 		s = append(s, i)
 	}

-	out, _ := c.FromFloatSlice(s, len(s))
+	out := c.FromFloatSlice(s, len(s))
 	out.(*testTensor).dtype = dtype
 	return out
 }
@@ -508,7 +508,7 @@ func (c *testContext) Forward(...ml.Tensor) ml.Context { return c }

 func (c *testContext) Compute(...ml.Tensor) {}

-func (c *testContext) Reserve() error { return nil }
+func (c *testContext) Reserve() {}

 func (c *testContext) MaxGraphNodes() int {
 	return 10
--- a/llama/llama.go
+++ b/llama/llama.go
@@ -544,7 +544,7 @@ func NewSamplingContext(model *Model, params SamplingParams) (*SamplingContext,
 	cparams.penalty_last_n = C.int32_t(params.RepeatLastN)
 	cparams.penalty_repeat = C.float(params.PenaltyRepeat)
 	cparams.penalty_freq = C.float(params.PenaltyFreq)
-	cparams.penalty_present = C.float(params.PenaltyFreq)
+	cparams.penalty_present = C.float(params.PenaltyPresent)
 	cparams.seed = C.uint32_t(params.Seed)

 	grammar := C.CString(params.Grammar)
@@ -580,7 +580,7 @@ func SchemaToGrammar(schema []byte) []byte {
 	defer C.free(unsafe.Pointer(cStr))

 	// Allocate buffer for grammar based on schema length but with upper bound
-	maxLen := min(1024*1024, len(schema)*4)
+	maxLen := max(32768, min(1024*1024, len(schema)*4))
 	buf := make([]byte, maxLen)

 	// Call C function to convert schema to grammar
@@ -602,7 +602,7 @@ type Grammar struct {
 	mu sync.Mutex
 }

-func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []uint32) *Grammar {
+func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogTokens []int32) *Grammar {
 	cGrammar := C.CString(grammar)
 	defer C.free(unsafe.Pointer(cGrammar))

@@ -622,7 +622,7 @@ func NewGrammar(grammar string, vocabIds []uint32, vocabValues []string, eogToke
 		cEogTokens[i] = C.uint32_t(token)
 	}

-	g := C.grammar_init(cGrammar, (*C.uint32_t)(unsafe.Pointer(&cTokens[0])), C.size_t(len(cTokens)), (**C.char)(unsafe.Pointer(&cPieces[0])), (*C.uint32_t)(unsafe.Pointer(&cEogTokens[0])), C.size_t(len(cEogTokens)))
+	g := C.grammar_init(cGrammar, unsafe.SliceData(cTokens), C.size_t(len(cTokens)), unsafe.SliceData(cPieces), unsafe.SliceData(cEogTokens), C.size_t(len(cEogTokens)))
 	if g == nil {
 		return nil
 	}
--- a/llama/patches/0016-graph-memory-reporting-on-failure.patch
+++ b/llama/patches/0016-graph-memory-reporting-on-failure.patch
@@ -0,0 +1,156 @@
+From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
+From: Jesse Gross <jesse@ollama.com>
+Date: Fri, 18 Apr 2025 15:58:19 -0700
+Subject: [PATCH] graph memory reporting on failure
+
+---
+ ggml/include/ggml-alloc.h   |  6 ++++++
+ ggml/include/ggml-backend.h |  6 ++++++
+ ggml/src/ggml-alloc.c       | 38 +++++++++++++++++++++++++++++++++----
+ ggml/src/ggml-backend.cpp   | 10 ++++++++++
+ 4 files changed, 56 insertions(+), 4 deletions(-)
+
+diff --git a/ggml/include/ggml-alloc.h b/ggml/include/ggml-alloc.h
+index 2cb150fd..781b1e10 100644
+--- a/ggml/include/ggml-alloc.h
+++ b/ggml/include/ggml-alloc.h
+@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph
+ 
+ GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+ 
+struct ggml_allocr_buffer_status {
+    size_t size;
+    bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
+ // Utils
+ // Create a buffer and allocate all the tensors in a ggml_context
+ GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
+index 778927f6..74e46716 100644
+--- a/ggml/include/ggml-backend.h
+++ b/ggml/include/ggml-backend.h
+@@ -304,6 +304,12 @@ extern "C" {
+ 
+     GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+ 
+    struct ggml_backend_buffer_status {
+        size_t size;
+        bool allocated;
+    };
+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
+     GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
+     GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
+ 
+diff --git a/ggml/src/ggml-alloc.c b/ggml/src/ggml-alloc.c
+index 5fd379f6..04812990 100644
+--- a/ggml/src/ggml-alloc.c
+++ b/ggml/src/ggml-alloc.c
+@@ -364,6 +364,7 @@ struct node_alloc {
+ struct ggml_gallocr {
+     ggml_backend_buffer_type_t * bufts; // [n_buffers]
+     ggml_backend_buffer_t * buffers; // [n_buffers]
+    size_t *buffer_sizes; // [n_buffers]
+     struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+     int n_buffers;
+ 
+@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
+     galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
+     GGML_ASSERT(galloc->buffers != NULL);
+ 
+    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+    GGML_ASSERT(galloc->buffer_sizes != NULL);
+
+     galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
+     GGML_ASSERT(galloc->buf_tallocs != NULL);
+ 
+@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
+     ggml_hash_set_free(&galloc->hash_set);
+     free(galloc->hash_values);
+     free(galloc->bufts);
+    free(galloc->buffer_sizes);
+     free(galloc->buffers);
+     free(galloc->buf_tallocs);
+     free(galloc->node_allocs);
+@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+         }
+     }
+ 
+    bool success = true;
+
+     // reallocate buffers if needed
+     for (int i = 0; i < galloc->n_buffers; i++) {
+         // if the buffer type is used multiple times, we reuse the same buffer
+@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
+ 
+             ggml_backend_buffer_free(galloc->buffers[i]);
+             galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+-            if (galloc->buffers[i] == NULL) {
+            if (galloc->buffers[i]) {
+                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            } else {
+                 GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+-                return false;
+                galloc->buffer_sizes[i] = new_size;
+                success = false;
+             }
+-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+        } else {
+            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+         }
+     }
+ 
+-    return true;
+    return success;
+ }
+ 
+ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
+@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+     return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
+ }
+ 
+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+            // This buffer is the same as a previous one due to the same buffer type being used multiple times
+            // (See above.) However, we need a different check because multiple buffers might be NULL in our
+            // case and we still want to know the attempted size.
+
+            struct ggml_allocr_buffer_status status = {0, true};
+            return status;
+        }
+    }
+
+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+    return status;
+}
+
+ // utils
+ 
+ static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
+diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
+index 0ce73a99..be335e8c 100644
+--- a/ggml/src/ggml-backend.cpp
+++ b/ggml/src/ggml-backend.cpp
+@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
+     return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
+ }
+ 
+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+    return status;
+}
+
+ void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
+     int backend_index = ggml_backend_sched_backend_id(sched, backend);
+     GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/llm/memory.go
+++ b/llm/memory.go
@@ -1,12 +1,9 @@
 package llm

 import (
-	"cmp"
 	"fmt"
 	"log/slog"
-	"maps"
 	"os"
-	"slices"
 	"strconv"
 	"strings"

@@ -85,8 +82,11 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	var graphOffload uint64

 	// Projectors loaded into GPU0 only
-	var projectorWeights uint64
-	var projectorGraph uint64
+	var llamaEngineProjectorWeights uint64
+
+	// Projectors loaded with output layer
+	var ollamaEngineProjectorWeights uint64
+	var ollamaEngineProjectorGraph uint64

 	// Conditional output size on GPU 0
 	var memoryLayerOutput uint64
@@ -111,21 +111,23 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	slog.Debug("evaluating", "library", gpus[0].Library, "gpu_count", len(gpus), "available", availableList)

 	for _, projector := range projectors {
-		weight := projectorMemoryRequirements(projector)
-		projectorWeights += weight
+		llamaEngineProjectorWeights += projectorMemoryRequirements(projector)

 		// multimodal models require at least 2048 context
 		opts.NumCtx = max(opts.NumCtx, 2048)
 	}
-	if projectorWeights == 0 && projectorGraph == 0 {
-		projectorWeights, projectorGraph = f.VisionGraphSize()
+	if llamaEngineProjectorWeights == 0 {
+		ollamaEngineProjectorWeights, ollamaEngineProjectorGraph = f.VisionGraphSize()
+		opts.NumCtx = max(opts.NumCtx, 2048)
 	}

 	layers := f.Tensors().GroupLayers()
-	// add one layer (chosing the max layer) worth of memory as a buffer
-	layerSize = slices.MaxFunc(slices.Collect(maps.Values(layers)), func(a, b ggml.Layer) int {
-		return cmp.Compare(a.Size(), b.Size())
-	}).Size()
+	// add one layer worth of memory as a buffer
+	if blk0, ok := layers["blk.0"]; ok {
+		layerSize = blk0.Size()
+	} else {
+		slog.Warn("model missing blk.0 layer size")
+	}

 	var kvct string
 	if envconfig.FlashAttention() &&
@@ -163,6 +165,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		graphFullOffload = graphPartialOffload
 	}

+	// Output layer handled at the end if we have space
 	if layer, ok := layers["output_norm"]; ok {
 		memoryLayerOutput += layer.Size()
 	}
@@ -172,8 +175,7 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput += layer.Size()
 	}

-	// Output layer handled at the end if we have space
-	gpuZeroOverhead := projectorWeights + projectorGraph
+	gpuZeroOverhead := llamaEngineProjectorWeights

 	// Reduce set of GPUs to only those that have sufficient space to fit overhead and at least one layer
 	var layerCount int
@@ -216,6 +218,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	if len(gpusWithSpace) > 0 {
 		gpuZeroID = gpusWithSpace[0].i
 		gpuAllocations[gpuZeroID] += gpuZeroOverhead
+	} else {
+		overflow += gpuZeroOverhead
 	}

 	// For all the layers, find where they can fit on the GPU(s)
@@ -256,21 +260,24 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 	}

 	// Determine if we need to consider output then find where it fits
-	if memoryLayerOutput > 0 && (opts.NumGPU < 0 || layerCount < opts.NumGPU) {
-		for j := len(gpusWithSpace); j > 0; j-- {
-			g := gpusWithSpace[layerCount%j]
-			used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
-			if g.g.FreeMemory > overhead+used+memoryLayerOutput {
-				gpuAllocations[g.i] += memoryLayerOutput
-				layerCounts[g.i]++
-				layerCount++
-				break
+	memoryLastLayer := memoryLayerOutput + ollamaEngineProjectorWeights + ollamaEngineProjectorGraph
+	if memoryLastLayer > 0 {
+		if opts.NumGPU < 0 || layerCount < opts.NumGPU {
+			for j := len(gpusWithSpace); j > 0; j-- {
+				g := gpusWithSpace[layerCount%j]
+				used := gpuAllocations[g.i] + max(graphPartialOffload, graphFullOffload)
+				if g.g.FreeMemory > overhead+used+memoryLastLayer {
+					gpuAllocations[g.i] += memoryLastLayer
+					layerCounts[g.i]++
+					layerCount++
+					break
+				}
 			}
 		}

 		if layerCount < int(f.KV().BlockCount())+1 {
 			fullyLoaded = false
-			overflow += memoryLayerOutput
+			overflow += memoryLastLayer
 		}
 	}

@@ -328,8 +335,8 @@ func EstimateGPULayers(gpus []discover.GpuInfo, f *ggml.GGML, projectors []strin
 		memoryLayerOutput:   memoryLayerOutput,
 		graphFullOffload:    graphFullOffload,
 		graphPartialOffload: graphPartialOffload,
-		projectorWeights:    projectorWeights,
-		projectorGraph:      projectorGraph,
+		projectorWeights:    llamaEngineProjectorWeights + ollamaEngineProjectorWeights,
+		projectorGraph:      ollamaEngineProjectorGraph,
 	}

 	if gpus[0].Library == "cpu" {
@@ -415,7 +422,7 @@ func projectorMemoryRequirements(filename string) (weights uint64) {
 	}
 	defer file.Close()

-	ggml, _, err := ggml.Decode(file, 1024)
+	ggml, err := ggml.Decode(file, 1024)
 	if err != nil {
 		return 0
 	}
--- a/llm/server.go
+++ b/llm/server.go
@@ -121,7 +121,7 @@ func LoadModel(model string, maxArraySize int) (*ggml.GGML, error) {
 	}
 	defer f.Close()

-	ggml, _, err := ggml.Decode(f, maxArraySize)
+	ggml, err := ggml.Decode(f, maxArraySize)
 	return ggml, err
 }

--- a/ml/backend.go
+++ b/ml/backend.go
@@ -6,7 +6,6 @@ import (
 	"encoding/binary"
 	"fmt"
 	"math"
-	"os"
 	"slices"
 	"strconv"
 	"strings"
@@ -15,6 +14,11 @@ import (
 )

 type Backend interface {
+	Load(ctx context.Context, progress func(float32)) error
+
+	// BackendMemory returns the memory allocations that were made for this model
+	BackendMemory() BackendMemory
+
 	Config() fs.Config
 	Get(name string) Tensor
 	NewContext() Context
@@ -52,10 +56,6 @@ type CacheConfig struct {

 // BackendParams controls how the backend loads and executes models
 type BackendParams struct {
-	// Progress is a callback function that allows reporting percentage completion
-	// of model loading
-	Progress func(float32)
-
 	// NumThreads sets the number of threads to use if running on the CPU
 	NumThreads int

@@ -72,9 +72,87 @@ type BackendParams struct {
 	FlashAttention bool
 }

-var backends = make(map[string]func(context.Context, *os.File, BackendParams) (Backend, error))
+// ErrNoMem is returned when panicing due to insufficient memory. It includes
+// the attempted memory allocation.
+type ErrNoMem struct {
+	BackendMemory
+}

-func RegisterBackend(name string, f func(context.Context, *os.File, BackendParams) (Backend, error)) {
+func (e ErrNoMem) Error() string {
+	return fmt.Sprintf("insufficient memory - required allocations: %+v", e.BackendMemory)
+}
+
+type AllocationStatus int
+
+const (
+	// Unallocated memory - have not yet attempted to allocate
+	Unallocated AllocationStatus = iota
+
+	// Failed memory - tried to allocate the memory and did not succeed
+	Failed
+
+	// Allocated memory = tried and succeeded to allocate memory
+	Allocated
+)
+
+// Memory is the size of an allocation and whether it was successful.
+type Memory struct {
+	Size   uint64
+	Status AllocationStatus
+}
+
+func (m Memory) String() string {
+	s := fmt.Sprint(m.Size)
+
+	switch m.Status {
+	case Unallocated:
+		s += "U"
+	case Failed:
+		s += "F"
+	case Allocated:
+		s += "A"
+	}
+
+	return s
+}
+
+// DeviceMemory provides a breakdown of the memory needed
+// per device, such as a CPU or GPU.
+type DeviceMemory struct {
+	// Name is the name of the device as labeled by the backend. It
+	// may not be persistent across instances of the runner.
+	Name string
+
+	// Weights is the per-layer memory needed for the model weights.
+	Weights []Memory
+
+	// Cache is the per-layer memory needed for the KV cache.
+	Cache []Memory
+
+	// Graph is the size of the compute graph. It is not per-layer.
+	Graph Memory
+}
+
+// BackendMemory provides the amount of memory required to load the model
+// per device based on the BackendParams. In some cases, not all required
+// allocations will be known at this point. However, the size of the most recent
+// allocation is guaranteed to be provided so that if it failed, the caller can
+// accommodate that to make forward progress.
+type BackendMemory struct {
+	// InputsWeights are always located on the CPU and cannot be moved
+	InputWeights Memory
+
+	// CPU model components are located in system memory. This does not
+	// include unified memory allocated through the GPU.
+	CPU DeviceMemory
+
+	// GPU model components are located on one or more GPUs.
+	GPUs []DeviceMemory
+}
+
+var backends = make(map[string]func(string, BackendParams) (Backend, error))
+
+func RegisterBackend(name string, f func(string, BackendParams) (Backend, error)) {
 	if _, ok := backends[name]; ok {
 		panic("backend: backend already registered")
 	}
@@ -82,9 +160,9 @@ func RegisterBackend(name string, f func(context.Context, *os.File, BackendParam
 	backends[name] = f
 }

-func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend, error) {
+func NewBackend(modelPath string, params BackendParams) (Backend, error) {
 	if backend, ok := backends["ggml"]; ok {
-		return backend(ctx, f, params)
+		return backend(modelPath, params)
 	}

 	return nil, fmt.Errorf("unsupported backend")
@@ -93,8 +171,8 @@ func NewBackend(ctx context.Context, f *os.File, params BackendParams) (Backend,
 type Context interface {
 	Empty(dtype DType, shape ...int) Tensor
 	Zeros(dtype DType, shape ...int) Tensor
-	FromFloatSlice(s []float32, shape ...int) (Tensor, error)
-	FromIntSlice(s []int32, shape ...int) (Tensor, error)
+	FromFloatSlice(s []float32, shape ...int) Tensor
+	FromIntSlice(s []int32, shape ...int) Tensor

 	// Arange creates a 1D tensor with values within an interval (start, stop] increased by step.
 	Arange(start, stop, step float32, dtype DType) Tensor
@@ -106,7 +184,7 @@ type Context interface {
 	// graph, simply preallocates memory. Typically called with a
 	// worst case graph to ensure all resources are available for
 	// for future inference.
-	Reserve() error
+	Reserve()

 	MaxGraphNodes() int
 	Close()
@@ -119,21 +197,6 @@ type Context interface {
 	Layer(int) Context
 }

-// RopeOptions contains optional parameters for RoPE function
-type RopeOptions struct {
-	OriginalContextLen uint32
-}
-
-// RopeOption defines a function that modifies RopeOpts
-type RopeOption func(*RopeOptions)
-
-// WithContextLen sets a custom context length
-func WithContextLen(len uint32) RopeOption {
-	return func(opts *RopeOptions) {
-		opts.OriginalContextLen = len
-	}
-}
-
 type Tensor interface {
 	Dim(n int) int
 	Stride(n int) int
@@ -147,6 +210,8 @@ type Tensor interface {
 	Neg(ctx Context) Tensor
 	Add(ctx Context, t2 Tensor) Tensor
 	Mul(ctx Context, t2 Tensor) Tensor
+	Div(ctx Context, t2 Tensor) Tensor
+
 	Mulmat(ctx Context, t2 Tensor) Tensor
 	MulmatFullPrec(ctx Context, t2 Tensor) Tensor
 	MulmatID(ctx Context, t2, ids Tensor) Tensor
@@ -155,11 +220,11 @@ type Tensor interface {
 	LayerNorm(ctx Context, weight, bias Tensor, eps float32) Tensor
 	RMSNorm(ctx Context, weight Tensor, eps float32) Tensor
 	Scale(ctx Context, s float64) Tensor
+	SumRows(ctx Context) Tensor

 	AvgPool2D(ctx Context, k, s int, p float32) Tensor
 	Conv2D(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

-	RoPE(ctx Context, positionIDs, ropeFactors Tensor, dim, ropeType uint32, base, scale float32, options ...RopeOption) Tensor
 	IM2Col(ctx Context, weight Tensor, s0, s1, p0, p1, d0, d1 int) Tensor

 	Sin(ctx Context) Tensor
--- a/ml/backend/ggml/ggml.go
+++ b/ml/backend/ggml/ggml.go
@@ -10,7 +10,6 @@ import "C"

 import (
 	"context"
-	"errors"
 	"fmt"
 	"io"
 	"log/slog"
@@ -30,6 +29,7 @@ import (
 	"github.com/ollama/ollama/logutil"
 	"github.com/ollama/ollama/ml"
 	ggml "github.com/ollama/ollama/ml/backend/ggml/ggml/src"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"golang.org/x/sync/errgroup"
 )

@@ -44,8 +44,15 @@ func devices() []*C.struct_ggml_backend_device {
 }

 type Backend struct {
+	// modelPath is the location of the model data
+	modelPath string
+
 	meta *fsggml.GGML

+	// tensorLoadTargets maps from the name of the tensor in the file
+	// to the name that is used by the model definition
+	tensorLoadTargets map[string][]string
+
 	sched         *C.struct_ggml_backend_sched
 	schedBackends []*C.struct_ggml_backend
 	schedBufts    []*C.struct_ggml_backend_buffer_type
@@ -58,14 +65,26 @@ type Backend struct {
 	// layers is the backend used for repeating layers
 	layers map[int]*C.struct_ggml_backend_buffer_type

+	// requiredMemory is the cumulative memory allocations needed by the backend
+	requiredMemory *ml.BackendMemory
+
+	// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
+	btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
+
 	flashAttention bool

 	// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
 	maxGraphNodes int
 }

-func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend, error) {
-	meta, n, err := fsggml.Decode(r, -1)
+func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
+	r, err := os.Open(modelPath)
+	if err != nil {
+		return nil, err
+	}
+	defer r.Close()
+
+	meta, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -80,6 +99,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		"num_key_values", len(meta.KV()),
 	)

+	var requiredMemory ml.BackendMemory
+	btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
+
 	type deviceBufferType struct {
 		d   *C.struct_ggml_backend_device
 		bts []*C.struct_ggml_backend_buffer_type
@@ -100,6 +122,8 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}
 	}

+	blocks := int(meta.KV().BlockCount())
+
 	// create list of buffer types for the cpu
 	cpuDeviceBufferType := deviceBufferType{d: C.ggml_backend_dev_by_type(C.GGML_BACKEND_DEVICE_TYPE_CPU)}
 	for _, d := range append(accels, append(gpus, cpus...)...) {
@@ -107,17 +131,27 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		case C.GGML_BACKEND_DEVICE_TYPE_CPU,
 			C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
 			cpuDeviceBufferType.bts = append(cpuDeviceBufferType.bts, C.ggml_backend_dev_buffer_type(d))
+			btDeviceMemory[C.ggml_backend_dev_buffer_type(d)] = &requiredMemory.CPU
 		}
 	}

+	requiredMemory.CPU.Name = C.GoString(C.ggml_backend_dev_name(cpuDeviceBufferType.d))
+	requiredMemory.CPU.Weights = make([]ml.Memory, blocks+1)
+	requiredMemory.CPU.Cache = make([]ml.Memory, blocks+1)
+
 	// create list of buffer types for each gpu
 	var gpuDeviceBufferTypes []deviceBufferType
-	for _, d := range gpus {
+	requiredMemory.GPUs = make([]ml.DeviceMemory, len(gpus))
+	for i, d := range gpus {
 		bt := C.ggml_backend_dev_buffer_type(d)
 		gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
 			d:   d,
 			bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
 		})
+		btDeviceMemory[bt] = &requiredMemory.GPUs[i]
+		requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
+		requiredMemory.GPUs[i].Weights = make([]ml.Memory, blocks+1)
+		requiredMemory.GPUs[i].Cache = make([]ml.Memory, blocks+1)
 	}

 	useDefaultSplit := true
@@ -156,8 +190,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	// inputs always use cpu
 	input := cpuDeviceBufferType

-	blocks := int(meta.KV().BlockCount())
-
 	// define a range of gpu layers. anything outside of this range is assigned to the cpu
 	gpuRangeStart := max(0, blocks-params.NumGPULayers)
 	gpuRangeStop := min(gpuRangeStart+params.NumGPULayers, blocks+1)
@@ -198,7 +230,7 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

 	// contexts are shared by tensors of the same buffer type
 	ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
-	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type) *C.struct_ggml_tensor {
+	createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
 		for _, bt := range bts {
 			if _, ok := ctxs[bt]; !ok {
 				ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
@@ -224,6 +256,16 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			C.ggml_set_name(tt, cname)

 			slog.Log(context.TODO(), logutil.LevelTrace, "created tensor", "name", name, "shape", t.source.Shape, "dtype", t.source.Kind, "buffer_type", C.GoString(C.ggml_backend_buft_name(bt)))
+
+			size := pad(C.ggml_backend_buft_get_alloc_size(bt, tt), C.ggml_backend_buft_get_alignment(bt))
+			if layer == -1 {
+				// Assume that InputWeights can be allocated - they're always in system memory and can't be moved in any case
+				requiredMemory.InputWeights.Status = ml.Allocated
+				requiredMemory.InputWeights.Size += uint64(size)
+			} else {
+				btDeviceMemory[bt].Weights[layer].Size += uint64(size)
+			}
+
 			//nolint:staticcheck // TODO: check if buffer type supports this tensor
 			return tt
 		}
@@ -245,22 +287,22 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 	for _, t := range meta.Tensors().Items() {
 		switch {
 		case contains(t.Name, "position_embd", "token_embd", "token_norm_embd", "token_types"):
-			createTensor(tensor{source: t}, input.bts)
+			createTensor(tensor{source: t}, input.bts, -1)
 			if _, ok := meta.Tensors().GroupLayers()["output"]; !ok && t.Name == "token_embd.weight" {
-				createTensor(tensor{source: t, target: "output.weight"}, output.bts)
+				createTensor(tensor{source: t, target: "output.weight"}, output.bts, blocks)
 			}
 		case contains(t.Name, "cls", "output", "output_norm"):
-			createTensor(tensor{source: t}, output.bts)
+			createTensor(tensor{source: t}, output.bts, blocks)
 		case strings.HasPrefix(t.Name, "v.") || strings.HasPrefix(t.Name, "mm."):
 			// TODO: assign vision tensors to the gpu if possible
-			createTensor(tensor{source: t}, output.bts)
+			createTensor(tensor{source: t}, output.bts, blocks)
 		case contains(t.Name, "rope_freqs", "rope_factors_long", "rope_factors_short"):
 			// these tensors should be repeated per layer
 			for i, layer := range layers {
 				createTensor(tensor{
 					source: t,
 					target: "blk." + strconv.Itoa(i) + "." + t.Name,
-				}, layer.bts)
+				}, layer.bts, i)
 			}
 		default:
 			layerIndex := -1
@@ -271,10 +313,10 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			}

 			if layerIndex >= 0 {
-				createTensor(tensor{source: t}, layers[layerIndex].bts)
+				createTensor(tensor{source: t}, layers[layerIndex].bts, layerIndex)
 			} else {
 				// load all other tensors on the cpu
-				createTensor(tensor{source: t}, input.bts)
+				createTensor(tensor{source: t}, input.bts, -1)
 			}
 		}
 	}
@@ -287,8 +329,18 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}

 		b := C.ggml_backend_alloc_ctx_tensors_from_buft(c, bt)
+		for i := range btDeviceMemory[bt].Weights {
+			if btDeviceMemory[bt].Weights[i].Size != 0 {
+				if b != nil {
+					btDeviceMemory[bt].Weights[i].Status = ml.Allocated
+				} else {
+					btDeviceMemory[bt].Weights[i].Status = ml.Failed
+				}
+			}
+		}
+
 		if b == nil {
-			return nil, fmt.Errorf("unable to allocate memory from device %v for model weights", C.GoString(C.ggml_backend_buft_name(bt)))
+			panic(ml.ErrNoMem{BackendMemory: requiredMemory})
 		}

 		C.ggml_backend_buffer_set_usage(b, C.GGML_BACKEND_BUFFER_USAGE_WEIGHTS)
@@ -307,73 +359,6 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 		}
 	}

-	var doneBytes atomic.Uint64
-	totalBytes := uint64(n) - meta.Tensors().Offset
-
-	g, ctx := errgroup.WithContext(ctx)
-	g.SetLimit(runtime.GOMAXPROCS(0))
-	for _, t := range meta.Tensors().Items() {
-		t := t
-		g.Go(func() error {
-			tts := make([]*C.struct_ggml_tensor, max(1, len(targets[t.Name])))
-			for i := range tts {
-				target := targets[t.Name][i]
-				if target == "" {
-					target = t.Name
-				}
-
-				tt, ok := tensors[target]
-				if !ok {
-					return fmt.Errorf("unassigned tensor: %s", t.Name)
-				}
-
-				tts[i] = tt
-			}
-
-			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
-			// seeking around within an FD shared between all goroutines.
-			file, err := os.Open(r.Name())
-			if err != nil {
-				slog.Warn("file open error", "file", r.Name(), "error", err)
-				return err
-			}
-			defer file.Close()
-			sr := io.NewSectionReader(file, int64(meta.Tensors().Offset+t.Offset), int64(t.Size()))
-			bts := make([]byte, 128*format.KibiByte)
-
-			var s uint64
-			for s < t.Size() {
-				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
-				if err := ctx.Err(); err != nil {
-					return err
-				}
-
-				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
-				if err != nil {
-					slog.Warn("file read error", "file", r.Name(), "error", err)
-					return err
-				}
-
-				for _, tt := range tts {
-					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
-				}
-
-				s += uint64(n)
-
-				if params.Progress != nil {
-					done := doneBytes.Add(uint64(n))
-					params.Progress(float32(done) / float32(totalBytes))
-				}
-			}
-
-			return nil
-		})
-	}
-
-	if err := g.Wait(); err != nil {
-		return nil, err
-	}
-
 	// map devices to backend buffer types so new tensors can be assigned to the correct device
 	deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)

@@ -397,9 +382,11 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,

 	maxGraphNodes := max(8192, len(meta.Tensors().Items())*5)
 	return &Backend{
-		flashAttention: params.FlashAttention,
-		meta:           meta,
-		tensors:        tensors,
+		modelPath:         modelPath,
+		flashAttention:    params.FlashAttention,
+		meta:              meta,
+		tensorLoadTargets: targets,
+		tensors:           tensors,
 		sched: C.ggml_backend_sched_new(
 			(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
 			(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
@@ -418,7 +405,9 @@ func New(ctx context.Context, r *os.File, params ml.BackendParams) (ml.Backend,
 			}
 			return m
 		}(),
-		maxGraphNodes: maxGraphNodes,
+		requiredMemory: &requiredMemory,
+		btDeviceMemory: btDeviceMemory,
+		maxGraphNodes:  maxGraphNodes,
 	}, nil
 }

@@ -426,6 +415,81 @@ func init() {
 	ml.RegisterBackend("ggml", New)
 }

+func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
+	var doneBytes atomic.Uint64
+	totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
+
+	g, ctx := errgroup.WithContext(ctx)
+	g.SetLimit(runtime.GOMAXPROCS(0))
+	for _, t := range b.meta.Tensors().Items() {
+		t := t
+		g.Go(func() error {
+			tts := make([]*C.struct_ggml_tensor, max(1, len(b.tensorLoadTargets[t.Name])))
+			for i := range tts {
+				target := b.tensorLoadTargets[t.Name][i]
+				if target == "" {
+					target = t.Name
+				}
+
+				tt, ok := b.tensors[target]
+				if !ok {
+					return fmt.Errorf("unassigned tensor: %s", t.Name)
+				}
+
+				tts[i] = tt
+			}
+
+			// Create a new FD for each goroutine so that each FD is read sequentially, rather than
+			// seeking around within an FD shared between all goroutines.
+			file, err := os.Open(b.modelPath)
+			if err != nil {
+				slog.Warn("file open error", "file", b.modelPath, "error", err)
+				return err
+			}
+			defer file.Close()
+			sr := io.NewSectionReader(file, int64(b.meta.Tensors().Offset+t.Offset), int64(t.Size()))
+			bts := make([]byte, 128*format.KibiByte)
+
+			var s uint64
+			for s < t.Size() {
+				// Stop if either the parent context has been canceled or if any of the other tensors returned an error
+				if err := ctx.Err(); err != nil {
+					return err
+				}
+
+				n, err := io.ReadFull(sr, bts[:min(len(bts), int(t.Size()-s))])
+				if err != nil {
+					slog.Warn("file read error", "file", b.modelPath, "error", err)
+					return err
+				}
+
+				for _, tt := range tts {
+					C.ggml_backend_tensor_set(tt, unsafe.Pointer(&bts[0]), C.size_t(s), C.size_t(n))
+				}
+
+				s += uint64(n)
+
+				if progress != nil {
+					done := doneBytes.Add(uint64(n))
+					progress(float32(done) / float32(totalBytes))
+				}
+			}
+
+			return nil
+		})
+	}
+
+	if err := g.Wait(); err != nil {
+		return err
+	}
+
+	return nil
+}
+
+func (b *Backend) BackendMemory() ml.BackendMemory {
+	return *b.requiredMemory
+}
+
 func (b *Backend) Config() fs.Config {
 	return b.meta.KV()
 }
@@ -457,6 +521,7 @@ func (b *Backend) NewContextSize(n int) ml.Context {
 			no_alloc: true,
 		}),
 		allocatedBuffers: &allocatedBuffers,
+		layer:            -1,
 	}
 }

@@ -483,6 +548,9 @@ type Context struct {

 	// maxGraphNodes is the maximum allowed number of graph nodes in this context
 	maxGraphNodes int
+
+	// layer is the graph layer that this context is allocating for - assumed to be cache
+	layer int
 }

 func (c *Context) Input() ml.Context {
@@ -493,6 +561,7 @@ func (c *Context) Input() ml.Context {
 			buft:             c.b.input,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
+			layer:            -1,
 		}
 	}

@@ -507,6 +576,7 @@ func (c *Context) Layer(i int) ml.Context {
 			buft:             buft,
 			allocatedBuffers: c.allocatedBuffers,
 			maxGraphNodes:    c.maxGraphNodes,
+			layer:            i,
 		}
 	}

@@ -544,22 +614,34 @@ func (c *Context) Compute(tensors ...ml.Tensor) {
 	}
 }

-func (c *Context) Reserve() error {
-	if !C.ggml_backend_sched_reserve(c.b.sched, c.graph) {
-		C.ggml_backend_sched_reset(c.b.sched)
-		return errors.New("failed to reserve graph")
-	}
+func (c *Context) Reserve() {
+	reserved := C.ggml_backend_sched_reserve(c.b.sched, c.graph)

 	slog.Debug("compute graph", "nodes", C.ggml_graph_n_nodes(c.graph), "splits", C.ggml_backend_sched_get_n_splits(c.b.sched))
-	for i := range c.b.schedBackends {
-		size := C.ggml_backend_sched_get_buffer_size(c.b.sched, c.b.schedBackends[i])
-		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
-			"size", format.HumanBytes2(uint64(size)))
+
+	// Reserve may get called multiple times for different graphs - we just want the last run, which will contain the max allocations
+	for _, bt := range c.b.schedBufts {
+		c.b.btDeviceMemory[bt].Graph = ml.Memory{}
 	}

-	C.ggml_backend_sched_reset(c.b.sched)
+	for i := range c.b.schedBackends {
+		bufferStatus := C.ggml_backend_sched_get_attempted_buffer_size(c.b.sched, c.b.schedBackends[i])

-	return nil
+		graph := &c.b.btDeviceMemory[c.b.schedBufts[i]].Graph
+		graph.Size += uint64(bufferStatus.size)
+		if bufferStatus.allocated && graph.Status != ml.Failed {
+			graph.Status = ml.Allocated
+		} else {
+			graph.Status = ml.Failed
+		}
+
+		slog.Info("compute graph", "backend", C.GoString(C.ggml_backend_name(c.b.schedBackends[i])), "buffer_type", C.GoString(C.ggml_backend_buft_name(c.b.schedBufts[i])),
+			"size", format.HumanBytes2(uint64(bufferStatus.size)))
+	}
+
+	if !reserved {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
 }

 func (c *Context) MaxGraphNodes() int {
@@ -579,7 +661,7 @@ func pad(length, pad C.size_t) C.size_t {
 	return ((length + pad - 1) / pad) * pad
 }

-func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {
+func (c *Context) newTensor(dtype ml.DType, shape []int) ml.Tensor {
 	if c.buft == nil {
 		panic("set Input or Layer before creating tensors")
 	}
@@ -602,7 +684,7 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

 	if len(shape) < 1 || shape[0] == 0 {
 		var shape C.int64_t = 0
-		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}, nil
+		return &Tensor{b: c.b, t: C.ggml_new_tensor(c.ctx, cdtype, 1, &shape)}
 	} else if len(shape) > 4 {
 		panic("unsupported number of dimensions")
 	}
@@ -615,40 +697,43 @@ func (c *Context) newTensor(dtype ml.DType, shape []int) (ml.Tensor, error) {

 	t := C.ggml_new_tensor(c.ctx, cdtype, C.int(len(shape)), shapeToGGML(shape))
 	size := pad(C.ggml_backend_buft_get_alloc_size(c.buft, t), C.ggml_backend_buft_get_alignment(c.buft))
-	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
-	if b == nil {
-		return nil, fmt.Errorf("unable to allocate %v from device %v for new tensor", format.HumanBytes2(uint64(size)), C.GoString(C.ggml_backend_buft_name(c.buft)))
-	}
-	*c.allocatedBuffers = append(*c.allocatedBuffers, b)

+	b := C.ggml_backend_buft_alloc_buffer(c.buft, size)
+	if c.layer >= 0 {
+		cache := &c.b.btDeviceMemory[c.buft].Cache[c.layer]
+
+		cache.Size += uint64(size)
+		if b != nil {
+			cache.Status = ml.Allocated
+		} else {
+			cache.Status = ml.Failed
+		}
+	}
+
+	if b == nil {
+		panic(ml.ErrNoMem{BackendMemory: *c.b.requiredMemory})
+	}
+
+	*c.allocatedBuffers = append(*c.allocatedBuffers, b)
 	C.ggml_backend_tensor_alloc(b, t, C.ggml_backend_buffer_get_base(b))
-	return &Tensor{b: c.b, t: t}, nil
+	return &Tensor{b: c.b, t: t}
 }

 func (c *Context) Empty(dtype ml.DType, shape ...int) ml.Tensor {
-	t, err := c.newTensor(dtype, shape)
-	if err != nil {
-		panic(err)
-	}
-
-	return t
+	return c.newTensor(dtype, shape)
 }

 func (c *Context) Zeros(dtype ml.DType, shape ...int) ml.Tensor {
-	t, err := c.newTensor(dtype, shape)
-	if err != nil {
-		panic(err)
-	}
-
+	t := c.newTensor(dtype, shape)
 	C.ggml_set_zero(t.(*Tensor).t)
 	return t
 }

-func checkShape[S ~[]E, E any](s S, shape ...int) error {
+func checkShape[S ~[]E, E any](s S, shape ...int) {
 	n := len(s)

 	if n == 0 {
-		return nil
+		return
 	}

 	for _, v := range shape {
@@ -656,44 +741,32 @@ func checkShape[S ~[]E, E any](s S, shape ...int) error {
 	}

 	if n != 1 {
-		return fmt.Errorf("invalid shape: %v", shape)
+		panic(fmt.Errorf("invalid shape: %v", shape))
 	}
-
-	return nil
 }

-func (c *Context) FromFloatSlice(s []float32, shape ...int) (ml.Tensor, error) {
-	if err := checkShape(s, shape...); err != nil {
-		return nil, err
-	}
+func (c *Context) FromFloatSlice(s []float32, shape ...int) ml.Tensor {
+	checkShape(s, shape...)

-	t, err := c.newTensor(ml.DTypeF32, shape)
-	if err != nil {
-		return nil, err
-	}
+	t := c.newTensor(ml.DTypeF32, shape)

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t, nil
+	return t
 }

-func (c *Context) FromIntSlice(s []int32, shape ...int) (ml.Tensor, error) {
-	if err := checkShape(s, shape...); err != nil {
-		return nil, err
-	}
+func (c *Context) FromIntSlice(s []int32, shape ...int) ml.Tensor {
+	checkShape(s, shape...)

-	t, err := c.newTensor(ml.DTypeI32, shape)
-	if err != nil {
-		return nil, err
-	}
+	t := c.newTensor(ml.DTypeI32, shape)

 	if len(s) > 0 {
 		C.ggml_backend_tensor_set(t.(*Tensor).t, unsafe.Pointer(&s[0]), 0, C.ggml_nbytes(t.(*Tensor).t))
 	}

-	return t, nil
+	return t
 }

 func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
@@ -711,12 +784,7 @@ func (c Context) Arange(start, stop, step float32, dtype ml.DType) ml.Tensor {
 			arange = append(arange, int32(i))
 		}

-		t, err := c.Input().FromIntSlice(arange, len(arange))
-		if err != nil {
-			panic(err)
-		}
-
-		return t
+		return c.Input().FromIntSlice(arange, len(arange))
 	default:
 		panic("unsupported dtype for arange")
 	}
@@ -867,6 +935,13 @@ func (t *Tensor) Mul(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	}
 }

+func (t *Tensor) Div(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_div(ctx.(*Context).ctx, t.t, t2.(*Tensor).t),
+	}
+}
+
 func (t *Tensor) Mulmat(ctx ml.Context, t2 ml.Tensor) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -915,6 +990,8 @@ func (t *Tensor) RMSNorm(ctx ml.Context, w ml.Tensor, eps float32) ml.Tensor {
 func (t *Tensor) Pad(ctx ml.Context, shape ...int) ml.Tensor {
 	if len(shape) != 4 {
 		panic("expected 4 dimensions")
+	} else if shape[3] != 0 {
+		panic("cuda does not support 4d tensors")
 	}

 	return &Tensor{
@@ -982,6 +1059,13 @@ func (t *Tensor) Scale(ctx ml.Context, s float64) ml.Tensor {
 	}
 }

+func (t *Tensor) SumRows(ctx ml.Context) ml.Tensor {
+	return &Tensor{
+		b: t.b,
+		t: C.ggml_sum_rows(ctx.(*Context).ctx, t.t),
+	}
+}
+
 func (t *Tensor) Softmax(ctx ml.Context) ml.Tensor {
 	return &Tensor{
 		b: t.b,
@@ -1053,28 +1137,15 @@ func (t *Tensor) View(ctx ml.Context, offset int, shape ...int) ml.Tensor {
 	}
 }

-const (
-	ropeTypeNorm   C.int = 0
-	ropeTypeNeox   C.int = 2
-	ropeTypeMrope  C.int = 8
-	ropeTypeVision C.int = 24
-)
-
-func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDim, ropeType uint32, ropeBase, ropeScale float32, options ...ml.RopeOption) ml.Tensor {
+func (t *Tensor) RoPE(ctx ml.Context, positions ml.Tensor, ropeDim int, ropeBase, ropeScale float32, options ...func(*rope.Options)) ml.Tensor {
 	// Default options
-	opts := &ml.RopeOptions{
-		OriginalContextLen: 131072,
-	}
+	opts := &rope.Options{OriginalContextLength: 131072, Factors: &Tensor{}}

 	// Apply any provided options
 	for _, option := range options {
 		option(opts)
 	}

-	if ropeFactors == nil {
-		ropeFactors = &Tensor{b: t.b}
-	}
-
 	dequant := t.t
 	if C.ggml_is_quantized(t.t._type) {
 		dequant = C.ggml_cast(ctx.(*Context).ctx, t.t, C.GGML_TYPE_F32)
@@ -1085,11 +1156,11 @@ func (t *Tensor) RoPE(ctx ml.Context, positionIDs, ropeFactors ml.Tensor, ropeDi
 		t: C.ggml_rope_ext(
 			ctx.(*Context).ctx,
 			dequant,
-			positionIDs.(*Tensor).t,
-			ropeFactors.(*Tensor).t,
+			positions.(*Tensor).t,
+			opts.Factors.(*Tensor).t,
 			C.int(ropeDim),
-			C.int(ropeType),
-			C.int(opts.OriginalContextLen),
+			C.int(opts.Type),
+			C.int(opts.OriginalContextLength),
 			C.float(ropeBase),
 			C.float(ropeScale),
 			C.float(0.0),
--- a/ml/backend/ggml/ggml/include/ggml-alloc.h
+++ b/ml/backend/ggml/ggml/include/ggml-alloc.h
@@ -66,6 +66,12 @@ GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph

 GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);

+struct ggml_allocr_buffer_status {
+    size_t size;
+    bool allocated;
+};
+GGML_API struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id);
+
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
 GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
--- a/ml/backend/ggml/ggml/include/ggml-backend.h
+++ b/ml/backend/ggml/ggml/include/ggml-backend.h
@@ -304,6 +304,12 @@ extern "C" {

    GGML_API size_t               ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);

+    struct ggml_backend_buffer_status {
+        size_t size;
+        bool allocated;
+    };
+    GGML_API struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
+
    GGML_API void                 ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
    GGML_API ggml_backend_t       ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);

--- a/ml/backend/ggml/ggml/src/ggml-alloc.c
+++ b/ml/backend/ggml/ggml/src/ggml-alloc.c
@@ -364,6 +364,7 @@ struct node_alloc {
 struct ggml_gallocr {
    ggml_backend_buffer_type_t * bufts; // [n_buffers]
    ggml_backend_buffer_t * buffers; // [n_buffers]
+    size_t *buffer_sizes; // [n_buffers]
    struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
    int n_buffers;

@@ -387,6 +388,9 @@ ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs
    galloc->buffers = calloc(n_bufs, sizeof(ggml_backend_buffer_t));
    GGML_ASSERT(galloc->buffers != NULL);

+    galloc->buffer_sizes = calloc(n_bufs, sizeof(size_t));
+    GGML_ASSERT(galloc->buffer_sizes != NULL);
+
    galloc->buf_tallocs = calloc(n_bufs, sizeof(struct ggml_dyn_tallocr *));
    GGML_ASSERT(galloc->buf_tallocs != NULL);

@@ -453,6 +457,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
    ggml_hash_set_free(&galloc->hash_set);
    free(galloc->hash_values);
    free(galloc->bufts);
+    free(galloc->buffer_sizes);
    free(galloc->buffers);
    free(galloc->buf_tallocs);
    free(galloc->node_allocs);
@@ -748,6 +753,8 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
        }
    }

+    bool success = true;
+
    // reallocate buffers if needed
    for (int i = 0; i < galloc->n_buffers; i++) {
        // if the buffer type is used multiple times, we reuse the same buffer
@@ -769,15 +776,20 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c

            ggml_backend_buffer_free(galloc->buffers[i]);
            galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
-            if (galloc->buffers[i] == NULL) {
+            if (galloc->buffers[i]) {
+                galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
+                ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+            } else {
                GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
-                return false;
+                galloc->buffer_sizes[i] = new_size;
+                success = false;
            }
-            ggml_backend_buffer_set_usage(galloc->buffers[i], GGML_BACKEND_BUFFER_USAGE_COMPUTE);
+        } else {
+            galloc->buffer_sizes[i] = ggml_backend_buffer_get_size(galloc->buffers[i]);
        }
    }

-    return true;
+    return success;
 }

 bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
@@ -934,6 +946,24 @@ size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
    return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
 }

+struct ggml_allocr_buffer_status ggml_gallocr_get_attempted_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+    GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+    for (int i = 0; i < buffer_id; i++) {
+        if (galloc->buf_tallocs[i] == galloc->buf_tallocs[buffer_id]) {
+            // This buffer is the same as a previous one due to the same buffer type being used multiple times
+            // (See above.) However, we need a different check because multiple buffers might be NULL in our
+            // case and we still want to know the attempted size.
+
+            struct ggml_allocr_buffer_status status = {0, true};
+            return status;
+        }
+    }
+
+    struct ggml_allocr_buffer_status status = {galloc->buffer_sizes[buffer_id], galloc->buffers[buffer_id] != NULL};
+    return status;
+}
+
 // utils

 static void free_buffers(ggml_backend_buffer_t ** buffers, const size_t * n_buffers) {
--- a/ml/backend/ggml/ggml/src/ggml-backend.cpp
+++ b/ml/backend/ggml/ggml/src/ggml-backend.cpp
@@ -1629,6 +1629,16 @@ size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backe
    return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
 }

+struct ggml_backend_buffer_status ggml_backend_sched_get_attempted_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+    int backend_index = ggml_backend_sched_backend_id(sched, backend);
+    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
+
+    struct ggml_allocr_buffer_status allocr_status = ggml_gallocr_get_attempted_buffer_size(sched->galloc, backend_index);
+    struct ggml_backend_buffer_status status = {allocr_status.size, allocr_status.allocated};
+
+    return status;
+}
+
 void ggml_backend_sched_set_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
    int backend_index = ggml_backend_sched_backend_id(sched, backend);
    GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
--- a/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
+++ b/ml/backend/ggml/ggml/src/ggml-cpu/cpu.go
@@ -3,7 +3,7 @@ package cpu
 // #cgo CFLAGS: -O3 -Wno-implicit-function-declaration
 // #cgo CXXFLAGS: -std=c++17
 // #cgo CPPFLAGS: -I${SRCDIR}/amx -I${SRCDIR}/llamafile -I${SRCDIR}/.. -I${SRCDIR}/../../include
-// #cgo CPPFLAGS: -DGGML_USE_LLAMAFILE
+// #cgo CPPFLAGS: -DNDEBUG -DGGML_USE_LLAMAFILE
 // #cgo linux CPPFLAGS: -D_GNU_SOURCE
 // #cgo darwin,arm64 CPPFLAGS: -DGGML_USE_ACCELERATE -DACCELERATE_NEW_LAPACK -DACCELERATE_LAPACK_ILP64
 // #cgo darwin,arm64 LDFLAGS: -framework Accelerate
--- a/ml/backend/ggml/ggml/src/ggml-metal/metal.go
+++ b/ml/backend/ggml/ggml/src/ggml-metal/metal.go
@@ -4,6 +4,6 @@ package metal

 //go:generate sh -c "{ echo // Code generated by 'go generate'. DO NOT EDIT.; sed -e '/__embed_ggml-common.h__/r ../ggml-common.h' -e '/__embed_ggml-common.h__/d' -e '/#include \"ggml-metal-impl.h\"/r ggml-metal-impl.h' -e '/#include \"ggml-metal-impl.h\"/d' ggml-metal.metal; } >ggml-metal-embed.metal"

-// #cgo CPPFLAGS: -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
+// #cgo CPPFLAGS: -DGGML_METAL_NDEBUG -DGGML_METAL_EMBED_LIBRARY -I.. -I../../include
 // #cgo LDFLAGS: -framework Metal -framework MetalKit
 import "C"
--- a/ml/nn/fast/rope.go
+++ b/ml/nn/fast/rope.go
@@ -0,0 +1,21 @@
+// fast provides implementations of fast (fused) operations for increased performance.
+package fast
+
+import (
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn/rope"
+)
+
+// fastRoPE is an interface for tensors that support fast rotary positional embedding.
+type fastRoPE interface {
+	RoPE(ctx ml.Context, positionIDs ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor
+}
+
+// RoPE applies rotary positional embedding to tensor `t`.
+func RoPE(ctx ml.Context, t, positions ml.Tensor, dim int, base, scale float32, options ...func(*rope.Options)) ml.Tensor {
+	if t, ok := t.(fastRoPE); ok {
+		return t.RoPE(ctx, positions, dim, base, scale, options...)
+	}
+
+	panic("RoPE not implemented for this tensor type")
+}
--- a/ml/nn/rope/rope.go
+++ b/ml/nn/rope/rope.go
@@ -0,0 +1,33 @@
+package rope
+
+import "github.com/ollama/ollama/ml"
+
+// Options contains optional parameters for RoPE function
+type Options struct {
+	OriginalContextLength int
+	Type                  int
+	Factors               ml.Tensor
+}
+
+// WithOriginalContextLength sets a custom context length
+func WithOriginalContextLength(n int) func(*Options) {
+	return func(opts *Options) {
+		opts.OriginalContextLength = n
+	}
+}
+
+// WithType sets RoPE type to NeoX
+func WithTypeNeoX() func(*Options) {
+	return func(opts *Options) {
+		opts.Type = 2
+	}
+}
+
+// WithFactors sets custom rope factors
+func WithFactors(factors ml.Tensor) func(*Options) {
+	return func(opts *Options) {
+		if factors != nil {
+			opts.Factors = factors
+		}
+	}
+}
--- a/model/bytepairencoding.go
+++ b/model/bytepairencoding.go
@@ -5,116 +5,13 @@ import (
 	"context"
 	"iter"
 	"log/slog"
-	"slices"
 	"strings"
-	"sync"

 	"github.com/dlclark/regexp2"
 	heap "github.com/emirpasic/gods/v2/trees/binaryheap"
 	"github.com/ollama/ollama/logutil"
 )

-type Special int32
-
-const (
-	SpecialBOS Special = iota
-	SpecialEOS
-)
-
-const (
-	TOKEN_TYPE_NORMAL = iota + 1
-	TOKEN_TYPE_UNKNOWN
-	TOKEN_TYPE_CONTROL
-	TOKEN_TYPE_USER_DEFINED
-	TOKEN_TYPE_UNUSED
-	TOKEN_TYPE_BYTE
-)
-
-type TextProcessor interface {
-	Encode(s string, addSpecial bool) ([]int32, error)
-	Decode([]int32) (string, error)
-	Is(int32, Special) bool
-	Vocabulary() *Vocabulary
-}
-
-type Vocabulary struct {
-	Values []string
-	Types  []int32
-	Scores []float32
-	Merges []string
-
-	BOS, EOS, EOT          int32
-	AddBOS, AddEOS, AddEOT bool
-
-	specialOnce sync.Once
-	special     []string
-
-	valuesOnce sync.Once
-	values     map[string]int32
-
-	mergeOnce sync.Once
-	merge     map[string]int32
-}
-
-func (v *Vocabulary) Is(id int32, special Special) bool {
-	switch special {
-	case SpecialBOS:
-		return id == v.BOS
-	case SpecialEOS:
-		return id == v.EOS || id == v.EOT
-	default:
-		return false
-	}
-}
-
-func (v *Vocabulary) Encode(s string) int32 {
-	v.valuesOnce.Do(func() {
-		v.values = make(map[string]int32, len(v.Values))
-		for i, value := range v.Values {
-			v.values[value] = int32(i)
-		}
-	})
-
-	if id, ok := v.values[s]; ok {
-		return id
-	}
-
-	return -1
-}
-
-func (v *Vocabulary) Decode(id int32) string {
-	return v.Values[id]
-}
-
-func (v *Vocabulary) SpecialVocabulary() []string {
-	v.specialOnce.Do(func() {
-		for i := range v.Values {
-			if slices.Contains([]int{105, 106}, i) {
-				v.special = append(v.special, v.Values[i])
-			} else if v.Types[i] == TOKEN_TYPE_CONTROL {
-				v.special = append(v.special, v.Values[i])
-			}
-		}
-	})
-
-	return v.special
-}
-
-func (v *Vocabulary) Merge(left, right string) int {
-	v.mergeOnce.Do(func() {
-		v.merge = make(map[string]int32, len(v.Merges))
-		for i, merge := range v.Merges {
-			v.merge[merge] = int32(i)
-		}
-	})
-
-	if id, ok := v.merge[left+" "+right]; ok {
-		return int(id)
-	}
-
-	return -1
-}
-
 type BytePairEncoding struct {
 	pre   *regexp2.Regexp
 	vocab *Vocabulary
@@ -304,27 +201,12 @@ func (bpe BytePairEncoding) Encode(s string, addSpecial bool) ([]int32, error) {
 		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+
 	if addSpecial && len(ids) > 0 {
-		if bpe.vocab.AddBOS {
-			if ids[0] == bpe.vocab.BOS {
-				slog.Warn("adding bos token to prompt which already has it", "id", bpe.vocab.BOS)
-			}
-
-			slog.Debug("adding bos token to prompt", "id", bpe.vocab.BOS)
-			ids = append([]int32{bpe.vocab.BOS}, ids...)
-		}
-
-		if bpe.vocab.AddEOS {
-			if ids[len(ids)-1] == bpe.vocab.EOS {
-				slog.Warn("adding eos token to prompt which already has it", "id", bpe.vocab.EOS)
-			}
-
-			slog.Debug("adding eos token to prompt", "id", bpe.vocab.EOS)
-			ids = append(ids, bpe.vocab.EOS)
-		}
+		ids = bpe.vocab.addSpecials(ids)
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

@@ -352,6 +234,6 @@ func (bpe BytePairEncoding) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
 	return sb.String(), nil
 }
--- a/model/bytepairencoding_test.go
+++ b/model/bytepairencoding_test.go
--- a/model/input/input.go
+++ b/model/input/input.go
@@ -2,16 +2,30 @@ package input

 import "github.com/ollama/ollama/ml"

+// Multimodal is a multimodal embedding or a component of one.
+// For example, it could be a row of an image that can be processed
+// independently.
+type Multimodal struct {
+	// Tensor is the embedding data. Implementations may chose what to
+	// store here or it may be nil if not needed. However, any ml.Tensor
+	// objects must be stored here and not in Data.
+	Tensor ml.Tensor
+
+	// Data is implementation-specific opaque data, such as metadata on how
+	// to layout Tensor. It may be nil if not needed. It may also store larger
+	// objects such as complete images if they are to be processed later.
+	Data any
+}
+
 // Input represents one token in the input stream
 type Input struct {
 	// Token is a single element of text.
 	Token int32

-	// Multimodal is opaque data representing a non-text
-	// element such as an image (or part of one if the image
-	// can be processed in pieces). It may be either together
-	// with Token or on its own.
-	Multimodal any
+	// Multimodal is represents a non-text element such as an
+	// image (or part of one if the image can be processed in pieces).
+	// It may be used either together with Token or on its own.
+	Multimodal []Multimodal

 	// MultimodalHash is a unique representation of the data
 	// stored in Multimodal, used for caching and comparing
@@ -32,7 +46,7 @@ type Input struct {
 // Positions slice.
 type MultimodalIndex struct {
 	Index      int
-	Multimodal any
+	Multimodal []Multimodal
 }

 // Batch contains the inputs for a model forward pass
--- a/model/model.go
+++ b/model/model.go
@@ -40,12 +40,13 @@ type MultimodalProcessor interface {
 	// EncodeMultimodal processes a single input (such as an image) and
 	// generates an output (typically an embedding) that can be used by the model.
 	//
-	// The return value is most typically an ml.Tensor, however, different
-	// type are possible, such as an object containing a tensor plus
-	// additional metadata, a slice of tensors or even just the original input.
+	// The return value is one or more tensors, each with optional model-specific
+	// opaque metadata. Typically, the tensors might be views into an embedding
+	// with each view representing a chunk of data that can be processed independently
+	// in different batches.
 	//
 	// The result may be cached by the runner.
-	EncodeMultimodal(ml.Context, []byte) (any, error)
+	EncodeMultimodal(ml.Context, []byte) ([]input.Multimodal, error)

 	// PostTokenize is called after tokenization to allow the model to edit the
 	// input stream to correctly arrange multimodal elements.
@@ -97,14 +98,8 @@ func Register(name string, f func(fs.Config) (Model, error)) {
 }

 // New initializes a new model instance with the provided configuration based on the metadata in the model file
-func New(ctx context.Context, modelPath string, params ml.BackendParams) (Model, error) {
-	r, err := os.Open(modelPath)
-	if err != nil {
-		return nil, err
-	}
-	defer r.Close()
-
-	b, err := ml.NewBackend(ctx, r, params)
+func New(modelPath string, params ml.BackendParams) (Model, error) {
+	b, err := ml.NewBackend(modelPath, params)
 	if err != nil {
 		return nil, err
 	}
@@ -133,7 +128,7 @@ func NewTextProcessor(s string) (TextProcessor, error) {
 		return nil, err
 	}
 	defer r.Close()
-	meta, _, err := fsggml.Decode(r, -1)
+	meta, err := fsggml.Decode(r, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -292,11 +287,7 @@ func Forward(ctx ml.Context, m Model, inputs []int32, batch input.Batch) (ml.Ten
 		return nil, errors.New("batch size cannot be less than 1")
 	}

-	var err error
-	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
-	if err != nil {
-		return nil, err
-	}
+	batch.Inputs = ctx.Input().FromIntSlice(inputs, len(inputs))

 	cache := m.Config().Cache
 	if cache != nil {
--- a/model/models/gemma2/model.go
+++ b/model/models/gemma2/model.go
@@ -7,6 +7,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )
@@ -43,10 +45,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT: int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -80,11 +85,10 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(2)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -94,7 +98,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -124,7 +128,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, uint32(m.Options.attnKeyLen), uint32(2), m.Options.ropeBase, m.Options.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.Options.attnKeyLen, m.Options.ropeBase, m.Options.ropeScale, rope.WithTypeNeoX()), nil
 }

 type MLP struct {
@@ -171,15 +175,8 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)
 	hiddenState = hiddenState.Scale(ctx, math.Sqrt(float64(m.Options.hiddenSize)))
--- a/model/models/gemma3/model.go
+++ b/model/models/gemma3/model.go
@@ -60,12 +60,16 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Scores: c.Floats("tokenizer.ggml.scores"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(1),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(106),
-				AddEOT: c.Bool("tokenizer.ggml.add_eot_token", false),
+				EOS: append(
+					[]int32{
+						int32(c.Uint("tokenizer.ggml.eos_token_id")),
+						int32(c.Uint("tokenizer.ggml.eot_token_id", 106)),
+					},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -82,7 +86,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -97,33 +101,30 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s,
+	pixelValues := ctx.Input().FromFloatSlice(f32s,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.imageSize,
 		m.ImageProcessor.numChannels,
 	)
-	if err != nil {
-		return nil, err
-	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = m.MultiModalProjector.Forward(ctx, visionOutputs, m.imageSize, m.patchSize, m.VisionModel.eps)
-	return visionOutputs, nil
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input

 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.(ml.Tensor)
+			inputMultimodal := inp.Multimodal[0].Tensor

 			result = append(result,
-				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3},               // "\n\n"
-				input.Input{Token: 255999},                                                   // "<start_of_image>""
-				input.Input{Multimodal: inputMultimodal, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
+				input.Input{Token: 108, SameBatch: inputMultimodal.Dim(1) + 3}, // "\n\n"
+				input.Input{Token: 255999},                                     // "<start_of_image>""
+				input.Input{Multimodal: []input.Multimodal{{Tensor: inputMultimodal}}, MultimodalHash: inp.MultimodalHash}, // image data is on the first placeholder
 			)

 			// add image token placeholders
@@ -140,15 +141,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/gemma3/model_text.go
+++ b/model/models/gemma3/model_text.go
@@ -7,6 +7,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -73,7 +75,6 @@ type TextSelfAttention struct {

 func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextConfig) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(2)

 	ropeBase := opts.ropeLocalBase
 	if (layer+1)%gemmaGlobalCacheCount == 0 {
@@ -83,7 +84,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, opts.attnKeyLen, opts.numHeads, batchSize)
 	q = sa.QueryNorm.Forward(ctx, q, opts.eps)
-	q = q.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	if opts.largeModelScaling {
 		q = q.Scale(ctx, 1.0/math.Sqrt(float64(opts.hiddenSize/opts.numHeads)))
@@ -94,7 +95,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, layer int, hiddenState, pos
 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, opts.attnKeyLen, opts.numKVHeads, batchSize)
 	k = sa.KeyNorm.Forward(ctx, k, opts.eps)
-	k = k.RoPE(ctx, positionIDs, nil, uint32(opts.attnKeyLen), ropeType, ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.attnKeyLen, ropeBase, opts.ropeScale, rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, opts.attnValLen, opts.numKVHeads, batchSize)
@@ -112,7 +113,7 @@ func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.T
 		ropeBase = m.TextConfig.ropeGlobalBase
 	}

-	return key.RoPE(ctx, shift, nil, uint32(m.TextConfig.attnKeyLen), uint32(2), ropeBase, m.TextConfig.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.TextConfig.attnKeyLen, ropeBase, m.TextConfig.ropeScale, rope.WithTypeNeoX()), nil
 }

 type TextMLP struct {
@@ -165,7 +166,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	// set image embeddings
 	var except []int
 	for _, image := range batch.Multimodal {
-		visionOutputs := image.Multimodal.(ml.Tensor)
+		visionOutputs := image.Multimodal[0].Tensor
 		ctx.Forward(visionOutputs.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), visionOutputs.Dim(0)*visionOutputs.Dim(1))))

 		for i := range visionOutputs.Dim(1) {
--- a/model/models/llama/model.go
+++ b/model/models/llama/model.go
@@ -1,22 +1,23 @@
 package llama

 import (
-	"fmt"
+	"cmp"
 	"math"
-	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model"
 	"github.com/ollama/ollama/model/input"
 )

 type Options struct {
 	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
 	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32
 }

 type Model struct {
@@ -32,10 +33,6 @@ type Model struct {
 }

 func New(c fs.Config) (model.Model, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
-	}
-
 	m := Model{
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
@@ -43,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		Layers: make([]Layer, c.Uint("block_count")),
@@ -57,10 +54,11 @@ func New(c fs.Config) (model.Model, error) {
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
-			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}

@@ -77,31 +75,31 @@ type SelfAttention struct {
 	RopeFactors ml.Tensor  `gguf:"rope_freqs.weight"`
 }

-func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
+	ropeDim := cmp.Or(opts.ropeDim, headDim)

-	q := sa.Query.Forward(ctx, hiddenState)
-	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	query := sa.Query.Forward(ctx, hiddenState)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)

-	k := sa.Key.Forward(ctx, hiddenState)
-	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	key := sa.Key.Forward(ctx, hiddenState)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	v := sa.Value.Forward(ctx, hiddenState)
-	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+	value := sa.Value.Forward(ctx, hiddenState)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

-	scaleFactor := 1.0 / math.Sqrt(float64(headDim))
-	kqv := nn.Attention(ctx, q, k, v, scaleFactor, cache)
-	kqv = kqv.Reshape(ctx, opts.hiddenSize, batchSize)
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

-	return sa.Output.Forward(ctx, kqv)
+	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
+	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
+	return sa.Output.Forward(ctx, attention)
 }

 func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].SelfAttention.RopeFactors, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].SelfAttention.RopeFactors)), nil
 }

 type MLP struct {
@@ -122,11 +120,11 @@ type Layer struct {
 	MLP           *MLP
 }

-func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+func (l *Layer) Forward(ctx ml.Context, hiddenState, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
 	residual := hiddenState

 	hiddenState = l.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
-	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positionIDs, cache, opts)
+	hiddenState = l.SelfAttention.Forward(ctx, hiddenState, positions, cache, opts)

 	// In the final layer (outputs != nil), optimize by pruning to just the token positions
 	// we need logits for.
@@ -144,27 +142,19 @@ func (l *Layer) Forward(ctx ml.Context, hiddenState, positionIDs, outputs ml.Ten
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))

 	hiddenState := m.TokenEmbedding.Forward(ctx, batch.Inputs)

 	for i, layer := range m.Layers {
 		m.Cache.SetLayer(i)

-		var lastLayerOutputs ml.Tensor
+		var outputs ml.Tensor
 		if i == len(m.Layers)-1 {
-			lastLayerOutputs = outputs
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
 		}

-		hiddenState = layer.Forward(ctx, hiddenState, positions, lastLayerOutputs, m.Cache, m.Options)
+		hiddenState = layer.Forward(ctx, hiddenState, positions, outputs, m.Cache, m.Options)
 	}

 	hiddenState = m.OutputNorm.Forward(ctx, hiddenState, m.eps)
--- a/model/models/llama4/model.go
+++ b/model/models/llama4/model.go
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -41,13 +40,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -63,7 +62,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) < 1 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -78,10 +77,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	tilesLocal, err := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)
-	if err != nil {
-		return nil, err
-	}
+	tilesLocal := ctx.Input().FromFloatSlice(pixelsLocal, size.X, size.Y, m.numChannels)

 	ratioW, ratioH := size.X/m.imageSize, size.Y/m.imageSize

@@ -92,81 +88,86 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	pixelValues := tilesLocal

 	if len(pixelsGlobal) > 0 {
-		tilesGlobal, err := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
-		if err != nil {
-			return nil, err
-		}
-
+		tilesGlobal := ctx.Input().FromFloatSlice(pixelsGlobal, m.imageSize, m.imageSize, m.numChannels)
 		pixelValues = pixelValues.Concat(ctx, tilesGlobal, 3)
 	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	visionOutputs = visionOutputs.Reshape(ctx, visionOutputs.Dim(0), visionOutputs.Dim(1)*visionOutputs.Dim(2)*visionOutputs.Dim(3))
 	projectedOutputs := m.Projector.Forward(ctx, visionOutputs)
-	return &chunks{Model: m, Tensor: projectedOutputs, aspectRatio: image.Point{ratioW, ratioH}}, nil
+
+	var multimodal []input.Multimodal
+	aspectRatio := image.Point{ratioW, ratioH}
+
+	var offset int
+	patchesPerChunk := projectedOutputs.Dim(1)
+	if aspectRatio.Y*aspectRatio.X > 1 {
+		patchesPerChunk = projectedOutputs.Dim(1) / (aspectRatio.X*aspectRatio.Y + 1)
+
+		for range aspectRatio.Y {
+			for x := range aspectRatio.X {
+				view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+					projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+					patchesPerChunk)
+				var separator separator
+				if x < aspectRatio.X-1 {
+					separator.x = true // <|tile_x_separator|>
+				} else {
+					separator.y = true // <|tile_y_separator|>
+				}
+				multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator})
+				offset += patchesPerChunk
+			}
+		}
+	}
+
+	view := projectedOutputs.View(ctx, projectedOutputs.Stride(1)*offset,
+		projectedOutputs.Dim(0), projectedOutputs.Stride(1),
+		patchesPerChunk)
+	multimodal = append(multimodal, input.Multimodal{Tensor: view, Data: &separator{}})
+
+	return multimodal, nil
 }

-type chunks struct {
-	*Model
-	ml.Tensor
-	aspectRatio image.Point
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type chunk struct {
-	*chunks
-	s, n int
-}
-
-func (r *chunk) floats() []float32 {
-	r.dataOnce.Do(func() {
-		temp := r.Backend().NewContext()
-		defer temp.Close()
-		temp.Forward(r.Tensor).Compute(r.Tensor)
-		r.data = r.Floats()
-	})
-
-	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
+type separator struct {
+	x bool
+	y bool
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 			continue
 		}

-		t := inp.Multimodal.(*chunks)
 		var imageInputs []input.Input
 		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_start|>

-		var offset int
-		patchesPerChunk := t.Dim(1)
-		if t.aspectRatio.Y*t.aspectRatio.X > 1 {
-			patchesPerChunk = t.Dim(1) / (t.aspectRatio.X*t.aspectRatio.Y + 1)
+		for i, mm := range inp.Multimodal {
+			patchesPerChunk := mm.Tensor.Dim(1)

-			for range t.aspectRatio.Y {
-				for x := range t.aspectRatio.X {
-					imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-					imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-					if x < t.aspectRatio.X-1 {
-						imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
-					}
-					offset += patchesPerChunk
+			if i < len(inp.Multimodal)-1 {
+				separator := mm.Data.(*separator)
+
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+
+				if separator.x {
+					imageInputs = append(imageInputs, input.Input{Token: 200084}) // <|tile_x_separator|>
 				}
-
-				imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				if separator.y {
+					imageInputs = append(imageInputs, input.Input{Token: 200085}) // <|tile_y_separator|>
+				}
+			} else {
+				imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                      // <|image|>
+				imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: []input.Multimodal{{Tensor: mm.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
+				imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
+				imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
 			}
 		}

-		imageInputs = append(imageInputs, input.Input{Token: 200090})                                                                                                                 // <|image|>
-		imageInputs = append(imageInputs, input.Input{Token: 200092, Multimodal: &chunk{t, offset, patchesPerChunk}, MultimodalHash: inp.MultimodalHash, SameBatch: patchesPerChunk}) // <|patch|>
-		imageInputs = append(imageInputs, slices.Repeat([]input.Input{{Token: 200092}}, patchesPerChunk-1)...)
-		imageInputs = append(imageInputs, input.Input{Token: 200080}) // <|image_end|>
-
 		result = append(result, imageInputs...)
 	}

@@ -174,15 +175,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/llama4/model_text.go
+++ b/model/models/llama4/model_text.go
@@ -8,6 +8,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

@@ -31,8 +33,8 @@ func (sa *TextAttention) Forward(ctx ml.Context, hiddenStates, positions, attent
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)

 	if useRope {
-		query = query.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
-		key = key.RoPE(ctx, positions, sa.RopeFactors, uint32(opts.ropeDim), uint32(0), opts.ropeBase, opts.ropeScale)
+		query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
+		key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))
 	}

 	if opts.useQKNorm {
@@ -80,7 +82,7 @@ func (e *TextExperts) Forward(ctx ml.Context, hiddenStates, routerLogits ml.Tens

 	nextStates := downStates.View(ctx, 0, hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2))
 	for i := 1; i < opts.numExpertsUsed; i++ {
-		nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
+		nextStates = nextStates.Add(ctx, downStates.View(ctx, i*downStates.Stride(1), hiddenStates.Dim(0), downStates.Stride(2), hiddenStates.Dim(2)))
 	}

 	return nextStates
@@ -210,12 +212,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		f32s := mi.Multimodal.(*chunk).floats()
-		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
-		if err != nil {
-			panic(err)
-		}
-
+		img := mi.Multimodal[0].Tensor
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

@@ -226,11 +223,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 			scales[i] = float32(math.Log(math.Floor(((float64(p)+1.0)/float64(m.attentionFloorScale))+1.0))*m.attentionScale + 1.0)
 		}

-		var err error
-		attentionScales, err = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
-		if err != nil {
-			panic(err)
-		}
+		attentionScales = ctx.Input().FromFloatSlice(scales, 1, 1, len(scales))
 	}

 	for i, layer := range m.Layers {
@@ -255,5 +248,5 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, m.Layers[layer].Attention.RopeFactors, uint32(0), uint32(m.ropeDim), m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(m.Layers[layer].Attention.RopeFactors)), nil
 }
--- a/model/models/llama4/model_vision.go
+++ b/model/models/llama4/model_vision.go
@@ -245,10 +245,7 @@ func (m *VisionModel) rotaryEmbedding(ctx ml.Context) (ml.Tensor, ml.Tensor) {
 		}
 	}

-	ropeFreqs, err := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)
-	if err != nil {
-		panic(err)
-	}
+	ropeFreqs := ctx.Input().FromFloatSlice(freqs, freqDim/2, numPatches, 2)

 	ropeFreqs = ropeFreqs.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
 	ropeFreqs = ropeFreqs.Reshape(ctx, freqDim, 1, numPatches)
--- a/model/models/mistral3/model.go
+++ b/model/models/mistral3/model.go
@@ -4,7 +4,6 @@ import (
 	"bytes"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -32,31 +31,26 @@ var _ model.MultimodalProcessor = (*Model)(nil)
 var _ model.TextProcessor = (*Model)(nil)

 func New(c fs.Config) (model.Model, error) {
-	textModel, err := NewTextModel(c)
-	if err != nil {
-		return nil, err
-	}
-
 	m := &Model{
-		TextModel:           textModel,
-		VisionModel:         newVisionModel(c),
-		ImageProcessor:      newImageProcessor(c),
-		MultiModalProjector: newMultiModalProjector(c),
 		BytePairEncoding: model.NewBytePairEncoding(
 			c.String("tokenizer.ggml.pretokenizer", `[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
 			&model.Vocabulary{
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id", 1)),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id", 2)),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
+		TextModel:           newTextModel(c),
+		VisionModel:         newVisionModel(c),
+		ImageProcessor:      newImageProcessor(c),
+		MultiModalProjector: newMultiModalProjector(c),
 	}

 	m.Cache = kvcache.NewCausalCache(m.TextModel.Shift)
@@ -105,7 +99,7 @@ func newMultiModalProjector(c fs.Config) *MultiModalProjector {
 	}
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -120,46 +114,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)
-	if err != nil {
-		return nil, err
-	}
+	pixelValues := ctx.Input().FromFloatSlice(f32s, size.X, size.Y, m.ImageProcessor.numChannels)

 	visionOutputs := m.VisionModel.Forward(ctx, pixelValues)
 	features, size := m.MultiModalProjector.Forward(ctx, visionOutputs, size)

 	// split into patches to be sent to the text transformer
-	parent := imageFeatures{tensor: features}
-	rows := make([]*imageRow, size.Y)
+	rows := make([]input.Multimodal, size.Y)
 	for i := range rows {
-		rows[i] = &imageRow{parent: &parent, s: i, shape: []int{features.Dim(0), size.X}}
+		rows[i].Tensor = features.View(ctx, features.Stride(1)*size.X*i, features.Dim(0), features.Stride(1), size.X)
 	}

 	return rows, nil
 }

-type imageFeatures struct {
-	tensor ml.Tensor
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type imageRow struct {
-	parent *imageFeatures
-	s      int
-	shape  []int
-}
-
-func (r *imageRow) data() []float32 {
-	n := 1
-	for _, s := range r.shape {
-		n *= s
-	}
-
-	return r.parent.data[r.s*n : (r.s+1)*n]
-}
-
 // PostTokenize arranges Mistral 3's inputs for the forward pass
 // In Mistral 3 and Pixtral, the input patches are arranged as follows:
 // [IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_BREAK][IMG]...[IMG][IMG_END]
@@ -168,15 +136,14 @@ func (r *imageRow) data() []float32 {
 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 	var result []input.Input
 	for _, inp := range inputs {
-		if inp.Multimodal == nil {
+		if len(inp.Multimodal) == 0 {
 			result = append(result, inp)
 		} else {
-			inputMultimodal := inp.Multimodal.([]*imageRow)
-			for i, row := range inputMultimodal {
+			for i, row := range inp.Multimodal {
 				// [IMG]
-				result = append(result, input.Input{Token: 10, Multimodal: row, MultimodalHash: inp.MultimodalHash, SameBatch: row.shape[1]})
-				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.shape[1]-1)...)
-				if i == len(inputMultimodal)-1 {
+				result = append(result, input.Input{Token: 10, Multimodal: []input.Multimodal{{Tensor: row.Tensor}}, MultimodalHash: inp.MultimodalHash, SameBatch: row.Tensor.Dim(1)})
+				result = append(result, slices.Repeat([]input.Input{{Token: 10}}, row.Tensor.Dim(1)-1)...)
+				if i == len(inp.Multimodal)-1 {
 					// [IMG_END]
 					result = append(result, input.Input{Token: 13})
 				} else {
@@ -191,15 +158,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache), nil
 }
--- a/model/models/mistral3/model_text.go
+++ b/model/models/mistral3/model_text.go
@@ -1,27 +1,24 @@
 package mistral3

 import (
-	"fmt"
+	"cmp"
 	"math"
-	"strings"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
-	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/ml/nn/fast"
 	"github.com/ollama/ollama/model/input"
 )

 type TextOptions struct {
-	hiddenSize, numHeads, numKVHeads, headDim int
-	eps, ropeBase, ropeScale                  float32
-	ropeDim                                   uint32
+	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
+	eps, ropeBase, ropeScale         float32
 }

 type TextModel struct {
-	model.Base
-
 	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
 	Layers         []Layer       `gguf:"blk"`
 	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
@@ -39,19 +36,15 @@ type SelfAttention struct {

 func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Tensor, cache kvcache.Cache, opts *TextOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
-	ropeType := uint32(0)
-	headDim := opts.headDim
-	if headDim == 0 {
-		headDim = opts.hiddenSize / opts.numHeads
-	}
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale)

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -62,7 +55,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten
 }

 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, uint32(0), m.ropeDim, m.ropeBase, m.ropeScale), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale), nil
 }

 type MLP struct {
@@ -109,20 +102,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor

 	// image embeddings
 	for _, image := range batch.Multimodal {
-		row := image.Multimodal.(*imageRow)
-		row.parent.dataOnce.Do(func() {
-			// use a new, throwaway context so the image tensor is not added to the graph
-			temp := m.Backend().NewContext()
-			temp.Forward(row.parent.tensor).Compute(row.parent.tensor)
-			row.parent.data = row.parent.tensor.Floats()
-			temp.Close()
-		})
-
-		imageFeature, err := ctx.Input().FromFloatSlice(row.data(), row.shape...)
-		if err != nil {
-			panic(err)
-		}
-
+		imageFeature := image.Multimodal[0].Tensor
 		ctx.Forward(imageFeature.Copy(ctx, hiddenState.View(ctx, image.Index*hiddenState.Stride(1), imageFeature.Dim(0)*imageFeature.Dim(1))))
 	}

@@ -141,24 +121,18 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	return m.Output.Forward(ctx, hiddenState)
 }

-func NewTextModel(c fs.Config) (*TextModel, error) {
-	if !strings.EqualFold(c.String("tokenizer.ggml.model"), "gpt2") {
-		return nil, fmt.Errorf("tokenizer %s not yet supported", c.String("tokenizer.ggml.model"))
-	}
-
-	textModel := &TextModel{
+func newTextModel(c fs.Config) *TextModel {
+	return &TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
 			hiddenSize: int(c.Uint("embedding_length")),
 			numHeads:   int(c.Uint("attention.head_count")),
 			numKVHeads: int(c.Uint("attention.head_count_kv")),
 			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
 			eps:        c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:   c.Float("rope.freq_base"),
 			ropeScale:  c.Float("rope.freq_scale", 1),
-			ropeDim:    c.Uint("rope.dimension_count"),
 		},
 	}
-
-	return textModel, nil
 }
--- a/model/models/mistral3/model_vision.go
+++ b/model/models/mistral3/model_vision.go
@@ -110,15 +110,8 @@ func (m *VisionModel) positionalEmbedding(ctx ml.Context, positionIDs ml.Tensor)
 		}
 	}

-	h, err := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
-	if err != nil {
-		panic(err)
-	}
-
-	w, err := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)
-	if err != nil {
-		panic(err)
-	}
+	h := ctx.Input().FromFloatSlice(frequenciesHeight, maxPatchesPerSide, frequencies/2)
+	w := ctx.Input().FromFloatSlice(frequenciesWidth, maxPatchesPerSide, frequencies/2)

 	h = h.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
 	w = w.Permute(ctx, 1, 0, 2, 3).Contiguous(ctx)
@@ -151,10 +144,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {
 		}
 	}

-	positionIDs, err := ctx.Input().FromIntSlice(positions, len(positions))
-	if err != nil {
-		panic(err)
-	}
+	positionIDs := ctx.Input().FromIntSlice(positions, len(positions))

 	positionEmbedding := m.positionalEmbedding(ctx, positionIDs)
 	cos, sin := positionEmbedding.Cos(ctx), positionEmbedding.Sin(ctx)
@@ -170,7 +160,7 @@ func (m *VisionModel) Forward(ctx ml.Context, pixelValues ml.Tensor) ml.Tensor {

 func newVisionModel(c fs.Config) *VisionModel {
 	return &VisionModel{
-		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count", 24)),
+		Layers: make([]VisionEncoderLayer, c.Uint("vision.block_count")),
 		VisionModelOptions: &VisionModelOptions{
 			hiddenSize:       int(c.Uint("vision.embedding_length", 1024)),
 			numHeads:         int(c.Uint("vision.attention.head_count", 16)),
--- a/model/models/mllama/model.go
+++ b/model/models/mllama/model.go
@@ -3,6 +3,7 @@ package mllama
 import (
 	"bytes"
 	"image"
+	"slices"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -37,13 +38,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
 				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				// TODO: set EOT to EOS otherwise 0 will stop generation
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		ImageProcessor: newImageProcessor(c),
@@ -58,7 +59,7 @@ func New(c fs.Config) (model.Model, error) {
 	return &m, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Transformer.Layers) == 0 || len(m.GlobalTransformer.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -73,21 +74,20 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 		return nil, err
 	}

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, ratio.numTiles())
-	if err != nil {
-		return nil, err
+	if ratio.numTiles() < m.maxNumTiles {
+		// Pad tiles to maxNumTiles
+		f32s = slices.Grow(f32s, m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles)
+		f32s = f32s[:m.imageSize*m.imageSize*m.numChannels*m.maxNumTiles]
 	}

-	pixelValues = pixelValues.Pad(ctx, 0, 0, 0, m.ImageProcessor.maxNumTiles-ratio.numTiles())
-
-	aspectRatio, err := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)
-	if err != nil {
-		return nil, err
-	}
+	pixelValues := ctx.Input().FromFloatSlice(f32s, m.imageSize, m.imageSize, m.numChannels, m.maxNumTiles)
+	aspectRatio := ctx.Input().FromIntSlice([]int32{int32(ratio.rank)}, 1)

 	positionIDs := ctx.Arange(0, 1601, 1, ml.DTypeI32)
 	crossAttentionStates := m.VisionModel.Forward(ctx, pixelValues, positionIDs, aspectRatio)
-	return m.Projector.Forward(ctx, crossAttentionStates), nil
+	projectedOutputs := m.Projector.Forward(ctx, crossAttentionStates)
+
+	return []input.Multimodal{{Tensor: projectedOutputs}}, nil
 }

 func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
@@ -103,18 +103,11 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
 	var crossAttentionStates ml.Tensor
 	if len(batch.Multimodal) > 0 {
-		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal.(ml.Tensor)
+		crossAttentionStates = batch.Multimodal[len(batch.Multimodal)-1].Multimodal[0].Tensor
 	}

-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	// TODO: attention mask, cross attention mask
 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, crossAttentionStates, nil, m.Cache.(*kvcache.WrapperCache)), nil
--- a/model/models/mllama/model_text.go
+++ b/model/models/mllama/model_text.go
@@ -8,6 +8,8 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 )

 type TextSelfAttention struct {
@@ -21,15 +23,14 @@ type TextSelfAttention struct {
 func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.Tensor, cache *kvcache.WrapperCache, opts *TextModelOptions) ml.Tensor {
 	batchSize := hiddenState.Dim(1)
 	headDim := opts.hiddenSize / opts.numHeads
-	ropeType := uint32(0)

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	query = query.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	query = fast.RoPE(ctx, query, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	key = key.RoPE(ctx, positions, sa.RopeFactors, opts.ropeDim, ropeType, opts.ropeBase, opts.ropeScale)
+	key = fast.RoPE(ctx, key, positions, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithFactors(sa.RopeFactors))

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -44,7 +45,7 @@ func (sa *TextSelfAttention) Forward(ctx ml.Context, hiddenState, positions ml.T
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
 	// This will only get called for layers in the cache, which are just the self attention layers
 	if sa, ok := m.Transformer.Layers[layer].(*TextSelfAttentionDecoderLayer); ok {
-		return key.RoPE(ctx, shift, sa.SelfAttention.RopeFactors, m.ropeDim, uint32(0), m.ropeBase, m.ropeScale), nil
+		return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithFactors(sa.SelfAttention.RopeFactors)), nil
 	}

 	return key, nil
@@ -199,8 +200,8 @@ func (d *TextDecoder) Forward(ctx ml.Context, hiddenState, positionIDs, outputs,

 type TextModelOptions struct {
 	hiddenSize, numHeads, numKVHeads int
+	ropeDim                          int
 	eps, ropeBase, ropeScale         float32
-	ropeDim                          uint32

 	crossAttentionLayers []int32
 }
@@ -240,10 +241,10 @@ func newTextModel(c fs.Config) *TextModel {
 			hiddenSize:           int(c.Uint("embedding_length")),
 			numHeads:             int(c.Uint("attention.head_count")),
 			numKVHeads:           int(c.Uint("attention.head_count_kv")),
+			ropeDim:              int(c.Uint("rope.dimension_count")),
 			eps:                  c.Float("attention.layer_norm_rms_epsilon"),
 			ropeBase:             c.Float("rope.freq_base"),
 			ropeScale:            c.Float("rope.freq_scale", 1),
-			ropeDim:              c.Uint("rope.dimension_count"),
 			crossAttentionLayers: c.Ints("attention.cross_attention_layers"),
 		},
 	}
--- a/model/models/mllama/model_vision.go
+++ b/model/models/mllama/model_vision.go
@@ -16,8 +16,6 @@ type VisionSelfAttention struct {
 	Key    *nn.Linear `gguf:"attn_k"`
 	Value  *nn.Linear `gguf:"attn_v"`
 	Output *nn.Linear `gguf:"attn_output"`
-
-	Gate ml.Tensor `gguf:"attn_gate"`
 }

 func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, opts *VisionModelOptions) ml.Tensor {
@@ -25,27 +23,16 @@ func (sa *VisionSelfAttention) Forward(ctx ml.Context, hiddenState ml.Tensor, op

 	query := sa.Query.Forward(ctx, hiddenState)
 	query = query.Reshape(ctx, headDim, opts.numHeads, query.Dim(1), batchSize)
-	query = query.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	key := sa.Key.Forward(ctx, hiddenState)
 	key = key.Reshape(ctx, headDim, opts.numHeads, key.Dim(1), batchSize)
-	key = key.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)

 	value := sa.Value.Forward(ctx, hiddenState)
 	value = value.Reshape(ctx, headDim, opts.numHeads, value.Dim(1), batchSize)
-	value = value.Permute(ctx, 1, 2, 0, 3).Contiguous(ctx)

-	scores := key.Mulmat(ctx, query)
-	scores = scores.Scale(ctx, 1.0/math.Sqrt(float64(headDim)))
-	scores = scores.Softmax(ctx)
-
-	attention := value.Mulmat(ctx, scores)
-	attention = attention.Reshape(ctx, headDim, attention.Dim(1), opts.numHeads, batchSize)
-	attention = attention.Permute(ctx, 0, 2, 1, 3).Contiguous(ctx)
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(headDim)), nil)
 	attention = attention.Reshape(ctx, opts.hiddenSize, attention.Dim(2), batchSize)
-
-	hiddenState = sa.Output.Forward(ctx, attention)
-	return hiddenState
+	return sa.Output.Forward(ctx, attention)
 }

 type VisionMLP struct {
@@ -76,21 +63,18 @@ func (e *VisionEncoderLayer) Forward(ctx ml.Context, hiddenState ml.Tensor, opts
 	// self attention
 	hiddenState = e.AttentionNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.SelfAttention.Forward(ctx, hiddenState, opts)
-
 	if e.AttentionGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.AttentionGate)
 	}
 	hiddenState = hiddenState.Add(ctx, residual)
 	residual = hiddenState

-	// feed forward
 	hiddenState = e.MLPNorm.Forward(ctx, hiddenState, opts.eps)
 	hiddenState = e.MLP.Forward(ctx, hiddenState, opts)
-	hiddenState = hiddenState.Add(ctx, residual)
 	if e.MLPGate != nil {
 		hiddenState = hiddenState.Mul(ctx, e.MLPGate)
 	}
-
+	hiddenState = hiddenState.Add(ctx, residual)
 	return hiddenState
 }

--- a/model/models/models.go
+++ b/model/models/models.go
@@ -7,5 +7,7 @@ import (
 	_ "github.com/ollama/ollama/model/models/llama4"
 	_ "github.com/ollama/ollama/model/models/mistral3"
 	_ "github.com/ollama/ollama/model/models/mllama"
+	_ "github.com/ollama/ollama/model/models/qwen2"
 	_ "github.com/ollama/ollama/model/models/qwen25vl"
+	_ "github.com/ollama/ollama/model/models/qwen3"
 )
--- a/model/models/qwen2/model.go
+++ b/model/models/qwen2/model.go
@@ -0,0 +1,164 @@
+package qwen2
+
+import (
+	"cmp"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	headDim, ropeDim                 int
+	eps, ropeBase, ropeScale         float32
+}
+
+type Attention struct {
+	Query  *nn.Linear `gguf:"attn_q"`
+	Key    *nn.Linear `gguf:"attn_k"`
+	Value  *nn.Linear `gguf:"attn_v"`
+	Output *nn.Linear `gguf:"attn_output"`
+}
+
+func (attn Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+	headDim := cmp.Or(opts.headDim, opts.hiddenSize/opts.numHeads)
+	ropeDim := cmp.Or(opts.ropeDim, headDim)
+
+	query := attn.Query.Forward(ctx, hiddenStates)
+	query = query.Reshape(ctx, headDim, opts.numHeads, batchSize)
+
+	key := attn.Key.Forward(ctx, hiddenStates)
+	key = key.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	value := attn.Value.Forward(ctx, hiddenStates)
+	value = value.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
+
+	query = fast.RoPE(ctx, query, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, ropeDim, opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+
+	attention := nn.Attention(ctx, query, key, value, 1.0/math.Sqrt(float64(headDim)), cache)
+	attention = attention.Reshape(ctx, headDim*opts.numHeads, batchSize)
+
+	return attn.Output.Forward(ctx, attention)
+}
+
+type MLP struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp MLP) Forward(ctx ml.Context, hiddenStates ml.Tensor) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type DecoderLayer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	Attention     *Attention
+	MLPNorm       *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP           *MLP
+}
+
+func (d DecoderLayer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenStates
+
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenStates = hiddenStates.Add(ctx, residual)
+	residual = hiddenStates
+
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.MLP.Forward(ctx, hiddenStates)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding  `gguf:"token_embd"`
+	Layers         []DecoderLayer `gguf:"blk"`
+	OutputNorm     *nn.RMSNorm    `gguf:"output_norm"`
+	Output         *nn.Linear     `gguf:"output,alt:token_embd"`
+
+	Options
+}
+
+// Forward implements model.Model.
+func (m Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+
+	for i, layer := range m.Layers {
+		m.Cache.SetLayer(i)
+
+		var outputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, &m.Options)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	hiddenStates = m.Output.Forward(ctx, hiddenStates)
+	return hiddenStates, nil
+}
+
+func (m Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	ropeDim := cmp.Or(m.ropeDim, m.hiddenSize/m.numHeads)
+	return fast.RoPE(ctx, key, shift, ropeDim, m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+}
+
+func New(c fs.Config) (model.Model, error) {
+	m := Model{
+		Layers: make([]DecoderLayer, c.Uint("block_count")),
+		BytePairEncoding: model.NewBytePairEncoding(
+			c.String("tokenizer.ggml.pretokenizer", `(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`),
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		Options: Options{
+			hiddenSize: int(c.Uint("embedding_length")),
+			numHeads:   int(c.Uint("attention.head_count")),
+			numKVHeads: int(c.Uint("attention.head_count_kv")),
+			headDim:    int(c.Uint("attention.key_length")),
+			ropeDim:    int(c.Uint("rope.dimension_count")),
+			ropeBase:   c.Float("rope.freq_base"),
+			ropeScale:  c.Float("rope.freq_scale", 1),
+			eps:        c.Float("attention.layer_norm_rms_epsilon"),
+		},
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.Shift)
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen2", New)
+}
--- a/model/models/qwen25vl/model.go
+++ b/model/models/qwen25vl/model.go
@@ -5,7 +5,6 @@ import (
 	"fmt"
 	"image"
 	"slices"
-	"sync"

 	"github.com/ollama/ollama/fs"
 	"github.com/ollama/ollama/kvcache"
@@ -35,12 +34,13 @@ func New(c fs.Config) (model.Model, error) {
 				Values: c.Strings("tokenizer.ggml.tokens"),
 				Types:  c.Ints("tokenizer.ggml.token_type"),
 				Merges: c.Strings("tokenizer.ggml.merges"),
-				BOS:    int32(c.Uint("tokenizer.ggml.bos_token_id")),
-				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", false),
-				EOS:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
 				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
-				EOT:    int32(c.Uint("tokenizer.ggml.eos_token_id")),
-				AddEOT: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
 			},
 		),
 		TextModel:      NewTextModel(c),
@@ -69,15 +69,12 @@ func (m *Model) PixelValues(ctx ml.Context, multimodalData []byte) (ml.Tensor, *
 		m.ImageProcessor.patchSize * m.ImageProcessor.patchSize
 	numPatches := grid.Temporal * grid.Height * grid.Width

-	pixelValues, err := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)
-	if err != nil {
-		return nil, nil, fmt.Errorf("failed to create tensor from image: %w", err)
-	}
+	pixelValues := ctx.Input().FromFloatSlice(f32s, patchDim, numPatches)

 	return pixelValues, grid, nil
 }

-func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, error) {
+func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) ([]input.Multimodal, error) {
 	if len(m.VisionModel.Layers) == 0 {
 		return nil, model.ErrNoVisionModel
 	}
@@ -88,31 +85,7 @@ func (m *Model) EncodeMultimodal(ctx ml.Context, multimodalData []byte) (any, er
 	}

 	visionOutputs := m.VisionModel.Forward(ctx, pixels, grid)
-	return &chunks{Model: m, Tensor: visionOutputs}, nil
-}
-
-type chunks struct {
-	*Model
-	ml.Tensor
-
-	dataOnce sync.Once
-	data     []float32
-}
-
-type chunk struct {
-	*chunks
-	s, n int
-}
-
-func (r *chunk) floats() []float32 {
-	r.dataOnce.Do(func() {
-		temp := r.Backend().NewContext()
-		defer temp.Close()
-		temp.Forward(r.Tensor).Compute(r.Tensor)
-		r.data = r.Floats()
-	})
-
-	return r.data[r.s*r.Dim(0) : (r.s+r.n)*r.Dim(0)]
+	return []input.Multimodal{{Tensor: visionOutputs}}, nil
 }

 // PostTokenize arranges Qwen-2.5-VL's inputs for the forward pass
@@ -142,18 +115,15 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 				result = append(result, input.Input{Token: pre[i]})
 			}

-			// This is an image token with multimodal data
-			chunksData := inp.Multimodal.(*chunks)
-			patchesPerChunk := chunksData.Dim(1)
+			patchesPerChunk := inp.Multimodal[0].Tensor.Dim(1)

 			// First add the vision start token
-			result = append(result, input.Input{Token: visionStartToken, SameBatch: patchesPerChunk + 2})
+			result = append(result, input.Input{Token: visionStartToken})

 			// Add the image token with the multimodal tensor data at the first position
-			// Create a chunk with proper s and n values
 			result = append(result, input.Input{
 				Token:          imageToken,
-				Multimodal:     &chunk{chunks: chunksData, s: 0, n: patchesPerChunk},
+				Multimodal:     inp.Multimodal,
 				MultimodalHash: inp.MultimodalHash,
 				SameBatch:      patchesPerChunk,
 			})
@@ -169,15 +139,8 @@ func (m *Model) PostTokenize(inputs []input.Input) ([]input.Input, error) {
 }

 func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
-	positions, err := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
-	if err != nil {
-		return nil, err
-	}
-
-	outputs, err := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
-	if err != nil {
-		return nil, err
-	}
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+	outputs := ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))

 	return m.TextModel.Forward(ctx, batch.Inputs, positions, outputs, batch, m.Cache)
 }
--- a/model/models/qwen25vl/model_text.go
+++ b/model/models/qwen25vl/model_text.go
@@ -7,13 +7,15 @@ import (
 	"github.com/ollama/ollama/kvcache"
 	"github.com/ollama/ollama/ml"
 	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
 	"github.com/ollama/ollama/model/input"
 )

 type TextOptions struct {
-	ctxLen, hiddenSize, numHeads, numKVHeads int
-	eps, ropeBase, ropeScale                 float32
-	ropeDim, defaultContextLen               uint32
+	hiddenSize, numHeads, numKVHeads int
+	ropeDim, originalContextLength   int
+	eps, ropeBase, ropeScale         float32
 }

 type TextModel struct {
@@ -29,15 +31,14 @@ func NewTextModel(c fs.Config) *TextModel {
 	m := TextModel{
 		Layers: make([]Layer, c.Uint("block_count")),
 		TextOptions: &TextOptions{
-			ctxLen:            int(c.Uint("context_length")),
-			hiddenSize:        int(c.Uint("embedding_length")),
-			numHeads:          int(c.Uint("attention.head_count")),
-			numKVHeads:        int(c.Uint("attention.head_count_kv")),
-			eps:               c.Float("attention.layer_norm_rms_epsilon"),
-			ropeBase:          c.Float("rope.freq_base"),
-			ropeScale:         c.Float("rope.freq_scale", 1),
-			ropeDim:           c.Uint("rope.dimension_count", 128),
-			defaultContextLen: c.Uint("context_length", 128000),
+			hiddenSize:            int(c.Uint("embedding_length")),
+			numHeads:              int(c.Uint("attention.head_count")),
+			numKVHeads:            int(c.Uint("attention.head_count_kv")),
+			ropeDim:               int(c.Uint("rope.dimension_count", 128)),
+			originalContextLength: int(c.Uint("context_length", 128000)),
+			eps:                   c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:              c.Float("rope.freq_base"),
+			ropeScale:             c.Float("rope.freq_scale", 1),
 		},
 	}

@@ -59,11 +60,11 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 	q := sa.Query.Forward(ctx, hiddenState)
 	q = q.Reshape(ctx, headDim, opts.numHeads, batchSize)
-	q = q.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	q = fast.RoPE(ctx, q, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

 	k := sa.Key.Forward(ctx, hiddenState)
 	k = k.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
-	k = k.RoPE(ctx, positionIDs, nil, opts.ropeDim, 2, opts.ropeBase, opts.ropeScale, ml.WithContextLen(opts.defaultContextLen))
+	k = fast.RoPE(ctx, k, positionIDs, opts.ropeDim, opts.ropeBase, opts.ropeScale, rope.WithOriginalContextLength(opts.originalContextLength), rope.WithTypeNeoX())

 	v := sa.Value.Forward(ctx, hiddenState)
 	v = v.Reshape(ctx, headDim, opts.numKVHeads, batchSize)
@@ -77,7 +78,7 @@ func (sa *SelfAttention) Forward(ctx ml.Context, hiddenState, positionIDs ml.Ten

 // Shift applies rotary position embeddings to the key tensor for causal attention caching
 func (m *TextModel) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
-	return key.RoPE(ctx, shift, nil, m.ropeDim, 2, m.ropeBase, m.ropeScale, ml.WithContextLen(m.defaultContextLen)), nil
+	return fast.RoPE(ctx, key, shift, m.ropeDim, m.ropeBase, m.ropeScale, rope.WithOriginalContextLength(m.originalContextLength), rope.WithTypeNeoX()), nil
 }

 // MLP implements the feed-forward network component with SwiGLU activation
@@ -129,12 +130,7 @@ func (m *TextModel) Forward(ctx ml.Context, inputs, positions, outputs ml.Tensor
 	hiddenStates := m.TokenEmbedding.Forward(ctx, inputs).Duplicate(ctx)

 	for _, mi := range batch.Multimodal {
-		f32s := mi.Multimodal.(*chunk).floats()
-		img, err := ctx.Input().FromFloatSlice(f32s, len(f32s)/m.hiddenSize, m.hiddenSize)
-		if err != nil {
-			panic(err)
-		}
-
+		img := mi.Multimodal[0].Tensor
 		ctx.Forward(img.Copy(ctx, hiddenStates.View(ctx, mi.Index*hiddenStates.Stride(1), img.Dim(0)*img.Dim(1))))
 	}

--- a/model/models/qwen25vl/model_vision.go
+++ b/model/models/qwen25vl/model_vision.go
@@ -1,7 +1,6 @@
 package qwen25vl

 import (
-	"fmt"
 	"math"
 	"slices"

@@ -44,10 +43,8 @@ func blockDiagonalMask(ctx ml.Context, seqLength int, bounds []int, numHeads int
 		}
 	}

-	mask, err := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
-	if err != nil {
-		panic(err)
-	}
+	mask := ctx.Input().FromFloatSlice(flat, seqLength, seqLength)
+
 	// Reshape to match [seqLength, seqLength, 1] for broadcasting
 	mask = mask.Reshape(ctx, seqLength, seqLength, 1)

@@ -303,10 +300,7 @@ func (m *VisionModel) WindowIndex(ctx ml.Context, grid *Grid) (ml.Tensor, []int)
 		}
 	}

-	t, err := ctx.Input().FromIntSlice(index, len(index))
-	if err != nil {
-		panic(err)
-	}
+	t := ctx.Input().FromIntSlice(index, len(index))

 	return t, bounds
 }
@@ -326,10 +320,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			freqVals[i*freq+j] = float32(i) / float32(math.Pow(theta, float64(j*2)/float64(dim)))
 		}
 	}
-	freqs, err := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)
-	if err != nil {
-		panic(fmt.Errorf("failed to create tensor from frequencies: %w", err))
-	}
+	freqs := ctx.Input().FromFloatSlice(freqVals, freq, maxGridSize)

 	// Create position coordinates (y,x pairs) for the grid
 	// In PyTorch: Equivalent to generating position ids with torch.arange()
@@ -339,10 +330,7 @@ func (m *VisionModel) PositionalEmbedding(ctx ml.Context, grid *Grid) ml.Tensor
 			coords = append(coords, int32(y), int32(x))
 		}
 	}
-	pos, err := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)
-	if err != nil {
-		panic(fmt.Errorf("failed to create tensor from positions: %w", err))
-	}
+	pos := ctx.Input().FromIntSlice(coords, 2, grid.Width, grid.Height)

 	// Reshape and permute positions to match spatial merging pattern
 	pos = pos.Reshape(ctx, 2, grid.Width, merge, grid.Height/merge)
--- a/model/models/qwen3/model.go
+++ b/model/models/qwen3/model.go
@@ -0,0 +1,233 @@
+package qwen3
+
+import (
+	"cmp"
+	"math"
+
+	"github.com/ollama/ollama/fs"
+	"github.com/ollama/ollama/kvcache"
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/ml/nn"
+	"github.com/ollama/ollama/ml/nn/fast"
+	"github.com/ollama/ollama/ml/nn/rope"
+	"github.com/ollama/ollama/model"
+	"github.com/ollama/ollama/model/input"
+)
+
+type Options struct {
+	hiddenSize, numHeads, numKVHeads int
+	eps                              float32
+	ropeBase, ropeScale              float32
+
+	keyLength, valueLength int
+
+	numExperts, numExpertsUsed int
+	normTopKProb               bool
+}
+
+func (o Options) headDim() int {
+	return cmp.Or(o.keyLength, o.valueLength, o.hiddenSize/o.numHeads)
+}
+
+type Attention struct {
+	QueryNorm *nn.RMSNorm `gguf:"attn_q_norm"`
+	Query     *nn.Linear  `gguf:"attn_q"`
+	KeyNorm   *nn.RMSNorm `gguf:"attn_k_norm"`
+	Key       *nn.Linear  `gguf:"attn_k"`
+	Value     *nn.Linear  `gguf:"attn_v"`
+	Output    *nn.Linear  `gguf:"attn_output"`
+}
+
+func (sa *Attention) Forward(ctx ml.Context, hiddenStates, positions ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	batchSize := hiddenStates.Dim(1)
+
+	query := sa.Query.Forward(ctx, hiddenStates)
+	key := sa.Key.Forward(ctx, hiddenStates)
+	value := sa.Value.Forward(ctx, hiddenStates)
+
+	query = query.Reshape(ctx, opts.headDim(), opts.numHeads, batchSize)
+	key = key.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+	value = value.Reshape(ctx, opts.headDim(), opts.numKVHeads, batchSize)
+
+	query = sa.QueryNorm.Forward(ctx, query, opts.eps)
+	key = sa.KeyNorm.Forward(ctx, key, opts.eps)
+
+	query = fast.RoPE(ctx, query, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+	key = fast.RoPE(ctx, key, positions, opts.headDim(), opts.ropeBase, opts.ropeScale, rope.WithTypeNeoX())
+
+	attention := nn.Attention(ctx, query, key, value, 1./math.Sqrt(float64(opts.headDim())), cache)
+	attention = attention.Reshape(ctx, attention.Dim(0)*attention.Dim(1), batchSize)
+	return sa.Output.Forward(ctx, attention)
+}
+
+type MLP interface {
+	Forward(ml.Context, ml.Tensor, *Options) ml.Tensor
+}
+
+type sparse struct {
+	Router *nn.Linear `gguf:"ffn_gate_inp"`
+	Gate   ml.Tensor  `gguf:"ffn_gate_exps.weight"`
+	Up     ml.Tensor  `gguf:"ffn_up_exps.weight"`
+	Down   ml.Tensor  `gguf:"ffn_down_exps.weight"`
+}
+
+func (mlp *sparse) Forward(ctx ml.Context, hiddenStates ml.Tensor, opts *Options) ml.Tensor {
+	hiddenDim, sequenceLength, batchSize := hiddenStates.Dim(0), hiddenStates.Dim(1), hiddenStates.Dim(2)
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenDim, sequenceLength*batchSize)
+	routerLogits := mlp.Router.Forward(ctx, hiddenStates)
+
+	routingWeights := routerLogits.Softmax(ctx)
+	selectedExperts := routingWeights.TopK(ctx, opts.numExpertsUsed)
+	routingWeights = routingWeights.Reshape(ctx, 1, opts.numExperts, hiddenStates.Dim(1)).Rows(ctx, selectedExperts)
+	if opts.normTopKProb {
+		routingWeights = routingWeights.Reshape(ctx, opts.numExpertsUsed, hiddenStates.Dim(1))
+		routingWeights = routingWeights.Div(ctx, routingWeights.SumRows(ctx))
+		routingWeights = routingWeights.Reshape(ctx, 1, opts.numExpertsUsed, hiddenStates.Dim(1))
+	}
+
+	hiddenStates = hiddenStates.Reshape(ctx, hiddenStates.Dim(0), 1, hiddenStates.Dim(1))
+
+	upStates := mlp.Up.MulmatID(ctx, hiddenStates, selectedExperts)
+
+	hiddenStates = mlp.Gate.MulmatID(ctx, hiddenStates, selectedExperts)
+	hiddenStates = hiddenStates.SILU(ctx)
+	hiddenStates = hiddenStates.Mul(ctx, upStates)
+
+	experts := mlp.Down.MulmatID(ctx, hiddenStates, selectedExperts)
+	experts = experts.Mul(ctx, routingWeights)
+
+	nextStates := experts.View(ctx, 0, experts.Dim(0), experts.Stride(2), experts.Dim(2))
+	for i := 1; i < opts.numExpertsUsed; i++ {
+		nextStates = nextStates.Add(ctx, experts.View(ctx, i*experts.Stride(1), experts.Dim(0), experts.Stride(2), experts.Dim(2)))
+	}
+
+	return nextStates
+}
+
+type dense struct {
+	Gate *nn.Linear `gguf:"ffn_gate"`
+	Up   *nn.Linear `gguf:"ffn_up"`
+	Down *nn.Linear `gguf:"ffn_down"`
+}
+
+func (mlp *dense) Forward(ctx ml.Context, hiddenStates ml.Tensor, _ *Options) ml.Tensor {
+	hiddenStates = mlp.Gate.Forward(ctx, hiddenStates).SILU(ctx).Mul(ctx, mlp.Up.Forward(ctx, hiddenStates))
+	return mlp.Down.Forward(ctx, hiddenStates)
+}
+
+type Layer struct {
+	AttentionNorm *nn.RMSNorm `gguf:"attn_norm"`
+	*Attention
+
+	MLPNorm *nn.RMSNorm `gguf:"ffn_norm"`
+	MLP
+}
+
+func (d *Layer) Forward(ctx ml.Context, hiddenStates, positions, outputs ml.Tensor, cache kvcache.Cache, opts *Options) ml.Tensor {
+	residual := hiddenStates
+	hiddenStates = d.AttentionNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.Attention.Forward(ctx, hiddenStates, positions, cache, opts)
+
+	if outputs != nil {
+		hiddenStates = hiddenStates.Rows(ctx, outputs)
+		residual = residual.Rows(ctx, outputs)
+	}
+
+	hiddenStates = hiddenStates.Add(ctx, residual)
+
+	residual = hiddenStates
+	hiddenStates = d.MLPNorm.Forward(ctx, hiddenStates, opts.eps)
+	hiddenStates = d.MLP.Forward(ctx, hiddenStates, opts)
+	return hiddenStates.Add(ctx, residual)
+}
+
+type Model struct {
+	model.Base
+	model.BytePairEncoding
+
+	TokenEmbedding *nn.Embedding `gguf:"token_embd"`
+	OutputNorm     *nn.RMSNorm   `gguf:"output_norm"`
+	Output         *nn.Linear    `gguf:"output,alt:token_embd"`
+
+	Layers []Layer `gguf:"blk"`
+
+	*Options
+}
+
+// Forward implements model.Model.
+func (m *Model) Forward(ctx ml.Context, batch input.Batch) (ml.Tensor, error) {
+	positions := ctx.Input().FromIntSlice(batch.Positions, len(batch.Positions))
+
+	hiddenStates := m.TokenEmbedding.Forward(ctx, batch.Inputs)
+
+	for i, layer := range m.Layers {
+		m.Cache.SetLayer(i)
+
+		var outputs ml.Tensor
+		if i == len(m.Layers)-1 {
+			outputs = ctx.Input().FromIntSlice(batch.Outputs, len(batch.Outputs))
+		}
+
+		hiddenStates = layer.Forward(ctx, hiddenStates, positions, outputs, m.Cache, m.Options)
+	}
+
+	hiddenStates = m.OutputNorm.Forward(ctx, hiddenStates, m.eps)
+	return m.Output.Forward(ctx, hiddenStates), nil
+}
+
+func (m *Model) Shift(ctx ml.Context, layer int, key, shift ml.Tensor) (ml.Tensor, error) {
+	return fast.RoPE(ctx, key, shift, m.headDim(), m.ropeBase, m.ropeScale, rope.WithTypeNeoX()), nil
+}
+
+var _ model.Model = (*Model)(nil)
+
+func New(c fs.Config) (model.Model, error) {
+	layers := make([]Layer, c.Uint("block_count"))
+	for i := range layers {
+		if c.String("general.architecture") == "qwen3moe" {
+			layers[i].MLP = &sparse{}
+		} else {
+			layers[i].MLP = &dense{}
+		}
+	}
+
+	m := Model{
+		BytePairEncoding: model.NewBytePairEncoding(
+			`(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+`,
+			&model.Vocabulary{
+				Values: c.Strings("tokenizer.ggml.tokens"),
+				Types:  c.Ints("tokenizer.ggml.token_type"),
+				Merges: c.Strings("tokenizer.ggml.merges"),
+				AddBOS: c.Bool("tokenizer.ggml.add_bos_token", true),
+				BOS:    []int32{int32(c.Uint("tokenizer.ggml.bos_token_id"))},
+				AddEOS: c.Bool("tokenizer.ggml.add_eos_token", false),
+				EOS: append(
+					[]int32{int32(c.Uint("tokenizer.ggml.eos_token_id"))},
+					c.Ints("tokenizer.ggml.eos_token_ids")...,
+				),
+			},
+		),
+		Layers: layers,
+		Options: &Options{
+			hiddenSize:     int(c.Uint("embedding_length")),
+			numHeads:       int(c.Uint("attention.head_count")),
+			numKVHeads:     int(c.Uint("attention.head_count_kv")),
+			keyLength:      int(c.Uint("attention.key_length")),
+			valueLength:    int(c.Uint("attention.value_length")),
+			eps:            c.Float("attention.layer_norm_rms_epsilon"),
+			ropeBase:       c.Float("rope.freq_base"),
+			ropeScale:      c.Float("rope.freq_scale", 1),
+			numExperts:     int(c.Uint("expert_count")),
+			numExpertsUsed: int(c.Uint("expert_used_count")),
+			normTopKProb:   c.Bool("norm_top_k_prob", true),
+		},
+	}
+
+	m.Cache = kvcache.NewCausalCache(m.Shift)
+	return &m, nil
+}
+
+func init() {
+	model.Register("qwen3", New)
+	model.Register("qwen3moe", New)
+}
--- a/model/process_text_spm.go
+++ b/model/process_text_spm.go
@@ -182,27 +182,12 @@ func (spm SentencePieceModel) Encode(s string, addSpecial bool) ([]int32, error)
 		}
 	}

+	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "string", s, "ids", ids)
+
 	if addSpecial && len(ids) > 0 {
-		if spm.vocab.AddBOS {
-			if ids[0] == spm.vocab.BOS {
-				slog.Warn("adding bos token to prompt which already has it", "id", spm.vocab.BOS)
-			}
-
-			slog.Debug("adding bos token to prompt", "id", spm.vocab.BOS)
-			ids = append([]int32{spm.vocab.BOS}, ids...)
-		}
-
-		if spm.vocab.AddEOS {
-			if ids[len(ids)-1] == spm.vocab.EOS {
-				slog.Warn("adding eos token to prompt which already has it", "id", spm.vocab.EOS)
-			}
-
-			slog.Debug("adding eos token to prompt", "id", spm.vocab.EOS)
-			ids = append(ids, spm.vocab.EOS)
-		}
+		ids = spm.vocab.addSpecials(ids)
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "encoded", "ids", ids)
 	return ids, nil
 }

@@ -261,6 +246,6 @@ func (spm SentencePieceModel) Decode(ids []int32) (string, error) {
 		}
 	}

-	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "string", sb.String())
+	slog.Log(context.TODO(), logutil.LevelTrace, "decoded", "ids", ids, "string", sb.String())
 	return sb.String(), nil
 }
--- a/model/process_text_spm_test.go
+++ b/model/process_text_spm_test.go
--- a/model/textprocessor.go
+++ b/model/textprocessor.go
@@ -0,0 +1,17 @@
+package model
+
+const (
+	TOKEN_TYPE_NORMAL = iota + 1
+	TOKEN_TYPE_UNKNOWN
+	TOKEN_TYPE_CONTROL
+	TOKEN_TYPE_USER_DEFINED
+	TOKEN_TYPE_UNUSED
+	TOKEN_TYPE_BYTE
+)
+
+type TextProcessor interface {
+	Encode(s string, addSpecial bool) ([]int32, error)
+	Decode([]int32) (string, error)
+	Is(int32, Special) bool
+	Vocabulary() *Vocabulary
+}
--- a/model/vocabulary.go
+++ b/model/vocabulary.go
@@ -0,0 +1,112 @@
+package model
+
+import (
+	"log/slog"
+	"slices"
+	"sync"
+)
+
+type Special int32
+
+const (
+	SpecialBOS Special = iota
+	SpecialEOS
+)
+
+type Vocabulary struct {
+	Values []string
+	Types  []int32
+	Scores []float32
+	Merges []string
+
+	BOS, EOS       []int32
+	AddBOS, AddEOS bool
+
+	specialOnce sync.Once
+	special     []string
+
+	valuesOnce sync.Once
+	values     map[string]int32
+
+	mergeOnce sync.Once
+	merge     map[string]int32
+}
+
+func (v *Vocabulary) Is(id int32, special Special) bool {
+	switch special {
+	case SpecialBOS:
+		return slices.Contains(v.BOS, id)
+	case SpecialEOS:
+		return slices.Contains(v.EOS, id)
+	default:
+		return false
+	}
+}
+
+func (v *Vocabulary) addSpecials(ids []int32) []int32 {
+	if v.AddBOS && len(v.BOS) > 0 {
+		if slices.Contains(v.BOS, ids[0]) {
+			slog.Warn("adding bos token to prompt which already has it", "id", v.BOS)
+		}
+
+		slog.Debug("adding bos token to prompt", "id", v.BOS)
+		ids = append([]int32{v.BOS[0]}, ids...)
+	}
+
+	if v.AddEOS && len(v.EOS) > 0 {
+		if slices.Contains(v.BOS, ids[len(ids)-1]) {
+			slog.Warn("adding eos token to prompt which already has it", "id", v.EOS)
+		}
+
+		slog.Debug("adding eos token to prompt", "id", v.EOS)
+		ids = append(ids, v.EOS[0])
+	}
+
+	return ids
+}
+
+func (v *Vocabulary) Encode(s string) int32 {
+	v.valuesOnce.Do(func() {
+		v.values = make(map[string]int32, len(v.Values))
+		for i, value := range v.Values {
+			v.values[value] = int32(i)
+		}
+	})
+
+	if id, ok := v.values[s]; ok {
+		return id
+	}
+
+	return -1
+}
+
+func (v *Vocabulary) Decode(id int32) string {
+	return v.Values[id]
+}
+
+func (v *Vocabulary) SpecialVocabulary() []string {
+	v.specialOnce.Do(func() {
+		for i := range v.Values {
+			if v.Types[i] == TOKEN_TYPE_CONTROL {
+				v.special = append(v.special, v.Values[i])
+			}
+		}
+	})
+
+	return v.special
+}
+
+func (v *Vocabulary) Merge(left, right string) int {
+	v.mergeOnce.Do(func() {
+		v.merge = make(map[string]int32, len(v.Merges))
+		for i, merge := range v.Merges {
+			v.merge[merge] = int32(i)
+		}
+	})
+
+	if id, ok := v.merge[left+" "+right]; ok {
+		return int(id)
+	}
+
+	return -1
+}
--- a/runner/llamarunner/cache.go
+++ b/runner/llamarunner/cache.go
@@ -104,8 +104,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input, cachePrompt bool) (*InputCach
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", len(prompt)-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache.go
+++ b/runner/ollamarunner/cache.go
@@ -136,8 +136,8 @@ func (c *InputCache) LoadCacheSlot(prompt []input.Input) (*InputCacheSlot, []inp
 	slog.Debug("loading cache slot", "id", slot.Id, "cache", len(slot.Inputs), "prompt", len(prompt),
 		"used", numPast, "remaining", int32(len(prompt))-numPast)

+	slot.Inputs = prompt[:numPast]
 	prompt = prompt[numPast:]
-	slot.Inputs = slot.Inputs[:numPast]

 	return slot, prompt, nil
 }
--- a/runner/ollamarunner/cache_test.go
+++ b/runner/ollamarunner/cache_test.go
@@ -3,7 +3,6 @@ package ollamarunner
 import (
 	"errors"
 	"fmt"
-	"image"
 	"testing"
 	"time"

@@ -12,10 +11,6 @@ import (
 )

 func TestCountCommon(t *testing.T) {
-	imgA := image.NewRGBA(image.Rect(0, 0, 100, 100))
-	imgB := image.NewRGBA(image.Rect(0, 0, 50, 50))
-	imgC := image.NewRGBA(image.Rect(50, 50, 100, 100))
-
 	tests := []struct {
 		name     string
 		t1       []input.Input
@@ -36,20 +31,20 @@ func TestCountCommon(t *testing.T) {
 		},
 		{
 			name:     "Image Prefix",
-			t1:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Multimodal: imgA, MultimodalHash: 1}, {Multimodal: imgB, MultimodalHash: 2}, {Multimodal: imgC, MultimodalHash: 3}},
+			t1:       []input.Input{{MultimodalHash: 1}},
+			t2:       []input.Input{{MultimodalHash: 1}, {MultimodalHash: 2}, {MultimodalHash: 3}},
 			expected: 1,
 		},
 		{
 			name:     "Mixed",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}, {Token: 5}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 1}, {Token: 5}},
 			expected: 2,
 		},
 		{
 			name:     "Mixed, Same Length",
-			t1:       []input.Input{{Token: 1}, {Multimodal: imgA, MultimodalHash: 1}},
-			t2:       []input.Input{{Token: 1}, {Multimodal: imgB, MultimodalHash: 2}},
+			t1:       []input.Input{{Token: 1}, {MultimodalHash: 1}},
+			t2:       []input.Input{{Token: 1}, {MultimodalHash: 2}},
 			expected: 1,
 		},
 		{
--- a/runner/ollamarunner/multimodal.go
+++ b/runner/ollamarunner/multimodal.go
@@ -0,0 +1,113 @@
+package ollamarunner
+
+import (
+	"errors"
+
+	"github.com/ollama/ollama/ml"
+	"github.com/ollama/ollama/model/input"
+)
+
+// Tensors can't be used across multiple compute graphs. This is a problem
+// if a single embedding is split across batches using views since all of
+// the views will have the same source tensor. We also don't want to
+// recompute the entire embedding for each batch.
+//
+// To avoid this, we compute all of the tensors for the embedding on the
+// first use and then store the result in system memory. When we need
+// additional tensors, we recreate them from the stored data.
+
+// multimodalEntry represents the embeddings of a single object (such
+// as an image).
+type multimodalEntry struct {
+	// mm is the original set of tensors created by EncodeMultimodal
+	mm []input.Multimodal
+
+	// data is the computed result of mm. Nil if not yet computed
+	data [][]float32
+}
+
+// multimodalStore maps from an individual tensor (of which there
+// may be many in a single multimodal object) to its parent embedding
+type multimodalStore map[ml.Tensor]*multimodalEntry
+
+func newMultimodalStore() multimodalStore {
+	return make(multimodalStore)
+}
+
+// addMultimodal stores an embedding for later use in a compute graph
+func (m multimodalStore) addMultimodal(embedding []input.Multimodal) {
+	entry := &multimodalEntry{mm: embedding}
+
+	for _, e := range embedding {
+		if e.Tensor != nil {
+			m[e.Tensor] = entry
+		}
+	}
+}
+
+// getMultimodal takes a source set of tensors (which may contain a whole or
+// parts of one or more images) and returns the equivalent that can be used in
+// the current context
+func (m multimodalStore) getMultimodal(backend ml.Backend, ctx ml.Context, in []input.Multimodal, reserve bool) ([]input.Multimodal, error) {
+	out := make([]input.Multimodal, len(in))
+	for i := range out {
+		if in[i].Tensor != nil {
+			var err error
+			out[i].Tensor, err = m.getTensor(backend, ctx, in[i].Tensor, reserve)
+			if err != nil {
+				return nil, err
+			}
+		}
+
+		out[i].Data = in[i].Data
+	}
+
+	return out, nil
+}
+
+func (m multimodalStore) getTensor(backend ml.Backend, ctx ml.Context, in ml.Tensor, reserve bool) (ml.Tensor, error) {
+	entry := m[in]
+
+	if entry.data == nil {
+		computeCtx := backend.NewContext()
+		defer computeCtx.Close()
+
+		var tensors []ml.Tensor
+		for _, t := range entry.mm {
+			if t.Tensor != nil {
+				tensors = append(tensors, t.Tensor)
+			}
+		}
+
+		if len(tensors) == 0 {
+			return nil, nil
+		}
+
+		computeCtx.Forward(tensors...)
+		entry.data = make([][]float32, len(entry.mm))
+
+		if !reserve {
+			computeCtx.Compute(tensors...)
+
+			for i, t := range entry.mm {
+				if t.Tensor != nil {
+					entry.data[i] = t.Tensor.Floats()
+				}
+			}
+		} else {
+			computeCtx.Reserve()
+		}
+	}
+
+	for i, t := range entry.mm {
+		if in == t.Tensor {
+			if !reserve {
+				return ctx.Input().FromFloatSlice(entry.data[i], t.Tensor.Shape()...), nil
+			} else {
+				return ctx.Input().Empty(t.Tensor.DType(), t.Tensor.Shape()...), nil
+			}
+		}
+	}
+
+	return nil, errors.New("multimodal tensor not found")
+}
--- a/runner/ollamarunner/runner.go
+++ b/runner/ollamarunner/runner.go
@@ -1,12 +1,14 @@
 package ollamarunner

 import (
+	"bytes"
 	"context"
 	"encoding/json"
 	"errors"
 	"flag"
 	"fmt"
 	"hash/maphash"
+	"image"
 	"log"
 	"log/slog"
 	"net"
@@ -20,6 +22,7 @@ import (
 	"time"
 	"unicode/utf8"

+	"golang.org/x/image/bmp"
 	"golang.org/x/sync/semaphore"

 	"github.com/ollama/ollama/api"
@@ -40,6 +43,9 @@ type Sequence struct {
 	// multimodal embeddings
 	ctxs []ml.Context

+	// mmStore holds multimodal embeddings to mange memory and enable splitting across batches
+	mmStore multimodalStore
+
 	// batch index
 	iBatch int

@@ -101,7 +107,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe

 	startTime := time.Now()

-	inputs, ctxs, err := s.inputs(prompt, images)
+	inputs, ctxs, mmStore, err := s.inputs(prompt, images)
 	if err != nil {
 		return nil, fmt.Errorf("failed to process inputs: %w", err)
 	} else if len(inputs) == 0 {
@@ -156,6 +162,7 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe

 	return &Sequence{
 		ctxs:                ctxs,
+		mmStore:             mmStore,
 		inputs:              inputs,
 		numPromptInputs:     len(inputs),
 		startProcessingTime: startTime,
@@ -174,9 +181,10 @@ func (s *Server) NewSequence(prompt string, images []llm.ImageData, params NewSe
 // inputs processes the prompt and images into a list of inputs
 // by splitting the prompt on [img-<n>] tags, tokenizing text and
 // decoding images
-func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, error) {
+func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, []ml.Context, multimodalStore, error) {
 	var inputs []input.Input
 	var ctxs []ml.Context
+	var mmStore multimodalStore

 	var parts []string
 	var matches [][]string
@@ -187,6 +195,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		re := regexp.MustCompile(`\[img-(\d+)\]`)
 		parts = re.Split(prompt, -1)
 		matches = re.FindAllStringSubmatch(prompt, -1)
+		mmStore = newMultimodalStore()
 	} else {
 		parts = []string{prompt}
 	}
@@ -196,7 +205,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		// text - tokenize
 		tokens, err := s.model.(model.TextProcessor).Encode(part, i == 0)
 		if err != nil {
-			return nil, nil, err
+			return nil, nil, nil, err
 		}

 		for _, t := range tokens {
@@ -216,7 +225,7 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 			}

 			if imageIndex < 0 {
-				return nil, nil, fmt.Errorf("invalid image index: %d", n)
+				return nil, nil, nil, fmt.Errorf("invalid image index: %d", n)
 			}

 			ctx := s.model.Backend().NewContext()
@@ -224,13 +233,15 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 			ctxs = append(ctxs, ctx)
 			imageEmbeddings, err := multimodalProcessor.EncodeMultimodal(ctx, images[imageIndex].Data)
 			if err != nil {
-				return nil, nil, err
+				return nil, nil, nil, err
 			}

 			s.multimodalHash.Reset()
 			_, _ = s.multimodalHash.Write(images[imageIndex].Data)
 			imageHash := s.multimodalHash.Sum64()

+			mmStore.addMultimodal(imageEmbeddings)
+
 			inputs = append(inputs, input.Input{Multimodal: imageEmbeddings, MultimodalHash: imageHash})
 			postTokenize = true
 		}
@@ -240,11 +251,11 @@ func (s *Server) inputs(prompt string, images []llm.ImageData) ([]input.Input, [
 		var err error
 		inputs, err = multimodalProcessor.PostTokenize(inputs)
 		if err != nil {
-			return nil, nil, err
+			return nil, nil, nil, err
 		}
 	}

-	return inputs, ctxs, nil
+	return inputs, ctxs, mmStore, nil
 }

 type Server struct {
@@ -363,6 +374,9 @@ func (s *Server) processBatch() error {
 	}
 	defer s.mu.Unlock()

+	ctx := s.model.Backend().NewContext()
+	defer ctx.Close()
+
 	var batchInputs []int32
 	var batch input.Batch

@@ -433,7 +447,11 @@ func (s *Server) processBatch() error {

 			batchInputs = append(batchInputs, inp.Token)
 			if inp.Multimodal != nil {
-				batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: inp.Multimodal})
+				mm, err := seq.mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, false)
+				if err != nil {
+					return err
+				}
+				batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: len(batchInputs) - 1, Multimodal: mm})
 			}

 			batch.Positions = append(batch.Positions, int32(len(seq.cache.Inputs)+len(seq.pendingInputs)))
@@ -459,9 +477,6 @@ func (s *Server) processBatch() error {
 		return nil
 	}

-	ctx := s.model.Backend().NewContext()
-	defer ctx.Close()
-
 	modelOutput, err := model.Forward(ctx, s.model, batchInputs, batch)
 	if err != nil {
 		return fmt.Errorf("failed to decode batch: %w", err)
@@ -720,12 +735,71 @@ func (s *Server) reserveWorstCaseGraph() error {
 	ctx := s.model.Backend().NewContext()
 	defer ctx.Close()

+	var err error
+	inputs := make([]input.Input, s.batchSize)
+	mmStore := newMultimodalStore()
+
+	// Multimodal strategy:
+	// - Encode a 2048x2048 image. This assumes that a single image of this
+	//   size is sufficient to trigger the worst case. This is currently true
+	//   because for existing models, only a single image fits in a batch.
+	// - Add the embedding to a full batch of tokens - this is necessary because
+	//   the model may be looking for non-image data, such as <image> tags.
+	// - Run PostTokenize to execute any transformations between generated
+	//   embeddings and what the forward pass expects.
+	// - The result may now be larger than a batch (images may not fit in a
+	//   single batch), so trim based on what will fit and must be grouped together.
+	// - Fill out the rest of the space with text tokens.
+	if multimodalProcessor, ok := s.model.(model.MultimodalProcessor); ok {
+		mmCtx := s.model.Backend().NewContext()
+		defer mmCtx.Close()
+
+		img := image.NewGray(image.Rect(0, 0, 2048, 2048))
+		var buf bytes.Buffer
+		bmp.Encode(&buf, img)
+
+		if inputs[0].Multimodal, err = multimodalProcessor.EncodeMultimodal(mmCtx, buf.Bytes()); err == nil {
+			mmStore.addMultimodal(inputs[0].Multimodal)
+
+			inputs, err = multimodalProcessor.PostTokenize(inputs)
+			if err != nil {
+				return err
+			}
+
+			for i, inp := range inputs {
+				minBatch := 1 + inp.SameBatch
+				if minBatch > s.batchSize {
+					inputs = inputs[i:min(i+minBatch, len(inputs))]
+					break
+				} else if i+minBatch > s.batchSize {
+					inputs = inputs[:i]
+					break
+				}
+			}
+
+			if len(inputs) < s.batchSize {
+				newInputs := make([]input.Input, s.batchSize)
+				copy(newInputs, inputs)
+				inputs = newInputs
+			}
+		}
+	}
+
 	var batch input.Batch

-	inputs := make([]int32, s.batchSize)
+	batchInputs := make([]int32, len(inputs))
 	batch.Positions = make([]int32, len(inputs))
 	batch.Sequences = make([]int, len(inputs))
-	for i := range inputs {
+	for i, inp := range inputs {
+		batchInputs[i] = inp.Token
+		if inp.Multimodal != nil {
+			mm, err := mmStore.getMultimodal(s.model.Backend(), ctx, inp.Multimodal, true)
+			if err != nil {
+				return err
+			}
+			batch.Multimodal = append(batch.Multimodal, input.MultimodalIndex{Index: i, Multimodal: mm})
+		}
+
 		batch.Positions[i] = int32(i)
 	}

@@ -734,11 +808,7 @@ func (s *Server) reserveWorstCaseGraph() error {
 		batch.Outputs[i] = int32(i)
 	}

-	var err error
-	batch.Inputs, err = ctx.Input().FromIntSlice(inputs, len(inputs))
-	if err != nil {
-		return err
-	}
+	batch.Inputs = ctx.Input().FromIntSlice(batchInputs, len(batchInputs))

 	cache := s.model.Config().Cache
 	if cache != nil {
@@ -753,16 +823,12 @@ func (s *Server) reserveWorstCaseGraph() error {
 		return err
 	}

-	err = ctx.Forward(t).Reserve()
-	if err != nil {
-		return err
-	}
+	ctx.Forward(t).Reserve()

 	return nil
 }

-func (s *Server) loadModel(
-	ctx context.Context,
+func (s *Server) initModel(
 	mpath string,
 	params ml.BackendParams,
 	lpath multiLPath,
@@ -770,21 +836,21 @@ func (s *Server) loadModel(
 	kvCacheType string,
 	kvSize int,
 	multiUserCache bool,
-) {
+) error {
 	var err error
-	s.model, err = model.New(ctx, mpath, params)
+	s.model, err = model.New(mpath, params)
 	if err != nil {
-		panic(err)
+		return err
 	}

 	// TODO(jessegross): LoRA loading
 	if lpath.String() != "" {
-		panic("loras are not yet implemented")
+		return errors.New("loras are not yet implemented")
 	}

 	s.cache, err = NewInputCache(s.model, kvCacheType, int32(kvSize), parallel, s.batchSize, multiUserCache)
 	if err != nil {
-		panic(err)
+		return err
 	}

 	if !s.cache.enabled && parallel > 1 {
@@ -796,7 +862,30 @@ func (s *Server) loadModel(
 	s.seqs = make([]*Sequence, s.parallel)
 	s.seqsSem = semaphore.NewWeighted(int64(s.parallel))

-	err = s.reserveWorstCaseGraph()
+	return s.reserveWorstCaseGraph()
+}
+
+func (s *Server) load(
+	ctx context.Context,
+	mpath string,
+	params ml.BackendParams,
+	lpath multiLPath,
+	parallel int,
+	kvCacheType string,
+	kvSize int,
+	multiUserCache bool,
+) {
+	err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
+	if err != nil {
+		panic(err)
+	}
+
+	slog.Debug("memory", "allocated", s.model.Backend().BackendMemory())
+
+	err = s.model.Backend().Load(ctx,
+		func(progress float32) {
+			s.progress = progress
+		})
 	if err != nil {
 		panic(err)
 	}
@@ -840,9 +929,14 @@ func Execute(args []string) error {
 		status:    llm.ServerStatusLoadingModel,
 	}

+	server.cond = sync.NewCond(&server.mu)
+	server.ready.Add(1)
+
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
 	// TODO(jessegross): Parameters that need to be implemented:
 	//	no-mmap
-	//	mlock

 	var tensorSplitFloats []float32
 	if *tensorSplit != "" {
@@ -855,9 +949,6 @@ func Execute(args []string) error {
 	}

 	params := ml.BackendParams{
-		Progress: func(progress float32) {
-			server.progress = progress
-		},
 		NumThreads:     *threads,
 		NumGPULayers:   *numGPULayers,
 		MainGPU:        *mainGPU,
@@ -865,14 +956,7 @@ func Execute(args []string) error {
 		FlashAttention: *flashAttention,
 	}

-	server.ready.Add(1)
-	ctx, cancel := context.WithCancel(context.Background())
-	defer cancel()
-
-	go server.loadModel(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
-
-	server.cond = sync.NewCond(&server.mu)
-
+	go server.load(ctx, *mpath, params, lpaths, *parallel, *kvCacheType, *kvSize, *multiUserCache)
 	go server.run(ctx)

 	addr := "127.0.0.1:" + strconv.Itoa(*port)
--- a/sample/samplers.go
+++ b/sample/samplers.go
@@ -176,7 +176,7 @@ func NewGrammarSampler(model model.TextProcessor, grammarStr string) (*GrammarSa
 		vocabIds[i] = uint32(i)
 	}

-	grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, []uint32{uint32(model.Vocabulary().EOS), uint32(model.Vocabulary().EOT)})
+	grammar := llama.NewGrammar(grammarStr, vocabIds, pieces, model.Vocabulary().EOS)
 	if grammar == nil {
 		return nil, errors.New("sample: failed to initialize grammar")
 	}
--- a/server/create.go
+++ b/server/create.go
@@ -295,7 +295,7 @@ func convertFromSafetensors(files map[string]string, baseLayers []*layerGGML, is
 	}
 	defer bin.Close()

-	f, _, err := ggml.Decode(bin, -1)
+	f, err := ggml.Decode(bin, -1)
 	if err != nil {
 		return nil, err
 	}
@@ -430,7 +430,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 	fnWrap := func(n uint64) {
 		done := doneBytes.Add(n)
 		progress := float32(done) / float32(totalBytes)
-		fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
+		fn(api.ProgressResponse{Status: fmt.Sprintf("quantizing %s model to %s", ft, quantizeType), Digest: "0000000000000000000", Total: layer.Size, Completed: int64(progress * float32(layer.Size))})
 	}
 	ftype, err := ggml.ParseFileType(quantizeType)
 	if err != nil {
@@ -467,7 +467,7 @@ func quantizeLayer(layer *layerGGML, quantizeType string, fn func(resp api.Progr
 		return nil, err
 	}

-	f, _, err := ggml.Decode(temp, 1024)
+	f, err := ggml.Decode(temp, 1024)
 	if err != nil {
 		slog.Error(fmt.Sprintf("error decoding ggml: %s\n", err))
 		return nil, err
@@ -501,48 +501,27 @@ func ggufLayers(digest string, fn func(resp api.ProgressResponse)) ([]*layerGGML
 		return nil, errOnlyGGUFSupported
 	}

-	stat, err := blob.Stat()
+	f, err := ggml.Decode(blob, -1)
 	if err != nil {
 		return nil, err
 	}

-	var offset int64
-	for offset < stat.Size() {
-		f, n, err := ggml.Decode(blob, 1024)
-		if errors.Is(err, io.EOF) {
-			break
-		} else if err != nil {
-			return nil, err
-		}
-
-		mediatype := "application/vnd.ollama.image.model"
-		if f.KV().Kind() == "adapter" {
-			mediatype = "application/vnd.ollama.image.adapter"
-		} else if _, ok := f.KV()[fmt.Sprintf("%s.vision.block_count", f.KV().Architecture())]; ok || f.KV().Kind() == "projector" {
-			mediatype = "application/vnd.ollama.image.projector"
-		}
-
-		var layer Layer
-		if digest != "" && n == stat.Size() && offset == 0 {
-			layer, err = NewLayerFromLayer(digest, mediatype, blob.Name())
-			if err != nil {
-				slog.Debug("could not create new layer from layer", "error", err)
-				return nil, err
-			}
-		}
-
-		// Fallback to creating layer from file copy (either NewLayerFromLayer failed, or digest empty/n != stat.Size())
-		if layer.Digest == "" {
-			layer, err = NewLayer(io.NewSectionReader(blob, offset, n), mediatype)
-			if err != nil {
-				return nil, err
-			}
-		}
-
-		layers = append(layers, &layerGGML{layer, f})
-		offset = n
+	mediatype := "application/vnd.ollama.image.model"
+	if f.KV().Kind() == "adapter" {
+		mediatype = "application/vnd.ollama.image.adapter"
+	} else if (f.KV().Uint("block_count") == 0 && f.KV().Uint("vision.block_count") > 0) || f.KV().Kind() == "projector" {
+		// if a model has vision.block_count but not block_count, it is a standalone vision model
+		mediatype = "application/vnd.ollama.image.projector"
 	}

+	layer, err := NewLayerFromLayer(digest, mediatype, blob.Name())
+	if err != nil {
+		slog.Debug("could not create new layer from layer", "error", err)
+		return nil, err
+	}
+
+	layers = append(layers, &layerGGML{layer, f})
+
 	return detectChatTemplate(layers)
 }

--- a/server/images.go
+++ b/server/images.go
@@ -75,7 +75,7 @@ func (m *Model) Capabilities() []model.Capability {
 	if err == nil {
 		defer r.Close()

-		f, _, err := ggml.Decode(r, 1024)
+		f, err := ggml.Decode(r, 1024)
 		if err == nil {
 			if _, ok := f.KV()[fmt.Sprintf("%s.pooling_type", f.KV().Architecture())]; ok {
 				capabilities = append(capabilities, model.CapabilityEmbedding)
--- a/server/model.go
+++ b/server/model.go
@@ -64,7 +64,7 @@ func parseFromModel(ctx context.Context, name model.Name, fn func(api.ProgressRe
 			}
 			defer blob.Close()

-			f, _, err := ggml.Decode(blob, -1)
+			f, err := ggml.Decode(blob, -1)
 			if err != nil {
 				return nil, err
 			}
--- a/server/quantization.go
+++ b/server/quantization.go
@@ -120,14 +120,30 @@ func getTensorNewType(kv fsggml.KV, qs *quantizeState, newType fsggml.TensorType

 	if newType.IsQuantized() {
 		nx := shape[0]
-		ny := uint64(1)
-		if len(shape) > 1 {
-			ny = shape[1]
-		}
 		qk_k := newType.BlockSize()
+
+		// Check if first dimension is divisible by block size
 		if nx%qk_k != 0 {
-			slog.Warn(fmt.Sprintf("tensor cols %d x %d are not divisible by %d, required for %s.  Falling back to quantization %s", nx, ny, qk_k, newType.String(), fsggml.TensorTypeF16.String()))
-			newType = fsggml.TensorTypeF16
+			// Store the original type for logging
+			originalType := newType
+
+			// Select appropriate fallback based on original type
+			switch newType {
+			case fsggml.TensorTypeQ4_K:
+				newType = fsggml.TensorTypeQ5_0
+			case fsggml.TensorTypeQ5_K:
+				newType = fsggml.TensorTypeQ5_1
+			case fsggml.TensorTypeQ6_K:
+				newType = fsggml.TensorTypeQ8_0
+			}
+
+			// Final check - if still incompatible, fall back to F16
+			if nx%newType.BlockSize() != 0 {
+				newType = fsggml.TensorTypeF16
+			}
+
+			slog.Warn(fmt.Sprintf("tensor cols %d are not divisible by %d, required for %s - using fallback quantization %s",
+				nx, qk_k, originalType.String(), newType.String()))
 		}
 	}
 	return newType
--- a/server/quantization_test.go
+++ b/server/quantization_test.go
@@ -271,7 +271,7 @@ func TestQuantizeModel(t *testing.T) {
 				t.Fatal(err.Error())
 			}
 			defer fp.Close()
-			meta, _, err := fsggml.Decode(fp, -1)
+			meta, err := fsggml.Decode(fp, -1)
 			if err != nil {
 				t.Fatal(err.Error())
 			}
@@ -303,7 +303,7 @@ func TestQuantizeModel(t *testing.T) {
 				t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
 			}
 			defer fpNew.Close()
-			newMeta, _, err := fsggml.Decode(fpNew, -1)
+			newMeta, err := fsggml.Decode(fpNew, -1)
 			if err != nil {
 				t.Fatalf("failed to load the quantized model %s: %s", tmp.Name(), err)
 			}
--- a/server/sched.go
+++ b/server/sched.go
@@ -387,6 +387,17 @@ func (s *Scheduler) processCompleted(ctx context.Context) {
 				s.loadedMu.Unlock()
 				runner.refMu.Unlock()
 				slog.Debug("duplicate expired event, ignoring", "runner", runner)
+			} else if runner.pid != runnerToUnload.pid {
+				// If the pids do not match, we likely had multiple load
+				// failures for the same model in quick succession due to
+				// request context canceled and are draining the queue of
+				// events. Ensure the orphaned runner is properly shut down, but
+				// do not delete the mismatched loaded runner, or wait for VRAM
+				// convergence.
+				slog.Debug("orphaned runner shutting down", "orphan", runner, "loaded", runnerToUnload)
+				runner.unload()
+				s.loadedMu.Unlock()
+				runner.refMu.Unlock()
 			} else {
 				slog.Debug("starting background wait for VRAM recovery", "runner", runner)
 				finished := runner.waitForVRAMRecovery()
Author	SHA1	Message	Date
Parth Sareen	884d26093c	llama: add minimum memory for grammar (#10820 )	2025-05-22 18:53:31 -07:00
Jesse Gross	1f371ea92f	ml: Panic rather than return error on tensor allocation failure FromFloatSlice and FromIntSlice return an error if the shape doesn't match the passed data or if memory can't be allocated. Since these are inputs, the memory being allocated is system memory rather than VRAM. In many cases, the caller can't really handle the error and panics. Empty and Zeros directly panic if they can't allocate memory. This makes things consistent by panicing for the first two cases, removing a fair amount of error handling code. This is also consistent with how Go typically handles these situations.	2025-05-22 14:38:09 -07:00
Jesse Gross	73d6a82cce	ollamarunner: Memory usage reporting This provides granular information about the backend memory allocations required by the runner: - Per backend - Per layer - Weights, cache and graph - Allocation status This can be used for debugging and validating memory estimates.	2025-05-22 14:38:09 -07:00
Jesse Gross	6db8a3771c	ggml: Report graph memory for failed allocations GGML has a function to report the allocated size of a backend buffer. However, this returns 0 if we tried to allocate a buffer and it failed. For memory management purposes, it's important to know how much we were trying to allocate. This extends the API to report attempted sizes for all buffers and whether it succeeeded.	2025-05-22 14:38:09 -07:00
Daniel Hiltgen	d950ff12c0	sched: fix runner leak during reloading unload (#10819 ) When the same model is being reloaded rapidly with client connections being canceled before the model finishes loading, the queued unload event could cause a leak of runners by deleting a different runner from the loaded list.	2025-05-22 14:31:36 -07:00
Michael Yang	adff143bcd	fix: mllama quality (#10807 ) * fix mllama convert - transform attn_gate and ffn_gate - swap attention heads for vision models * fix mllama the mlp gate which was applied in the wrong place	2025-05-22 11:30:49 -07:00
Bruce MacDonald	fbe6ae285a	server: improve tensor quantization fallback logic (#10806 ) Fall back to alternative quantization types when a tensor's dimensions aren't divisible by the block size required for the original desired quantization type. If retried quantization types fail, the system ultimately falls back to F16 (half-precision floating point) which has a block size of 1 and can handle any tensor dimension.	2025-05-22 10:48:08 -07:00
Daniel Hiltgen	fdd4d479a3	integration: add qwen2.5-vl (#10815 ) Replace the older llava model with qwen2.5 for vision tests Skip split-batch test on small VRAM systems to avoid excessive test time	2025-05-22 09:12:32 -07:00
Michael Yang	61aeaf7e81	remove support for multiple ggufs in a single file (#10722 ) * remove support for multiple ggufs in a single file this was an attempt to make it easier to import multimodal models into ollama. this was rarely used and error prone so remove it * fix: create fused model from blob	2025-05-21 13:55:31 -07:00
Daniel Hiltgen	7359b02707	win: detect background upgrade in progress (#10785 ) Give the user a helpful error instead of showing connection refused errors.	2025-05-21 10:46:56 -07:00
Michael Yang	c890011322	feat: port qwen2 model (#10782 )	2025-05-21 10:21:24 -07:00
Michael Yang	e0ed984cde	feat: qwen3 dense and sparse models (#10708 ) * feat: qwen3 dense * feat: qwen3moe * fix llama4 moe	2025-05-21 10:21:07 -07:00
Michael Yang	139f84cf21	fix cmakelists (#10804 ) this fixes an issue introduced in #10788	2025-05-21 09:52:52 -07:00
Michael Yang	375839ea2d	chore: disable debug in binary libraries (#10788 )	2025-05-21 09:39:38 -07:00
Michael Yang	69b2fe9282	fix: qwen25vl assign samebatch in multimodal input (#10789 ) setting samebatch on the vision start token is problematic because it will be shared with other inputs that also use images. this will cause the input to be cached and the runner will not see SameBatch. SameBatch will also be incorrect since it may be for a different image. assigning samebatch to the input tokens resolves this by ensure it's assigned correctly to inputs corresponding to the image. not setting same batch correctly may cause panics during inference since images are no longer guaranteed to be in the same batch.	2025-05-21 09:39:20 -07:00
Michael Yang	9ed8bf14cb	ml: add more rope options (#10775 )	2025-05-20 15:51:08 -07:00
DarkCaster	e6a800ca11	llama: fix incorrect initialization of C.struct_common_sampler_cparams.penalty_present (#10779 )	2025-05-20 10:41:15 -07:00
Michael Yang	ff180c3466	fix llama and mistral3 models (#10774 ) * fix llama model * fix mistral3.1 model do not set default vision layers	2025-05-19 15:06:35 -07:00
Jesse Gross	3fe74fba42	llm: Use first layer as memory buffer in estimation This is a partial revert of `0478d44` "Fixed over vram allcation dure to small initial layer sizes." Previously we used the size of the first layer as an extra reserved amount of space to buffer our memory estimates. The above commit changed this to use the largest layer. However, this had performance impacts on more models than the original commit was trying to fix. There is just a heuristic without an ideal solution so this goes back to the historic behavior. Fixes: #10765, #10756, #10752, #10726	2025-05-19 14:03:34 -07:00
Daniel Hiltgen	1a0cfd080a	avoid kv truncation during create (#10761 )	2025-05-19 13:54:54 -07:00
Jesse Gross	94ab428e3f	ggml: Seperate tensor load from backend creation Currently, when the backend is created, the tensors are loaded at the same time, which is a slow operation. This separates them to be two steps: - Create backend, including enumerating tensors and memory allocation - Loading tensor data This allows more flexibility in managing model loading.	2025-05-19 09:54:22 -07:00
Jesse Gross	d755577473	llm: Estimate projector memory correctly for Ollama engine The Llama engine always places vision projectors on the first GPU if one exists. However, the Ollama engine groups it with the output layer, which means the projector is only offloaded if all other layers are offloaded. The memory estimation code always assumes the former layout - this changes it to use the correct layout based on the engine. This addresses two impacts of the current behavior: - In multi-GPU setups, we can crash with OOM errors when we try to allocate memory on a full GPU while another still has space. - If the vision projector is large, it may prevent us from offloading anything when we could have fit some of the text layers.	2025-05-19 09:52:48 -07:00
Jesse Gross	a2cc8571c5	llm: Consistently track unassigned model data In some cases, if we fail to assign a piece of the model to a GPU then we lose track of this data. Although it doesn't change the memory allocation, it does affect the total size of the model reported by tools such as ollama ps (and also the percent offloaded). This makes it look like setting num_gpu isn't reflected in ollama ps, which isn't true but the offloading percent may appear to not change. Spreading the model across more GPUs will continue to impact the reported total size of the model.	2025-05-19 09:52:48 -07:00
Ronald Wilson	7edfdd2f5f	readme: add TinyNotepad to community integrations (#10763 ) This PR adds Tiny Notepad, a lightweight, notepad-like interface to chat with local LLMs via Ollama. - It’s designed as a simple, distraction-free alternative. - The app supports basic note-taking, timestamped logs, and model parameter controls. - Built with Tkinter, it runs entirely offline and available via PyPI. Aims to provide a lightweight easy to run and install interface for ollama.	2025-05-18 12:43:22 -07:00
Michael Yang	333e360422	model: handle multiple eos tokens (#10577 ) * get eos_token_id from generation_config.json * refactor * include both ids and strings in trace * comments * remove special case for gemma3 special vocab (#10743)	2025-05-16 13:40:23 -07:00
Daniel Hiltgen	27da2cddc5	Fix lingering Q4_0 help reference (#10720 )	2025-05-15 16:33:23 -07:00
Bruce MacDonald	feb8923ada	cmd: add ellipses to truncated show metadata (#10717 ) When a piece of information has been truncated in the show output an ellipses to indicate that more data has not been displayed	2025-05-15 15:45:52 -07:00
Jesse Gross	fe623c2cf4	ollamarunner: Multi-modal worst case graph We currently preallocate compute graph memory for the worst case batch of text tokens. This adds support for doing the same for images. Note that image models are more complicated than text models in how they process their inputs so there may be cases where this approach isn't completely generic for all models. It covers all currently supported models though.	2025-05-15 13:46:20 -07:00
Jesse Gross	3c14461d5d	ollamarunner: Separate text and multimodal graphs For some multimodal models (such as gemma3), we create a single graph that generates the image embedding and then use this in the text model. The embedding tensor is completely opaque to the runner. However, this doesn't work if we need to use the embedding in multiple batches. This can arise if the embedding is larger than the batch size. In these cases (as with llama4), we would like to create views that are more appropriately sized. However, if we do this then the original source tensor is used in multiple graphs, which isn't allowed. To avoid that problem, models with this pattern compute the embedding tensor on first use and recreate the individual views. There is no longer a single vision and text graph. This codifies the pattern of separating vision and text graphs. The logic of computing tensors on demand is moved to the runner, so models no longer have to worry about this. It also gives the runner visibility into the multimodal tensors, which is important for memory management.	2025-05-15 13:46:20 -07:00
Jesse Gross	499ae7311f	ollamarunner: Base cached tokens on current prompt When we restore a sequence from the cache, we split the prompt into the already used tokens (stored in the cache) and new tokens that need to be processed. Currently, the references to the used tokens are coming from the stored previous sequence. However, even though we know that the used tokens are semantically equivalent to the prefix of the prompt, tokens can contain pointers which are no longer valid. As a result, it is better to get the used tokens from the prompt, which has currently valid pointers. This doesn't currently have any impact because it isn't possible to reuse the pointers (which are tensors) anyways. However, it becomes an issue once we can.	2025-05-15 13:46:20 -07:00
Michael Yang	ef202789fa	fix pixel values padding (#10718 ) * panic if trying to pad 4d * fix pixel values padding	2025-05-15 13:44:44 -07:00
Michael Yang	55760195e6	fix mllama conversion (#10716 ) cross attention Q and K projections needs to have their heads swapped, similar to non-cross attention Q and K tensors	2025-05-15 12:15:01 -07:00
Bruce MacDonald	bd68d3ae50	ggml: update qwen25vl vision size estimate (#10711 )	2025-05-14 16:42:30 -07:00
Daniel Hiltgen	ff80718e9c	fix crash in old clients with quantization progress (#10710 ) Older clients assumed the digest was at least 19 characters long so increase the size of the dummy digest to avoid array out of bounds crashes.	2025-05-14 14:54:18 -07:00